update libyuv to r1456

picks up build warning fixes for Visual Studio 2015

Change-Id: Idea85fa70d1aeb2a46ea355b87fe41ec5b2b9520

parent f42012e526
commit fcb4253c9c
@@ -22,17 +22,18 @@ LIBYUV_SRCS += third_party/libyuv/include/libyuv/basic_types.h \
third_party/libyuv/source/planar_functions.cc \
third_party/libyuv/source/row_any.cc \
third_party/libyuv/source/row_common.cc \
third_party/libyuv/source/row_gcc.cc \
third_party/libyuv/source/row_mips.cc \
third_party/libyuv/source/row_neon.cc \
third_party/libyuv/source/row_neon64.cc \
third_party/libyuv/source/row_posix.cc \
third_party/libyuv/source/row_win.cc \
third_party/libyuv/source/scale.cc \
third_party/libyuv/source/scale_any.cc \
third_party/libyuv/source/scale_common.cc \
third_party/libyuv/source/scale_gcc.cc \
third_party/libyuv/source/scale_mips.cc \
third_party/libyuv/source/scale_neon.cc \
third_party/libyuv/source/scale_neon64.cc \
third_party/libyuv/source/scale_posix.cc \
third_party/libyuv/source/scale_win.cc \

LIBWEBM_MUXER_SRCS += third_party/libwebm/mkvmuxer.cpp \
third_party/libyuv/README.libvpx (3 changes, vendored)
@@ -1,6 +1,6 @@
Name: libyuv
URL: http://code.google.com/p/libyuv/
Version: 1305
Version: 1456
License: BSD
License File: LICENSE

@@ -13,4 +13,3 @@ which down-samples the original input video (f.g. 1280x720) a number of times
in order to encode multiple resolution bit streams.

Local Modifications:
cherry pick r1311 'disable nv12 avx2 for vs9/10 that dont support avx2 instructions.'
third_party/libyuv/include/libyuv/convert.h (2 changes, vendored)
@@ -71,6 +71,8 @@ int I400ToI420(const uint8* src_y, int src_stride_y,
uint8* dst_v, int dst_stride_v,
int width, int height);

#define J400ToJ420 I400ToI420

// Convert NV12 to I420.
LIBYUV_API
int NV12ToI420(const uint8* src_y, int src_stride_y,
third_party/libyuv/include/libyuv/convert_argb.h (16 changes, vendored)
@@ -68,20 +68,20 @@ int I411ToARGB(const uint8* src_y, int src_stride_y,
uint8* dst_argb, int dst_stride_argb,
int width, int height);

// Convert I400 (grey) to ARGB.
// Convert I400 (grey) to ARGB. Reverse of ARGBToI400.
LIBYUV_API
int I400ToARGB(const uint8* src_y, int src_stride_y,
uint8* dst_argb, int dst_stride_argb,
int width, int height);

// Alias.
#define YToARGB I400ToARGB_Reference

// Convert I400 to ARGB. Reverse of ARGBToI400.
// Convert J400 (jpeg grey) to ARGB.
LIBYUV_API
int I400ToARGB_Reference(const uint8* src_y, int src_stride_y,
uint8* dst_argb, int dst_stride_argb,
int width, int height);
int J400ToARGB(const uint8* src_y, int src_stride_y,
uint8* dst_argb, int dst_stride_argb,
int width, int height);

// Alias.
#define YToARGB I400ToARGB

// Convert NV12 to ARGB.
LIBYUV_API
third_party/libyuv/include/libyuv/convert_from.h (11 changes, vendored)
@@ -137,6 +137,17 @@ int I420ToRGB565(const uint8* src_y, int src_stride_y,
uint8* dst_frame, int dst_stride_frame,
int width, int height);

// Convert I420 To RGB565 with 4x4 dither matrix (16 bytes).
// Values in dither matrix from 0 to 7 recommended.
// The order of the dither matrix is first byte is upper left.

LIBYUV_API
int I420ToRGB565Dither(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
uint8* dst_frame, int dst_stride_frame,
const uint8* dither4x4, int width, int height);

LIBYUV_API
int I420ToARGB1555(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
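The new dither entry point takes the three I420 planes plus a 16-byte matrix. A minimal usage sketch follows; the wrapper function and the particular matrix values are illustrative assumptions, not something the header prescribes beyond the 0..7 range and upper-left-first ordering quoted above.

// Illustrative only: convert one I420 frame to dithered RGB565 with the new API.
#include "libyuv/convert_from.h"

int ConvertWithDither(const uint8* y, int y_stride,
                      const uint8* u, int u_stride,
                      const uint8* v, int v_stride,
                      uint8* rgb565, int rgb565_stride,
                      int width, int height) {
  // 16 bytes, first byte is the upper-left cell, values 0..7; this particular
  // ordered-dither-style pattern is only an example.
  static const uint8 kDither4x4[16] = {
    0, 4, 1, 5,
    6, 2, 7, 3,
    1, 5, 0, 4,
    7, 3, 6, 2
  };
  return libyuv::I420ToRGB565Dither(y, y_stride, u, u_stride, v, v_stride,
                                    rgb565, rgb565_stride,
                                    kDither4x4, width, height);
}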
@@ -61,12 +61,15 @@ int ARGBToRGB565(const uint8* src_argb, int src_stride_argb,
uint8* dst_rgb565, int dst_stride_rgb565,
int width, int height);

// Convert ARGB To RGB565 with 8x8 dither matrix (64 bytes).
// Values in dither matrix from 0 to 255. 128 is best for no dither.
// Convert ARGB To RGB565 with 4x4 dither matrix (16 bytes).
// Values in dither matrix from 0 to 7 recommended.
// The order of the dither matrix is first byte is upper left.
// TODO(fbarchard): Consider pointer to 2d array for dither4x4.
// const uint8(*dither)[4][4];
LIBYUV_API
int ARGBToRGB565Dither(const uint8* src_argb, int src_stride_argb,
uint8* dst_rgb565, int dst_stride_rgb565,
const uint8* dither8x8, int width, int height);
const uint8* dither4x4, int width, int height);

// Convert ARGB To ARGB1555.
LIBYUV_API
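For the ARGB variant the signature changed from a 64-byte 8x8 matrix (values 0..255) to the same 16-byte 4x4 layout (values 0..7). Reusing the example matrix from the I420 sketch above, a call would look like this; the argument names are assumed placeholders.

// Sketch: same 16-byte matrix as in the I420 example above.
libyuv::ARGBToRGB565Dither(src_argb, src_stride_argb,
                           dst_rgb565, dst_stride_rgb565,
                           kDither4x4, width, height);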
@ -140,6 +143,12 @@ int ARGBToI400(const uint8* src_argb, int src_stride_argb,
|
||||
uint8* dst_y, int dst_stride_y,
|
||||
int width, int height);
|
||||
|
||||
// Convert ARGB to G. (Reverse of J400toARGB, which replicates G back to ARGB)
|
||||
LIBYUV_API
|
||||
int ARGBToG(const uint8* src_argb, int src_stride_argb,
|
||||
uint8* dst_g, int dst_stride_g,
|
||||
int width, int height);
|
||||
|
||||
// Convert ARGB To NV12.
|
||||
LIBYUV_API
|
||||
int ARGBToNV12(const uint8* src_argb, int src_stride_argb,
|
||||
|
@ -45,6 +45,7 @@ int I400ToI400(const uint8* src_y, int src_stride_y,
|
||||
uint8* dst_y, int dst_stride_y,
|
||||
int width, int height);
|
||||
|
||||
#define J400ToJ400 I400ToI400
|
||||
|
||||
// Copy I422 to I422.
|
||||
#define I422ToI422 I422Copy
|
||||
@ -84,6 +85,18 @@ int UYVYToI422(const uint8* src_uyvy, int src_stride_uyvy,
|
||||
uint8* dst_v, int dst_stride_v,
|
||||
int width, int height);
|
||||
|
||||
LIBYUV_API
|
||||
int YUY2ToNV12(const uint8* src_yuy2, int src_stride_yuy2,
|
||||
uint8* dst_y, int dst_stride_y,
|
||||
uint8* dst_uv, int dst_stride_uv,
|
||||
int width, int height);
|
||||
|
||||
LIBYUV_API
|
||||
int UYVYToNV12(const uint8* src_uyvy, int src_stride_uyvy,
|
||||
uint8* dst_y, int dst_stride_y,
|
||||
uint8* dst_uv, int dst_stride_uv,
|
||||
int width, int height);
|
||||
|
||||
// Convert I420 to I400. (calls CopyPlane ignoring u/v).
|
||||
LIBYUV_API
|
||||
int I420ToI400(const uint8* src_y, int src_stride_y,
|
||||
@ -93,6 +106,7 @@ int I420ToI400(const uint8* src_y, int src_stride_y,
|
||||
int width, int height);
|
||||
|
||||
// Alias
|
||||
#define J420ToJ400 I420ToI400
|
||||
#define I420ToI420Mirror I420Mirror
|
||||
|
||||
// I420 mirror.
|
||||
@ -387,24 +401,24 @@ int ARGBInterpolate(const uint8* src_argb0, int src_stride_argb0,
|
||||
uint8* dst_argb, int dst_stride_argb,
|
||||
int width, int height, int interpolation);
|
||||
|
||||
#if defined(__pnacl__) || defined(__CLR_VER) || defined(COVERAGE_ENABLED) || \
|
||||
defined(TARGET_IPHONE_SIMULATOR)
|
||||
#if defined(__pnacl__) || defined(__CLR_VER) || \
|
||||
(defined(__i386__) && !defined(__SSE2__))
|
||||
#define LIBYUV_DISABLE_X86
|
||||
#endif
|
||||
// The following are available on all x86 platforms:
|
||||
#if !defined(LIBYUV_DISABLE_X86) && \
|
||||
(defined(_M_IX86) || defined(__x86_64__) || defined(__i386__))
|
||||
#define HAS_ARGBAFFINEROW_SSE2
|
||||
#endif
|
||||
|
||||
// Row functions for copying a pixels from a source with a slope to a row
|
||||
// Row function for copying pixels from a source with a slope to a row
|
||||
// of destination. Useful for scaling, rotation, mirror, texture mapping.
|
||||
LIBYUV_API
|
||||
void ARGBAffineRow_C(const uint8* src_argb, int src_argb_stride,
|
||||
uint8* dst_argb, const float* uv_dudv, int width);
|
||||
// The following are available on all x86 platforms:
|
||||
#if !defined(LIBYUV_DISABLE_X86) && \
|
||||
(defined(_M_IX86) || defined(__x86_64__) || defined(__i386__))
|
||||
LIBYUV_API
|
||||
void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride,
|
||||
uint8* dst_argb, const float* uv_dudv, int width);
|
||||
#define HAS_ARGBAFFINEROW_SSE2
|
||||
#endif // LIBYUV_DISABLE_X86
|
||||
|
||||
// Shuffle ARGB channel order. e.g. BGRA to ARGB.
|
||||
// shuffler is 16 bytes and must be aligned.
|
||||
|
third_party/libyuv/include/libyuv/rotate_row.h (new file, 138 lines, vendored)
@@ -0,0 +1,138 @@
|
||||
/*
|
||||
* Copyright 2013 The LibYuv Project Authors. All rights reserved.
|
||||
*
|
||||
* Use of this source code is governed by a BSD-style license
|
||||
* that can be found in the LICENSE file in the root of the source
|
||||
* tree. An additional intellectual property rights grant can be found
|
||||
* in the file PATENTS. All contributing project authors may
|
||||
* be found in the AUTHORS file in the root of the source tree.
|
||||
*/
|
||||
|
||||
#ifndef INCLUDE_LIBYUV_ROTATE_ROW_H_ // NOLINT
|
||||
#define INCLUDE_LIBYUV_ROTATE_ROW_H_
|
||||
|
||||
#include "libyuv/basic_types.h"
|
||||
|
||||
#ifdef __cplusplus
|
||||
namespace libyuv {
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
#if defined(__pnacl__) || defined(__CLR_VER) || \
|
||||
(defined(__i386__) && !defined(__SSE2__))
|
||||
#define LIBYUV_DISABLE_X86
|
||||
#endif
|
||||
|
||||
// Visual C 2012 required for AVX2.
|
||||
#if defined(_M_IX86) && !defined(__clang__) && \
|
||||
defined(_MSC_VER) && _MSC_VER >= 1700
|
||||
#define VISUALC_HAS_AVX2 1
|
||||
#endif // VisualStudio >= 2012
|
||||
|
||||
// TODO(fbarchard): switch to standard form of inline; fails on clangcl.
|
||||
#if !defined(LIBYUV_DISABLE_X86) && \
|
||||
(defined(_M_IX86) || defined(__x86_64__) || defined(__i386__))
|
||||
#if defined(__APPLE__) && defined(__i386__)
|
||||
#define DECLARE_FUNCTION(name) \
|
||||
".text \n" \
|
||||
".private_extern _" #name " \n" \
|
||||
".align 4,0x90 \n" \
|
||||
"_" #name ": \n"
|
||||
#elif defined(__MINGW32__) || defined(__CYGWIN__) && defined(__i386__)
|
||||
#define DECLARE_FUNCTION(name) \
|
||||
".text \n" \
|
||||
".align 4,0x90 \n" \
|
||||
"_" #name ": \n"
|
||||
#else
|
||||
#define DECLARE_FUNCTION(name) \
|
||||
".text \n" \
|
||||
".align 4,0x90 \n" \
|
||||
#name ": \n"
|
||||
#endif
|
||||
#endif
|
||||
|
||||
// The following are available for Visual C:
|
||||
#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && \
|
||||
defined(_MSC_VER) && !defined(__clang__)
|
||||
#define HAS_TRANSPOSEWX8_SSSE3
|
||||
#define HAS_TRANSPOSEUVWX8_SSE2
|
||||
#endif
|
||||
|
||||
// The following are available for GCC but not NaCL:
|
||||
#if !defined(LIBYUV_DISABLE_X86) && \
|
||||
(defined(__i386__) || (defined(__x86_64__) && !defined(__native_client__)))
|
||||
#define HAS_TRANSPOSEWX8_SSSE3
|
||||
#endif
|
||||
|
||||
// The following are available for 32 bit GCC:
|
||||
#if !defined(LIBYUV_DISABLE_X86) && defined(__i386__) && !defined(__clang__)
|
||||
#define HAS_TRANSPOSEUVWX8_SSE2
|
||||
#endif
|
||||
|
||||
// The following are available for 64 bit GCC but not NaCL:
|
||||
#if !defined(LIBYUV_DISABLE_X86) && !defined(__native_client__) && \
|
||||
defined(__x86_64__)
|
||||
#define HAS_TRANSPOSEWX8_FAST_SSSE3
|
||||
#define HAS_TRANSPOSEUVWX8_SSE2
|
||||
#endif
|
||||
|
||||
#if !defined(LIBYUV_DISABLE_NEON) && !defined(__native_client__) && \
|
||||
(defined(__ARM_NEON__) || defined(LIBYUV_NEON) || defined(__aarch64__))
|
||||
#define HAS_TRANSPOSEWX8_NEON
|
||||
#define HAS_TRANSPOSEUVWX8_NEON
|
||||
#endif
|
||||
|
||||
#if !defined(LIBYUV_DISABLE_MIPS) && !defined(__native_client__) && \
|
||||
defined(__mips__) && \
|
||||
defined(__mips_dsp) && (__mips_dsp_rev >= 2)
|
||||
#define HAS_TRANSPOSEWX8_MIPS_DSPR2
|
||||
#define HAS_TRANSPOSEUVWx8_MIPS_DSPR2
|
||||
#endif // defined(__mips__)
|
||||
|
||||
void TransposeWxH_C(const uint8* src, int src_stride,
|
||||
uint8* dst, int dst_stride, int width, int height);
|
||||
|
||||
void TransposeWx8_C(const uint8* src, int src_stride,
|
||||
uint8* dst, int dst_stride, int width);
|
||||
void TransposeWx8_NEON(const uint8* src, int src_stride,
|
||||
uint8* dst, int dst_stride, int width);
|
||||
void TransposeWx8_SSSE3(const uint8* src, int src_stride,
|
||||
uint8* dst, int dst_stride, int width);
|
||||
void TransposeWx8_Fast_SSSE3(const uint8* src, int src_stride,
|
||||
uint8* dst, int dst_stride, int width);
|
||||
void TransposeWx8_MIPS_DSPR2(const uint8* src, int src_stride,
|
||||
uint8* dst, int dst_stride, int width);
|
||||
|
||||
void TransposeWx8_Any_NEON(const uint8* src, int src_stride,
|
||||
uint8* dst, int dst_stride, int width);
|
||||
void TransposeWx8_Any_SSSE3(const uint8* src, int src_stride,
|
||||
uint8* dst, int dst_stride, int width);
|
||||
void TransposeWx8_Fast_Any_SSSE3(const uint8* src, int src_stride,
|
||||
uint8* dst, int dst_stride, int width);
|
||||
void TransposeWx8_Any_MIPS_DSPR2(const uint8* src, int src_stride,
|
||||
uint8* dst, int dst_stride, int width);
|
||||
|
||||
void TransposeUVWxH_C(const uint8* src, int src_stride,
|
||||
uint8* dst_a, int dst_stride_a,
|
||||
uint8* dst_b, int dst_stride_b,
|
||||
int width, int height);
|
||||
|
||||
void TransposeUVWx8_C(const uint8* src, int src_stride,
|
||||
uint8* dst_a, int dst_stride_a,
|
||||
uint8* dst_b, int dst_stride_b, int width);
|
||||
void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
|
||||
uint8* dst_a, int dst_stride_a,
|
||||
uint8* dst_b, int dst_stride_b, int width);
|
||||
void TransposeUVWx8_NEON(const uint8* src, int src_stride,
|
||||
uint8* dst_a, int dst_stride_a,
|
||||
uint8* dst_b, int dst_stride_b, int width);
|
||||
void TransposeUVWx8_MIPS_DSPR2(const uint8* src, int src_stride,
|
||||
uint8* dst_a, int dst_stride_a,
|
||||
uint8* dst_b, int dst_stride_b, int width);
|
||||
|
||||
#ifdef __cplusplus
|
||||
} // extern "C"
|
||||
} // namespace libyuv
|
||||
#endif
|
||||
|
||||
#endif // INCLUDE_LIBYUV_ROTATE_ROW_H_ NOLINT
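rotate_row.h only declares the per-block kernels. For orientation, here is a plain C++ restatement of what the Wx8 transpose primitives compute; this is an illustrative reference under the assumption of byte strides and row-major buffers, not libyuv's implementation.

#include <cstdint>

// Reads an 8-row by `width`-column block from src and writes it transposed,
// so dst ends up with `width` rows of 8 bytes each.
static void TransposeWx8Reference(const uint8_t* src, int src_stride,
                                  uint8_t* dst, int dst_stride, int width) {
  for (int x = 0; x < width; ++x) {
    for (int y = 0; y < 8; ++y) {
      dst[x * dst_stride + y] = src[y * src_stride + x];
    }
  }
}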
third_party/libyuv/include/libyuv/row.h (241 changes, vendored)
@@ -37,10 +37,8 @@ extern "C" {
|
||||
free(var##_mem); \
|
||||
var = 0
|
||||
|
||||
#if defined(__pnacl__) || defined(__CLR_VER) || defined(COVERAGE_ENABLED) || \
|
||||
defined(TARGET_IPHONE_SIMULATOR) || \
|
||||
(defined(__i386__) && !defined(__SSE2__)) || \
|
||||
(defined(_MSC_VER) && defined(__clang__))
|
||||
#if defined(__pnacl__) || defined(__CLR_VER) || \
|
||||
(defined(__i386__) && !defined(__SSE2__))
|
||||
#define LIBYUV_DISABLE_X86
|
||||
#endif
|
||||
// True if compiling for SSSE3 as a requirement.
|
||||
@ -48,6 +46,9 @@ extern "C" {
|
||||
#define LIBYUV_SSSE3_ONLY
|
||||
#endif
|
||||
|
||||
#if defined(__native_client__)
|
||||
#define LIBYUV_DISABLE_NEON
|
||||
#endif
|
||||
// clang >= 3.5.0 required for Arm64.
|
||||
#if defined(__clang__) && defined(__aarch64__) && !defined(LIBYUV_DISABLE_NEON)
|
||||
#if (__clang_major__ < 3) || (__clang_major__ == 3 && (__clang_minor__ < 5))
|
||||
@ -63,11 +64,11 @@ extern "C" {
|
||||
#define HAS_ABGRTOYROW_SSSE3
|
||||
#define HAS_ARGB1555TOARGBROW_SSE2
|
||||
#define HAS_ARGB4444TOARGBROW_SSE2
|
||||
#define HAS_ARGBSETROW_X86
|
||||
#define HAS_ARGBSHUFFLEROW_SSE2
|
||||
#define HAS_ARGBSHUFFLEROW_SSSE3
|
||||
#define HAS_ARGBTOARGB1555ROW_SSE2
|
||||
#define HAS_ARGBTOARGB4444ROW_SSE2
|
||||
#define HAS_ARGBTOBAYERGGROW_SSE2
|
||||
#define HAS_ARGBTORAWROW_SSSE3
|
||||
#define HAS_ARGBTORGB24ROW_SSSE3
|
||||
#define HAS_ARGBTORGB565ROW_SSE2
|
||||
@ -95,7 +96,8 @@ extern "C" {
|
||||
#define HAS_I422TOUYVYROW_SSE2
|
||||
#define HAS_I422TOYUY2ROW_SSE2
|
||||
#define HAS_I444TOARGBROW_SSSE3
|
||||
// #define HAS_J422TOARGBROW_SSSE3
|
||||
#define HAS_J400TOARGBROW_SSE2
|
||||
#define HAS_J422TOARGBROW_SSSE3
|
||||
#define HAS_MERGEUVROW_SSE2
|
||||
#define HAS_MIRRORROW_SSE2
|
||||
#define HAS_MIRRORROW_SSSE3
|
||||
@ -112,15 +114,13 @@ extern "C" {
|
||||
#define HAS_RGB565TOARGBROW_SSE2
|
||||
#define HAS_RGBATOUVROW_SSSE3
|
||||
#define HAS_RGBATOYROW_SSSE3
|
||||
#define HAS_SETROW_X86
|
||||
#define HAS_SETROW_ERMS
|
||||
#define HAS_ARGBSETROW_X86
|
||||
#define HAS_SETROW_X86
|
||||
#define HAS_SPLITUVROW_SSE2
|
||||
#define HAS_UYVYTOARGBROW_SSSE3
|
||||
#define HAS_UYVYTOUV422ROW_SSE2
|
||||
#define HAS_UYVYTOUVROW_SSE2
|
||||
#define HAS_UYVYTOYROW_SSE2
|
||||
#define HAS_YTOARGBROW_SSE2
|
||||
#define HAS_YUY2TOARGBROW_SSSE3
|
||||
#define HAS_YUY2TOUV422ROW_SSE2
|
||||
#define HAS_YUY2TOUVROW_SSE2
|
||||
@ -157,8 +157,9 @@ extern "C" {
|
||||
#define HAS_SOBELYROW_SSE2
|
||||
#endif
|
||||
|
||||
// The following are available on x64 Visual C:
|
||||
#if !defined(LIBYUV_DISABLE_X86) && defined (_M_X64)
|
||||
// The following are available on x64 Visual C and clangcl.
|
||||
#if !defined(LIBYUV_DISABLE_X86) && defined (_M_X64) && \
|
||||
(!defined(__clang__) || defined(__SSSE3__))
|
||||
#define HAS_I422TOARGBROW_SSSE3
|
||||
#endif
|
||||
|
||||
@ -177,27 +178,31 @@ extern "C" {
|
||||
#endif // __clang__
|
||||
|
||||
// Visual C 2012 required for AVX2.
|
||||
#if defined(_M_IX86) && defined(_MSC_VER) && _MSC_VER >= 1700
|
||||
#if defined(_M_IX86) && !defined(__clang__) && \
|
||||
defined(_MSC_VER) && _MSC_VER >= 1700
|
||||
#define VISUALC_HAS_AVX2 1
|
||||
#endif // VisualStudio >= 2012
|
||||
|
||||
// The following are available require VS2012. Port to GCC.
|
||||
#if !defined(LIBYUV_DISABLE_X86) && defined(VISUALC_HAS_AVX2)
|
||||
// TODO(fbarchard): fix AVX2 versions of YUV conversion. bug=393
|
||||
#define HAS_I422TOABGRROW_AVX2
|
||||
#define HAS_I422TOARGBROW_AVX2
|
||||
#define HAS_I422TOBGRAROW_AVX2
|
||||
#define HAS_I422TORGBAROW_AVX2
|
||||
#define HAS_NV12TOARGBROW_AVX2
|
||||
#define HAS_NV21TOARGBROW_AVX2
|
||||
#define HAS_ARGBTORGB565ROW_AVX2
|
||||
#define HAS_ARGB1555TOARGBROW_AVX2
|
||||
#define HAS_ARGB4444TOARGBROW_AVX2
|
||||
#define HAS_ARGBTOARGB1555ROW_AVX2
|
||||
#define HAS_ARGBTOARGB4444ROW_AVX2
|
||||
#define HAS_NV12TORGB565ROW_AVX2
|
||||
#define HAS_NV21TORGB565ROW_AVX2
|
||||
#define HAS_I422TORGB565ROW_AVX2
|
||||
#define HAS_ARGBTORGB565DITHERROW_AVX2
|
||||
#define HAS_ARGBTORGB565DITHERROW_SSE2
|
||||
#define HAS_ARGBTORGB565ROW_AVX2
|
||||
#define HAS_I411TOARGBROW_AVX2
|
||||
#define HAS_I422TOARGB1555ROW_AVX2
|
||||
#define HAS_I422TOARGB4444ROW_AVX2
|
||||
#define HAS_I422TORGB565ROW_AVX2
|
||||
#define HAS_I444TOARGBROW_AVX2
|
||||
#define HAS_J400TOARGBROW_AVX2
|
||||
#define HAS_NV12TOARGBROW_AVX2
|
||||
#define HAS_NV12TORGB565ROW_AVX2
|
||||
#define HAS_NV21TOARGBROW_AVX2
|
||||
#define HAS_NV21TORGB565ROW_AVX2
|
||||
#define HAS_RGB565TOARGBROW_AVX2
|
||||
#endif
|
||||
|
||||
// The following are available on all x86 platforms, but
|
||||
@ -214,24 +219,27 @@ extern "C" {
|
||||
#define HAS_ARGBTOYJROW_AVX2
|
||||
#define HAS_ARGBTOYROW_AVX2
|
||||
#define HAS_COPYROW_AVX
|
||||
#define HAS_I400TOARGBROW_AVX2
|
||||
#define HAS_I422TOABGRROW_AVX2
|
||||
#define HAS_I422TOARGBROW_AVX2
|
||||
#define HAS_I422TOBGRAROW_AVX2
|
||||
#define HAS_I422TORAWROW_AVX2
|
||||
#define HAS_I422TORGB24ROW_AVX2
|
||||
#define HAS_I422TORGBAROW_AVX2
|
||||
#define HAS_INTERPOLATEROW_AVX2
|
||||
#define HAS_J422TOARGBROW_AVX2
|
||||
#define HAS_MERGEUVROW_AVX2
|
||||
#define HAS_MIRRORROW_AVX2
|
||||
#define HAS_SPLITUVROW_AVX2
|
||||
#define HAS_UYVYTOARGBROW_AVX2
|
||||
#define HAS_UYVYTOUV422ROW_AVX2
|
||||
#define HAS_UYVYTOUVROW_AVX2
|
||||
#define HAS_UYVYTOYROW_AVX2
|
||||
#define HAS_YTOARGBROW_AVX2
|
||||
#define HAS_YUY2TOARGBROW_AVX2
|
||||
#define HAS_YUY2TOUV422ROW_AVX2
|
||||
#define HAS_YUY2TOUVROW_AVX2
|
||||
#define HAS_YUY2TOYROW_AVX2
|
||||
|
||||
// The following require HAS_I422TOARGBROW_AVX2
|
||||
#if defined(HAS_I422TOARGBROW_AVX2)
|
||||
#define HAS_YUY2TOARGBROW_AVX2
|
||||
#define HAS_UYVYTOARGBROW_AVX2
|
||||
#endif
|
||||
|
||||
// Effects:
|
||||
#define HAS_ARGBADDROW_AVX2
|
||||
#define HAS_ARGBATTENUATEROW_AVX2
|
||||
@ -240,22 +248,6 @@ extern "C" {
|
||||
#define HAS_ARGBUNATTENUATEROW_AVX2
|
||||
#endif
|
||||
|
||||
|
||||
// The following are Yasm x86 only:
|
||||
// TODO(fbarchard): Port AVX2 to inline.
|
||||
#if !defined(LIBYUV_DISABLE_X86) && defined(HAVE_YASM)
|
||||
(defined(_M_IX86) || defined(_M_X64) || \
|
||||
defined(__x86_64__) || defined(__i386__))
|
||||
#define HAS_MERGEUVROW_AVX2
|
||||
#define HAS_MERGEUVROW_MMX
|
||||
#define HAS_SPLITUVROW_AVX2
|
||||
#define HAS_SPLITUVROW_MMX
|
||||
#define HAS_UYVYTOYROW_AVX2
|
||||
#define HAS_UYVYTOYROW_MMX
|
||||
#define HAS_YUY2TOYROW_AVX2
|
||||
#define HAS_YUY2TOYROW_MMX
|
||||
#endif
|
||||
|
||||
// The following are disabled when SSSE3 is available:
|
||||
#if !defined(LIBYUV_DISABLE_X86) && \
|
||||
(defined(_M_IX86) || defined(__x86_64__) || defined(__i386__)) && \
|
||||
@ -278,7 +270,6 @@ extern "C" {
|
||||
#define HAS_ARGB4444TOYROW_NEON
|
||||
#define HAS_ARGBTOARGB1555ROW_NEON
|
||||
#define HAS_ARGBTOARGB4444ROW_NEON
|
||||
#define HAS_ARGBTOBAYERGGROW_NEON
|
||||
#define HAS_ARGBTORAWROW_NEON
|
||||
#define HAS_ARGBTORGB24ROW_NEON
|
||||
#define HAS_ARGBTORGB565ROW_NEON
|
||||
@ -292,7 +283,7 @@ extern "C" {
|
||||
#define HAS_BGRATOUVROW_NEON
|
||||
#define HAS_BGRATOYROW_NEON
|
||||
#define HAS_COPYROW_NEON
|
||||
#define HAS_I400TOARGBROW_NEON
|
||||
#define HAS_J400TOARGBROW_NEON
|
||||
#define HAS_I411TOARGBROW_NEON
|
||||
#define HAS_I422TOABGRROW_NEON
|
||||
#define HAS_I422TOARGB1555ROW_NEON
|
||||
@ -331,11 +322,12 @@ extern "C" {
|
||||
#define HAS_UYVYTOUV422ROW_NEON
|
||||
#define HAS_UYVYTOUVROW_NEON
|
||||
#define HAS_UYVYTOYROW_NEON
|
||||
#define HAS_YTOARGBROW_NEON
|
||||
#define HAS_I400TOARGBROW_NEON
|
||||
#define HAS_YUY2TOARGBROW_NEON
|
||||
#define HAS_YUY2TOUV422ROW_NEON
|
||||
#define HAS_YUY2TOUVROW_NEON
|
||||
#define HAS_YUY2TOYROW_NEON
|
||||
#define HAS_ARGBTORGB565DITHERROW_NEON
|
||||
|
||||
// Effects:
|
||||
#define HAS_ARGBADDROW_NEON
|
||||
@ -388,7 +380,6 @@ typedef __declspec(align(32)) int8 lvec8[32];
|
||||
typedef __declspec(align(32)) uint16 ulvec16[16];
|
||||
typedef __declspec(align(32)) uint32 ulvec32[8];
|
||||
typedef __declspec(align(32)) uint8 ulvec8[32];
|
||||
|
||||
#elif defined(__GNUC__)
|
||||
// Caveat GCC 4.2 to 4.7 have a known issue using vectors with const.
|
||||
#define SIMD_ALIGNED(var) var __attribute__((aligned(16)))
|
||||
@ -869,6 +860,11 @@ void ARGB1555ToARGBRow_SSE2(const uint8* src_argb1555, uint8* dst_argb,
|
||||
int pix);
|
||||
void ARGB4444ToARGBRow_SSE2(const uint8* src_argb4444, uint8* dst_argb,
|
||||
int pix);
|
||||
void RGB565ToARGBRow_AVX2(const uint8* src_rgb565, uint8* dst_argb, int pix);
|
||||
void ARGB1555ToARGBRow_AVX2(const uint8* src_argb1555, uint8* dst_argb,
|
||||
int pix);
|
||||
void ARGB4444ToARGBRow_AVX2(const uint8* src_argb4444, uint8* dst_argb,
|
||||
int pix);
|
||||
|
||||
void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int pix);
|
||||
void RAWToARGBRow_NEON(const uint8* src_raw, uint8* dst_argb, int pix);
|
||||
@ -884,12 +880,20 @@ void ARGB1555ToARGBRow_C(const uint8* src_argb, uint8* dst_argb, int pix);
|
||||
void ARGB4444ToARGBRow_C(const uint8* src_argb, uint8* dst_argb, int pix);
|
||||
void RGB24ToARGBRow_Any_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix);
|
||||
void RAWToARGBRow_Any_SSSE3(const uint8* src_raw, uint8* dst_argb, int pix);
|
||||
|
||||
void RGB565ToARGBRow_Any_SSE2(const uint8* src_rgb565, uint8* dst_argb,
|
||||
int pix);
|
||||
void ARGB1555ToARGBRow_Any_SSE2(const uint8* src_argb1555, uint8* dst_argb,
|
||||
int pix);
|
||||
void ARGB4444ToARGBRow_Any_SSE2(const uint8* src_argb4444, uint8* dst_argb,
|
||||
int pix);
|
||||
void RGB565ToARGBRow_Any_AVX2(const uint8* src_rgb565, uint8* dst_argb,
|
||||
int pix);
|
||||
void ARGB1555ToARGBRow_Any_AVX2(const uint8* src_argb1555, uint8* dst_argb,
|
||||
int pix);
|
||||
void ARGB4444ToARGBRow_Any_AVX2(const uint8* src_argb4444, uint8* dst_argb,
|
||||
int pix);
|
||||
|
||||
void RGB24ToARGBRow_Any_NEON(const uint8* src_rgb24, uint8* dst_argb, int pix);
|
||||
void RAWToARGBRow_Any_NEON(const uint8* src_raw, uint8* dst_argb, int pix);
|
||||
void RGB565ToARGBRow_Any_NEON(const uint8* src_rgb565, uint8* dst_argb,
|
||||
@@ -905,6 +909,13 @@ void ARGBToRGB565Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix);
void ARGBToARGB1555Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix);
void ARGBToARGB4444Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix);

void ARGBToRGB565DitherRow_C(const uint8* src_argb, uint8* dst_rgb,
const uint32 dither4, int pix);
void ARGBToRGB565DitherRow_SSE2(const uint8* src_argb, uint8* dst_rgb,
const uint32 dither4, int pix);
void ARGBToRGB565DitherRow_AVX2(const uint8* src_argb, uint8* dst_rgb,
const uint32 dither4, int pix);

void ARGBToRGB565Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix);
void ARGBToARGB1555Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix);
void ARGBToARGB4444Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix);
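These row functions now take a packed uint32 carrying one row of the 4x4 matrix instead of a pointer to the old 8x8 table. How that per-row value is built is not shown in this header; the sketch below is one plausible little-endian packing and is labeled as an assumption.

#include <cstdint>

// Assumption: scanline y uses matrix row (y & 3); its four bytes are packed
// low byte first, matching a little-endian 32-bit load of that row.
static inline uint32_t PackDitherRow(const uint8_t dither4x4[16], int y) {
  const uint8_t* row = dither4x4 + ((y & 3) * 4);
  return static_cast<uint32_t>(row[0]) |
         (static_cast<uint32_t>(row[1]) << 8) |
         (static_cast<uint32_t>(row[2]) << 16) |
         (static_cast<uint32_t>(row[3]) << 24);
}
// e.g. ARGBToRGB565DitherRow_C(src_row, dst_row, PackDitherRow(kDither4x4, y), width);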
@ -914,6 +925,8 @@ void ARGBToRAWRow_NEON(const uint8* src_argb, uint8* dst_rgb, int pix);
|
||||
void ARGBToRGB565Row_NEON(const uint8* src_argb, uint8* dst_rgb, int pix);
|
||||
void ARGBToARGB1555Row_NEON(const uint8* src_argb, uint8* dst_rgb, int pix);
|
||||
void ARGBToARGB4444Row_NEON(const uint8* src_argb, uint8* dst_rgb, int pix);
|
||||
void ARGBToRGB565DitherRow_NEON(const uint8* src_argb, uint8* dst_rgb,
|
||||
const uint32 dither4, int width);
|
||||
|
||||
void ARGBToRGBARow_C(const uint8* src_argb, uint8* dst_rgb, int pix);
|
||||
void ARGBToRGB24Row_C(const uint8* src_argb, uint8* dst_rgb, int pix);
|
||||
@ -922,14 +935,13 @@ void ARGBToRGB565Row_C(const uint8* src_argb, uint8* dst_rgb, int pix);
|
||||
void ARGBToARGB1555Row_C(const uint8* src_argb, uint8* dst_rgb, int pix);
|
||||
void ARGBToARGB4444Row_C(const uint8* src_argb, uint8* dst_rgb, int pix);
|
||||
|
||||
void ARGBToRGB565DitherRow_C(const uint8* src_argb, uint8* dst_rgb,
|
||||
const uint8* dither8x8, int pix);
|
||||
|
||||
void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix);
|
||||
void I400ToARGBRow_NEON(const uint8* src_y, uint8* dst_argb, int pix);
|
||||
void I400ToARGBRow_C(const uint8* src_y, uint8* dst_argb, int pix);
|
||||
void I400ToARGBRow_Any_SSE2(const uint8* src_y, uint8* dst_argb, int pix);
|
||||
void I400ToARGBRow_Any_NEON(const uint8* src_y, uint8* dst_argb, int pix);
|
||||
void J400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix);
|
||||
void J400ToARGBRow_AVX2(const uint8* src_y, uint8* dst_argb, int pix);
|
||||
void J400ToARGBRow_NEON(const uint8* src_y, uint8* dst_argb, int pix);
|
||||
void J400ToARGBRow_C(const uint8* src_y, uint8* dst_argb, int pix);
|
||||
void J400ToARGBRow_Any_SSE2(const uint8* src_y, uint8* dst_argb, int pix);
|
||||
void J400ToARGBRow_Any_AVX2(const uint8* src_y, uint8* dst_argb, int pix);
|
||||
void J400ToARGBRow_Any_NEON(const uint8* src_y, uint8* dst_argb, int pix);
|
||||
|
||||
void I444ToARGBRow_C(const uint8* src_y,
|
||||
const uint8* src_u,
|
||||
@ -1038,6 +1050,11 @@ void I444ToARGBRow_SSSE3(const uint8* src_y,
|
||||
const uint8* src_v,
|
||||
uint8* dst_argb,
|
||||
int width);
|
||||
void I444ToARGBRow_AVX2(const uint8* src_y,
|
||||
const uint8* src_u,
|
||||
const uint8* src_v,
|
||||
uint8* dst_argb,
|
||||
int width);
|
||||
void I422ToARGBRow_SSSE3(const uint8* src_y,
|
||||
const uint8* src_u,
|
||||
const uint8* src_v,
|
||||
@ -1048,6 +1065,11 @@ void I411ToARGBRow_SSSE3(const uint8* src_y,
|
||||
const uint8* src_v,
|
||||
uint8* dst_argb,
|
||||
int width);
|
||||
void I411ToARGBRow_AVX2(const uint8* src_y,
|
||||
const uint8* src_u,
|
||||
const uint8* src_v,
|
||||
uint8* dst_argb,
|
||||
int width);
|
||||
void NV12ToARGBRow_SSSE3(const uint8* src_y,
|
||||
const uint8* src_uv,
|
||||
uint8* dst_argb,
|
||||
@ -1097,6 +1119,11 @@ void J422ToARGBRow_SSSE3(const uint8* src_y,
|
||||
const uint8* src_v,
|
||||
uint8* dst_argb,
|
||||
int width);
|
||||
void J422ToARGBRow_AVX2(const uint8* src_y,
|
||||
const uint8* src_u,
|
||||
const uint8* src_v,
|
||||
uint8* dst_argb,
|
||||
int width);
|
||||
void I422ToBGRARow_SSSE3(const uint8* src_y,
|
||||
const uint8* src_u,
|
||||
const uint8* src_v,
|
||||
@ -1147,11 +1174,21 @@ void I422ToRGB24Row_SSSE3(const uint8* src_y,
|
||||
const uint8* src_v,
|
||||
uint8* dst_rgb24,
|
||||
int width);
|
||||
void I422ToRGB24Row_AVX2(const uint8* src_y,
|
||||
const uint8* src_u,
|
||||
const uint8* src_v,
|
||||
uint8* dst_rgb24,
|
||||
int width);
|
||||
void I422ToRAWRow_SSSE3(const uint8* src_y,
|
||||
const uint8* src_u,
|
||||
const uint8* src_v,
|
||||
uint8* dst_raw,
|
||||
int width);
|
||||
void I422ToRAWRow_AVX2(const uint8* src_y,
|
||||
const uint8* src_u,
|
||||
const uint8* src_v,
|
||||
uint8* dst_raw,
|
||||
int width);
|
||||
void I422ToARGBRow_Any_AVX2(const uint8* src_y,
|
||||
const uint8* src_u,
|
||||
const uint8* src_v,
|
||||
@ -1177,6 +1214,11 @@ void I444ToARGBRow_Any_SSSE3(const uint8* src_y,
|
||||
const uint8* src_v,
|
||||
uint8* dst_argb,
|
||||
int width);
|
||||
void I444ToARGBRow_Any_AVX2(const uint8* src_y,
|
||||
const uint8* src_u,
|
||||
const uint8* src_v,
|
||||
uint8* dst_argb,
|
||||
int width);
|
||||
void I422ToARGBRow_Any_SSSE3(const uint8* src_y,
|
||||
const uint8* src_u,
|
||||
const uint8* src_v,
|
||||
@ -1187,6 +1229,11 @@ void I411ToARGBRow_Any_SSSE3(const uint8* src_y,
|
||||
const uint8* src_v,
|
||||
uint8* dst_argb,
|
||||
int width);
|
||||
void I411ToARGBRow_Any_AVX2(const uint8* src_y,
|
||||
const uint8* src_u,
|
||||
const uint8* src_v,
|
||||
uint8* dst_argb,
|
||||
int width);
|
||||
void NV12ToARGBRow_Any_SSSE3(const uint8* src_y,
|
||||
const uint8* src_uv,
|
||||
uint8* dst_argb,
|
||||
@ -1231,6 +1278,16 @@ void YUY2ToARGBRow_Any_AVX2(const uint8* src_yuy2,
|
||||
void UYVYToARGBRow_Any_AVX2(const uint8* src_uyvy,
|
||||
uint8* dst_argb,
|
||||
int width);
|
||||
void J422ToARGBRow_Any_SSSE3(const uint8* src_y,
|
||||
const uint8* src_u,
|
||||
const uint8* src_v,
|
||||
uint8* dst_argb,
|
||||
int width);
|
||||
void J422ToARGBRow_Any_AVX2(const uint8* src_y,
|
||||
const uint8* src_u,
|
||||
const uint8* src_v,
|
||||
uint8* dst_argb,
|
||||
int width);
|
||||
void I422ToBGRARow_Any_SSSE3(const uint8* src_y,
|
||||
const uint8* src_u,
|
||||
const uint8* src_v,
|
||||
@ -1281,33 +1338,29 @@ void I422ToRGB24Row_Any_SSSE3(const uint8* src_y,
|
||||
const uint8* src_v,
|
||||
uint8* dst_argb,
|
||||
int width);
|
||||
void I422ToRGB24Row_Any_AVX2(const uint8* src_y,
|
||||
const uint8* src_u,
|
||||
const uint8* src_v,
|
||||
uint8* dst_argb,
|
||||
int width);
|
||||
void I422ToRAWRow_Any_SSSE3(const uint8* src_y,
|
||||
const uint8* src_u,
|
||||
const uint8* src_v,
|
||||
uint8* dst_argb,
|
||||
int width);
|
||||
void I422ToRAWRow_Any_AVX2(const uint8* src_y,
|
||||
const uint8* src_u,
|
||||
const uint8* src_v,
|
||||
uint8* dst_argb,
|
||||
int width);
|
||||
|
||||
void YToARGBRow_C(const uint8* src_y,
|
||||
uint8* dst_argb,
|
||||
int width);
|
||||
void YToARGBRow_SSE2(const uint8* src_y,
|
||||
uint8* dst_argb,
|
||||
int width);
|
||||
void YToARGBRow_AVX2(const uint8* src_y,
|
||||
uint8* dst_argb,
|
||||
int width);
|
||||
void YToARGBRow_NEON(const uint8* src_y,
|
||||
uint8* dst_argb,
|
||||
int width);
|
||||
void YToARGBRow_Any_SSE2(const uint8* src_y,
|
||||
uint8* dst_argb,
|
||||
int width);
|
||||
void YToARGBRow_Any_AVX2(const uint8* src_y,
|
||||
uint8* dst_argb,
|
||||
int width);
|
||||
void YToARGBRow_Any_NEON(const uint8* src_y,
|
||||
uint8* dst_argb,
|
||||
int width);
|
||||
void I400ToARGBRow_C(const uint8* src_y, uint8* dst_argb, int width);
|
||||
void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int width);
|
||||
void I400ToARGBRow_AVX2(const uint8* src_y, uint8* dst_argb, int width);
|
||||
void I400ToARGBRow_NEON(const uint8* src_y, uint8* dst_argb, int width);
|
||||
void I400ToARGBRow_Any_SSE2(const uint8* src_y, uint8* dst_argb, int width);
|
||||
void I400ToARGBRow_Any_AVX2(const uint8* src_y, uint8* dst_argb, int width);
|
||||
void I400ToARGBRow_Any_NEON(const uint8* src_y, uint8* dst_argb, int width);
|
||||
|
||||
// ARGB preattenuated alpha blend.
|
||||
void ARGBBlendRow_SSSE3(const uint8* src_argb, const uint8* src_argb1,
|
||||
@ -1375,6 +1428,11 @@ void ARGBToRGB565Row_Any_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix);
|
||||
void ARGBToARGB1555Row_Any_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix);
|
||||
void ARGBToARGB4444Row_Any_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix);
|
||||
|
||||
void ARGBToRGB565DitherRow_Any_SSE2(const uint8* src_argb, uint8* dst_rgb,
|
||||
const uint32 dither4, int pix);
|
||||
void ARGBToRGB565DitherRow_Any_AVX2(const uint8* src_argb, uint8* dst_rgb,
|
||||
const uint32 dither4, int pix);
|
||||
|
||||
void ARGBToRGB565Row_Any_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix);
|
||||
void ARGBToARGB1555Row_Any_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix);
|
||||
void ARGBToARGB4444Row_Any_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix);
|
||||
@ -1384,6 +1442,8 @@ void ARGBToRAWRow_Any_NEON(const uint8* src_argb, uint8* dst_rgb, int pix);
|
||||
void ARGBToRGB565Row_Any_NEON(const uint8* src_argb, uint8* dst_rgb, int pix);
|
||||
void ARGBToARGB1555Row_Any_NEON(const uint8* src_argb, uint8* dst_rgb, int pix);
|
||||
void ARGBToARGB4444Row_Any_NEON(const uint8* src_argb, uint8* dst_rgb, int pix);
|
||||
void ARGBToRGB565DitherRow_Any_NEON(const uint8* src_argb, uint8* dst_rgb,
|
||||
const uint32 dither4, int width);
|
||||
|
||||
void I444ToARGBRow_Any_NEON(const uint8* src_y,
|
||||
const uint8* src_u,
|
||||
@ -1570,17 +1630,6 @@ void UYVYToUVRow_Any_NEON(const uint8* src_uyvy, int stride_uyvy,
|
||||
void UYVYToUV422Row_Any_NEON(const uint8* src_uyvy,
|
||||
uint8* dst_u, uint8* dst_v, int pix);
|
||||
|
||||
void ARGBToBayerGGRow_C(const uint8* src_argb, uint8* dst_bayer,
|
||||
uint32 /* selector */, int pix);
|
||||
void ARGBToBayerGGRow_SSE2(const uint8* src_argb, uint8* dst_bayer,
|
||||
uint32 /* selector */, int pix);
|
||||
void ARGBToBayerGGRow_NEON(const uint8* src_argb, uint8* dst_bayer,
|
||||
uint32 /* selector */, int pix);
|
||||
void ARGBToBayerGGRow_Any_SSE2(const uint8* src_argb, uint8* dst_bayer,
|
||||
uint32 /* selector */, int pix);
|
||||
void ARGBToBayerGGRow_Any_NEON(const uint8* src_argb, uint8* dst_bayer,
|
||||
uint32 /* selector */, int pix);
|
||||
|
||||
void I422ToYUY2Row_C(const uint8* src_y,
|
||||
const uint8* src_u,
|
||||
const uint8* src_v,
|
||||
@ -1770,6 +1819,18 @@ void SobelXYRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
|
||||
uint8* dst_argb, int width);
|
||||
void SobelXYRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
|
||||
uint8* dst_argb, int width);
|
||||
void SobelRow_Any_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
|
||||
uint8* dst_argb, int width);
|
||||
void SobelRow_Any_NEON(const uint8* src_sobelx, const uint8* src_sobely,
|
||||
uint8* dst_argb, int width);
|
||||
void SobelToPlaneRow_Any_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
|
||||
uint8* dst_y, int width);
|
||||
void SobelToPlaneRow_Any_NEON(const uint8* src_sobelx, const uint8* src_sobely,
|
||||
uint8* dst_y, int width);
|
||||
void SobelXYRow_Any_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
|
||||
uint8* dst_argb, int width);
|
||||
void SobelXYRow_Any_NEON(const uint8* src_sobelx, const uint8* src_sobely,
|
||||
uint8* dst_argb, int width);
|
||||
|
||||
void ARGBPolynomialRow_C(const uint8* src_argb,
|
||||
uint8* dst_argb, const float* poly,
|
||||
|
third_party/libyuv/include/libyuv/scale_row.h (238 changes, vendored)
@@ -12,45 +12,66 @@
|
||||
#define INCLUDE_LIBYUV_SCALE_ROW_H_
|
||||
|
||||
#include "libyuv/basic_types.h"
|
||||
#include "libyuv/scale.h"
|
||||
|
||||
#ifdef __cplusplus
|
||||
namespace libyuv {
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
#if defined(__pnacl__) || defined(__CLR_VER) || defined(COVERAGE_ENABLED) || \
|
||||
defined(TARGET_IPHONE_SIMULATOR)
|
||||
#if defined(__pnacl__) || defined(__CLR_VER) || \
|
||||
(defined(__i386__) && !defined(__SSE2__))
|
||||
#define LIBYUV_DISABLE_X86
|
||||
#endif
|
||||
|
||||
// Visual C 2012 required for AVX2.
|
||||
#if defined(_M_IX86) && !defined(__clang__) && \
|
||||
defined(_MSC_VER) && _MSC_VER >= 1700
|
||||
#define VISUALC_HAS_AVX2 1
|
||||
#endif // VisualStudio >= 2012
|
||||
|
||||
// The following are available on all x86 platforms:
|
||||
#if !defined(LIBYUV_DISABLE_X86) && \
|
||||
(defined(_M_IX86) || defined(__x86_64__) || defined(__i386__))
|
||||
#define HAS_SCALEROWDOWN2_SSE2
|
||||
#define HAS_SCALEROWDOWN4_SSE2
|
||||
#define HAS_SCALEROWDOWN34_SSSE3
|
||||
#define HAS_SCALEROWDOWN38_SSSE3
|
||||
#define HAS_SCALEADDROWS_SSE2
|
||||
#define HAS_SCALEFILTERCOLS_SSSE3
|
||||
#define HAS_SCALECOLSUP2_SSE2
|
||||
#define HAS_FIXEDDIV1_X86
|
||||
#define HAS_FIXEDDIV_X86
|
||||
#define HAS_SCALEARGBCOLS_SSE2
|
||||
#define HAS_SCALEARGBCOLSUP2_SSE2
|
||||
#define HAS_SCALEARGBFILTERCOLS_SSSE3
|
||||
#define HAS_SCALEARGBROWDOWN2_SSE2
|
||||
#define HAS_SCALEARGBROWDOWNEVEN_SSE2
|
||||
#define HAS_SCALEARGBCOLS_SSE2
|
||||
#define HAS_SCALEARGBFILTERCOLS_SSSE3
|
||||
#define HAS_SCALEARGBCOLSUP2_SSE2
|
||||
#define HAS_FIXEDDIV_X86
|
||||
#define HAS_FIXEDDIV1_X86
|
||||
#define HAS_SCALECOLSUP2_SSE2
|
||||
#define HAS_SCALEFILTERCOLS_SSSE3
|
||||
#define HAS_SCALEROWDOWN2_SSE2
|
||||
#define HAS_SCALEROWDOWN34_SSSE3
|
||||
#define HAS_SCALEROWDOWN38_SSSE3
|
||||
#define HAS_SCALEROWDOWN4_SSE2
|
||||
#endif
|
||||
|
||||
// The following are available on VS2012:
|
||||
#if !defined(LIBYUV_DISABLE_X86) && defined(VISUALC_HAS_AVX2)
|
||||
#define HAS_SCALEADDROW_AVX2
|
||||
#define HAS_SCALEROWDOWN2_AVX2
|
||||
#define HAS_SCALEROWDOWN4_AVX2
|
||||
#endif
|
||||
|
||||
// The following are available on Visual C:
|
||||
#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && !defined(__clang__)
|
||||
#define HAS_SCALEADDROW_SSE2
|
||||
#endif
|
||||
|
||||
// The following are available on Neon platforms:
|
||||
#if !defined(LIBYUV_DISABLE_NEON) && !defined(__native_client__) && \
|
||||
(defined(__ARM_NEON__) || defined(LIBYUV_NEON) || defined(__aarch64__))
|
||||
#define HAS_SCALEARGBCOLS_NEON
|
||||
#define HAS_SCALEARGBROWDOWN2_NEON
|
||||
#define HAS_SCALEARGBROWDOWNEVEN_NEON
|
||||
#define HAS_SCALEFILTERCOLS_NEON
|
||||
#define HAS_SCALEROWDOWN2_NEON
|
||||
#define HAS_SCALEROWDOWN4_NEON
|
||||
#define HAS_SCALEROWDOWN34_NEON
|
||||
#define HAS_SCALEROWDOWN38_NEON
|
||||
#define HAS_SCALEARGBROWDOWNEVEN_NEON
|
||||
#define HAS_SCALEARGBROWDOWN2_NEON
|
||||
#define HAS_SCALEROWDOWN4_NEON
|
||||
#define HAS_SCALEARGBFILTERCOLS_NEON
|
||||
#endif
|
||||
|
||||
// The following are available on Mips platforms:
|
||||
@ -164,10 +185,8 @@ void ScaleRowDown38_2_Box_C(const uint8* src_ptr, ptrdiff_t src_stride,
|
||||
uint8* dst_ptr, int dst_width);
|
||||
void ScaleRowDown38_2_Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
|
||||
uint16* dst_ptr, int dst_width);
|
||||
void ScaleAddRows_C(const uint8* src_ptr, ptrdiff_t src_stride,
|
||||
uint16* dst_ptr, int src_width, int src_height);
|
||||
void ScaleAddRows_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
|
||||
uint32* dst_ptr, int src_width, int src_height);
|
||||
void ScaleAddRow_C(const uint8* src_ptr, uint16* dst_ptr, int src_width);
|
||||
void ScaleAddRow_16_C(const uint16* src_ptr, uint32* dst_ptr, int src_width);
|
||||
void ScaleARGBRowDown2_C(const uint8* src_argb,
|
||||
ptrdiff_t src_stride,
|
||||
uint8* dst_argb, int dst_width);
|
||||
@ -194,16 +213,28 @@ void ScaleARGBFilterCols_C(uint8* dst_argb, const uint8* src_argb,
|
||||
void ScaleARGBFilterCols64_C(uint8* dst_argb, const uint8* src_argb,
|
||||
int dst_width, int x, int dx);
|
||||
|
||||
// Specialized scalers for x86.
|
||||
void ScaleRowDown2_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
|
||||
uint8* dst_ptr, int dst_width);
|
||||
void ScaleRowDown2Linear_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
|
||||
uint8* dst_ptr, int dst_width);
|
||||
void ScaleRowDown2Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
|
||||
uint8* dst_ptr, int dst_width);
|
||||
void ScaleRowDown2_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
|
||||
uint8* dst_ptr, int dst_width);
|
||||
void ScaleRowDown2Linear_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
|
||||
uint8* dst_ptr, int dst_width);
|
||||
void ScaleRowDown2Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
|
||||
uint8* dst_ptr, int dst_width);
|
||||
void ScaleRowDown4_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
|
||||
uint8* dst_ptr, int dst_width);
|
||||
void ScaleRowDown4Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
|
||||
uint8* dst_ptr, int dst_width);
|
||||
void ScaleRowDown4_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
|
||||
uint8* dst_ptr, int dst_width);
|
||||
void ScaleRowDown4Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
|
||||
uint8* dst_ptr, int dst_width);
|
||||
|
||||
void ScaleRowDown34_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
|
||||
uint8* dst_ptr, int dst_width);
|
||||
void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr,
|
||||
@ -220,46 +251,124 @@ void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr,
|
||||
void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr,
|
||||
ptrdiff_t src_stride,
|
||||
uint8* dst_ptr, int dst_width);
|
||||
void ScaleAddRows_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
|
||||
uint16* dst_ptr, int src_width,
|
||||
int src_height);
|
||||
void ScaleRowDown2_Any_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
|
||||
uint8* dst_ptr, int dst_width);
|
||||
void ScaleRowDown2Linear_Any_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
|
||||
uint8* dst_ptr, int dst_width);
|
||||
void ScaleRowDown2Box_Any_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
|
||||
uint8* dst_ptr, int dst_width);
|
||||
void ScaleRowDown2_Any_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
|
||||
uint8* dst_ptr, int dst_width);
|
||||
void ScaleRowDown2Linear_Any_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
|
||||
uint8* dst_ptr, int dst_width);
|
||||
void ScaleRowDown2Box_Any_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
|
||||
uint8* dst_ptr, int dst_width);
|
||||
void ScaleRowDown4_Any_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
|
||||
uint8* dst_ptr, int dst_width);
|
||||
void ScaleRowDown4Box_Any_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
|
||||
uint8* dst_ptr, int dst_width);
|
||||
void ScaleRowDown4_Any_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
|
||||
uint8* dst_ptr, int dst_width);
|
||||
void ScaleRowDown4Box_Any_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
|
||||
uint8* dst_ptr, int dst_width);
|
||||
|
||||
void ScaleRowDown34_Any_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
|
||||
uint8* dst_ptr, int dst_width);
|
||||
void ScaleRowDown34_1_Box_Any_SSSE3(const uint8* src_ptr,
|
||||
ptrdiff_t src_stride,
|
||||
uint8* dst_ptr, int dst_width);
|
||||
void ScaleRowDown34_0_Box_Any_SSSE3(const uint8* src_ptr,
|
||||
ptrdiff_t src_stride,
|
||||
uint8* dst_ptr, int dst_width);
|
||||
void ScaleRowDown38_Any_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
|
||||
uint8* dst_ptr, int dst_width);
|
||||
void ScaleRowDown38_3_Box_Any_SSSE3(const uint8* src_ptr,
|
||||
ptrdiff_t src_stride,
|
||||
uint8* dst_ptr, int dst_width);
|
||||
void ScaleRowDown38_2_Box_Any_SSSE3(const uint8* src_ptr,
|
||||
ptrdiff_t src_stride,
|
||||
uint8* dst_ptr, int dst_width);
|
||||
|
||||
void ScaleAddRow_SSE2(const uint8* src_ptr, uint16* dst_ptr, int src_width);
|
||||
void ScaleAddRow_AVX2(const uint8* src_ptr, uint16* dst_ptr, int src_width);
|
||||
void ScaleAddRow_Any_SSE2(const uint8* src_ptr, uint16* dst_ptr, int src_width);
|
||||
void ScaleAddRow_Any_AVX2(const uint8* src_ptr, uint16* dst_ptr, int src_width);
|
||||
|
||||
void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
|
||||
int dst_width, int x, int dx);
|
||||
void ScaleColsUp2_SSE2(uint8* dst_ptr, const uint8* src_ptr,
|
||||
int dst_width, int x, int dx);
|
||||
void ScaleARGBRowDown2_SSE2(const uint8* src_argb,
|
||||
ptrdiff_t src_stride,
|
||||
uint8* dst_argb, int dst_width);
|
||||
void ScaleARGBRowDown2Linear_SSE2(const uint8* src_argb,
|
||||
ptrdiff_t src_stride,
|
||||
uint8* dst_argb, int dst_width);
|
||||
void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb,
|
||||
ptrdiff_t src_stride,
|
||||
uint8* dst_argb, int dst_width);
|
||||
void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
|
||||
int src_stepx,
|
||||
uint8* dst_argb, int dst_width);
|
||||
void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb,
|
||||
ptrdiff_t src_stride,
|
||||
int src_stepx,
|
||||
uint8* dst_argb, int dst_width);
|
||||
|
||||
|
||||
// ARGB Column functions
|
||||
void ScaleARGBCols_SSE2(uint8* dst_argb, const uint8* src_argb,
|
||||
int dst_width, int x, int dx);
|
||||
void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb,
|
||||
int dst_width, int x, int dx);
|
||||
void ScaleARGBColsUp2_SSE2(uint8* dst_argb, const uint8* src_argb,
|
||||
int dst_width, int x, int dx);
|
||||
// Row functions.
|
||||
void ScaleARGBFilterCols_NEON(uint8* dst_argb, const uint8* src_argb,
|
||||
int dst_width, int x, int dx);
|
||||
void ScaleARGBCols_NEON(uint8* dst_argb, const uint8* src_argb,
|
||||
int dst_width, int x, int dx);
|
||||
void ScaleARGBFilterCols_Any_NEON(uint8* dst_argb, const uint8* src_argb,
|
||||
int dst_width, int x, int dx);
|
||||
void ScaleARGBCols_Any_NEON(uint8* dst_argb, const uint8* src_argb,
|
||||
int dst_width, int x, int dx);
|
||||
|
||||
// ARGB Row functions
|
||||
void ScaleARGBRowDown2_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
|
||||
uint8* dst_argb, int dst_width);
|
||||
void ScaleARGBRowDown2Linear_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
|
||||
uint8* dst_argb, int dst_width);
|
||||
void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
|
||||
uint8* dst_argb, int dst_width);
|
||||
void ScaleARGBRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
|
||||
uint8* dst, int dst_width);
|
||||
void ScaleARGBRowDown2Linear_NEON(const uint8* src_argb, ptrdiff_t src_stride,
|
||||
uint8* dst_argb, int dst_width);
|
||||
void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
|
||||
uint8* dst, int dst_width);
|
||||
void ScaleARGBRowDown2_Any_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
|
||||
uint8* dst_argb, int dst_width);
|
||||
void ScaleARGBRowDown2Linear_Any_SSE2(const uint8* src_argb,
|
||||
ptrdiff_t src_stride,
|
||||
uint8* dst_argb, int dst_width);
|
||||
void ScaleARGBRowDown2Box_Any_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
|
||||
uint8* dst_argb, int dst_width);
|
||||
void ScaleARGBRowDown2_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
|
||||
uint8* dst, int dst_width);
|
||||
void ScaleARGBRowDown2Linear_Any_NEON(const uint8* src_argb,
|
||||
ptrdiff_t src_stride,
|
||||
uint8* dst_argb, int dst_width);
|
||||
void ScaleARGBRowDown2Box_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
|
||||
uint8* dst, int dst_width);
|
||||
|
||||
void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
|
||||
int src_stepx, uint8* dst_argb, int dst_width);
|
||||
void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
|
||||
int src_stepx,
|
||||
uint8* dst_argb, int dst_width);
|
||||
void ScaleARGBRowDownEven_NEON(const uint8* src_argb, ptrdiff_t src_stride,
|
||||
int src_stepx,
|
||||
uint8* dst_argb, int dst_width);
|
||||
void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb, ptrdiff_t src_stride,
|
||||
int src_stepx,
|
||||
uint8* dst_argb, int dst_width);
|
||||
void ScaleARGBRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
|
||||
uint8* dst, int dst_width);
|
||||
void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
|
||||
uint8* dst, int dst_width);
|
||||
void ScaleARGBRowDownEven_Any_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
|
||||
int src_stepx,
|
||||
uint8* dst_argb, int dst_width);
|
||||
void ScaleARGBRowDownEvenBox_Any_SSE2(const uint8* src_argb,
|
||||
ptrdiff_t src_stride,
|
||||
int src_stepx,
|
||||
uint8* dst_argb, int dst_width);
|
||||
void ScaleARGBRowDownEven_Any_NEON(const uint8* src_argb, ptrdiff_t src_stride,
|
||||
int src_stepx,
|
||||
uint8* dst_argb, int dst_width);
|
||||
void ScaleARGBRowDownEvenBox_Any_NEON(const uint8* src_argb,
|
||||
ptrdiff_t src_stride,
|
||||
int src_stepx,
|
||||
uint8* dst_argb, int dst_width);
|
||||
|
||||
// ScaleRowDown2Box also used by planar functions
|
||||
// NEON downscalers with interpolation.
|
||||
@ -267,7 +376,8 @@ void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
|
||||
// Note - not static due to reuse in convert for 444 to 420.
|
||||
void ScaleRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
|
||||
uint8* dst, int dst_width);
|
||||
|
||||
void ScaleRowDown2Linear_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
|
||||
uint8* dst, int dst_width);
|
||||
void ScaleRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
|
||||
uint8* dst, int dst_width);
|
||||
|
||||
@ -302,6 +412,42 @@ void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr,
|
||||
ptrdiff_t src_stride,
|
||||
uint8* dst_ptr, int dst_width);
|
||||
|
||||
void ScaleRowDown2_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
|
||||
uint8* dst, int dst_width);
|
||||
void ScaleRowDown2Linear_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
|
||||
uint8* dst, int dst_width);
|
||||
void ScaleRowDown2Box_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
|
||||
uint8* dst, int dst_width);
|
||||
void ScaleRowDown4_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
|
||||
uint8* dst_ptr, int dst_width);
|
||||
void ScaleRowDown4Box_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
|
||||
uint8* dst_ptr, int dst_width);
|
||||
void ScaleRowDown34_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
|
||||
uint8* dst_ptr, int dst_width);
|
||||
void ScaleRowDown34_0_Box_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
|
||||
uint8* dst_ptr, int dst_width);
|
||||
void ScaleRowDown34_1_Box_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
|
||||
uint8* dst_ptr, int dst_width);
|
||||
// 32 -> 12
|
||||
void ScaleRowDown38_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
|
||||
uint8* dst_ptr, int dst_width);
|
||||
// 32x3 -> 12x1
|
||||
void ScaleRowDown38_3_Box_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
|
||||
uint8* dst_ptr, int dst_width);
|
||||
// 32x2 -> 12x1
|
||||
void ScaleRowDown38_2_Box_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
|
||||
uint8* dst_ptr, int dst_width);
|
||||
|
||||
void ScaleAddRow_NEON(const uint8* src_ptr, uint16* dst_ptr, int src_width);
|
||||
void ScaleAddRow_Any_NEON(const uint8* src_ptr, uint16* dst_ptr, int src_width);
|
||||
|
||||
void ScaleFilterCols_NEON(uint8* dst_ptr, const uint8* src_ptr,
|
||||
int dst_width, int x, int dx);
|
||||
|
||||
void ScaleFilterCols_Any_NEON(uint8* dst_ptr, const uint8* src_ptr,
|
||||
int dst_width, int x, int dx);
|
||||
|
||||
|
||||
void ScaleRowDown2_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
|
||||
uint8* dst, int dst_width);
|
||||
void ScaleRowDown2Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
|
||||
|
third_party/libyuv/include/libyuv/version.h (2 changes, vendored)
@@ -11,6 +11,6 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT
#define INCLUDE_LIBYUV_VERSION_H_

#define LIBYUV_VERSION 1305
#define LIBYUV_VERSION 1456

#endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT
third_party/libyuv/source/compare.cc (6 changes, vendored)
@@ -37,7 +37,7 @@ uint32 HashDjb2_C(const uint8* src, int count, uint32 seed);
#define HAS_HASHDJB2_SSE41
uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed);

#if _MSC_VER >= 1700
#ifdef VISUALC_HAS_AVX2
#define HAS_HASHDJB2_AVX2
uint32 HashDjb2_AVX2(const uint8* src, int count, uint32 seed);
#endif
@@ -138,8 +138,8 @@ uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count);
#define HAS_SUMSQUAREERROR_SSE2
uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count);
#endif
// Visual C 2012 required for AVX2.
#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && _MSC_VER >= 1700

#ifdef VISUALC_HAS_AVX2
#define HAS_SUMSQUAREERROR_AVX2
uint32 SumSquareError_AVX2(const uint8* src_a, const uint8* src_b, int count);
#endif
third_party/libyuv/source/compare_neon64.cc (2 changes, vendored)
@@ -32,7 +32,7 @@ uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count) {
"ld1 {v0.16b}, [%0], #16 \n"
MEMACCESS(1)
"ld1 {v1.16b}, [%1], #16 \n"
"subs %2, %2, #16 \n"
"subs %w2, %w2, #16 \n"
"usubl v2.8h, v0.8b, v1.8b \n"
"usubl2 v3.8h, v0.16b, v1.16b \n"
"smlal v16.4s, v2.4h, v2.4h \n"
third_party/libyuv/source/compare_win.cc (15 changes, vendored)
@@ -16,9 +16,11 @@ namespace libyuv {
extern "C" {
#endif

#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER)
// This module is for Visual C x86.
#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && \
defined(_MSC_VER) && !defined(__clang__)

__declspec(naked) __declspec(align(16))
__declspec(naked)
uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count) {
__asm {
mov eax, [esp + 4] // src_a
@@ -59,7 +61,7 @@ uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count) {
#if _MSC_VER >= 1700
// C4752: found Intel(R) Advanced Vector Extensions; consider using /arch:AVX.
#pragma warning(disable: 4752)
__declspec(naked) __declspec(align(16))
__declspec(naked)
uint32 SumSquareError_AVX2(const uint8* src_a, const uint8* src_b, int count) {
__asm {
mov eax, [esp + 4] // src_a
@@ -133,7 +135,7 @@ static uvec32 kHashMul3 = {
#define pmulld(reg) _asm _emit 0x66 _asm _emit 0x0F _asm _emit 0x38 \
_asm _emit 0x40 _asm _emit reg

__declspec(naked) __declspec(align(16))
__declspec(naked)
uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed) {
__asm {
mov eax, [esp + 4] // src
@@ -184,7 +186,7 @@ uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed) {

// Visual C 2012 required for AVX2.
#if _MSC_VER >= 1700
__declspec(naked) __declspec(align(16))
__declspec(naked)
uint32 HashDjb2_AVX2(const uint8* src, int count, uint32 seed) {
__asm {
mov eax, [esp + 4] // src
@@ -219,8 +221,7 @@ uint32 HashDjb2_AVX2(const uint8* src, int count, uint32 seed) {
}
}
#endif // _MSC_VER >= 1700

#endif // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER)
#endif // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86)

#ifdef __cplusplus
} // extern "C"
third_party/libyuv/source/convert.cc (237 changes, vendored)
@@ -817,22 +817,20 @@ int RGB24ToI420(const uint8* src_rgb24, int src_stride_rgb24,
|
||||
src_stride_rgb24 = -src_stride_rgb24;
|
||||
}
|
||||
|
||||
// Neon version does direct RGB24 to YUV.
|
||||
#if defined(HAS_RGB24TOYROW_NEON)
|
||||
if (TestCpuFlag(kCpuHasNEON)) {
|
||||
RGB24ToUVRow = RGB24ToUVRow_Any_NEON;
|
||||
RGB24ToYRow = RGB24ToYRow_Any_NEON;
|
||||
if (IS_ALIGNED(width, 8)) {
|
||||
RGB24ToYRow = RGB24ToYRow_NEON;
|
||||
if (IS_ALIGNED(width, 16)) {
|
||||
RGB24ToUVRow = RGB24ToUVRow_NEON;
|
||||
}
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_RGB24TOUVROW_NEON)
|
||||
if (TestCpuFlag(kCpuHasNEON)) {
|
||||
RGB24ToUVRow = RGB24ToUVRow_Any_NEON;
|
||||
if (IS_ALIGNED(width, 16)) {
|
||||
RGB24ToUVRow = RGB24ToUVRow_NEON;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
// Other platforms do intermediate conversion from RGB24 to ARGB.
|
||||
#else
|
||||
#if defined(HAS_RGB24TOARGBROW_SSSE3)
|
||||
if (TestCpuFlag(kCpuHasSSSE3)) {
|
||||
RGB24ToARGBRow = RGB24ToARGBRow_Any_SSSE3;
|
||||
@ -841,27 +839,29 @@ int RGB24ToI420(const uint8* src_rgb24, int src_stride_rgb24,
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_ARGBTOUVROW_SSSE3)
|
||||
#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3)
|
||||
if (TestCpuFlag(kCpuHasSSSE3)) {
|
||||
ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
|
||||
if (IS_ALIGNED(width, 16)) {
|
||||
ARGBToUVRow = ARGBToUVRow_SSSE3;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_ARGBTOUVROW_SSSE3)
|
||||
if (TestCpuFlag(kCpuHasSSSE3)) {
|
||||
ARGBToYRow = ARGBToYRow_Any_SSSE3;
|
||||
if (IS_ALIGNED(width, 16)) {
|
||||
ARGBToUVRow = ARGBToUVRow_SSSE3;
|
||||
ARGBToYRow = ARGBToYRow_SSSE3;
|
||||
}
|
||||
}
|
||||
#endif // HAS_ARGBTOUVROW_SSSE3
|
||||
|
||||
#endif
|
||||
#if defined(HAS_ARGBTOYROW_AVX2) && defined(HAS_ARGBTOUVROW_AVX2)
|
||||
if (TestCpuFlag(kCpuHasAVX2)) {
|
||||
ARGBToUVRow = ARGBToUVRow_Any_AVX2;
|
||||
ARGBToYRow = ARGBToYRow_Any_AVX2;
|
||||
if (IS_ALIGNED(width, 32)) {
|
||||
ARGBToUVRow = ARGBToUVRow_AVX2;
|
||||
ARGBToYRow = ARGBToYRow_AVX2;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
{
|
||||
#if !defined(HAS_RGB24TOYROW_NEON)
|
||||
// Allocate 2 rows of ARGB.
|
||||
const int kRowSize = (width * 4 + 15) & ~15;
|
||||
const int kRowSize = (width * 4 + 31) & ~31;
|
||||
align_buffer_64(row, kRowSize * 2);
|
||||
#endif
|
||||
|
||||
@ -894,8 +894,8 @@ int RGB24ToI420(const uint8* src_rgb24, int src_stride_rgb24,
|
||||
}
|
||||
#if !defined(HAS_RGB24TOYROW_NEON)
|
||||
free_aligned_buffer_64(row);
|
||||
#endif
|
||||
}
|
||||
#endif
|
||||
return 0;
|
||||
}
|
||||
|
||||
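In the RGB24ToI420 hunk above, the NEON fast path now selects the Y and UV row functions under separate HAS_RGB24TOYROW_NEON / HAS_RGB24TOUVROW_NEON guards, while other targets still convert each pair of RGB24 rows through a temporary ARGB buffer. A hedged usage sketch of the public entry point, assuming the prototype in third_party/libyuv/include/libyuv/convert.h:

#include "libyuv/convert.h"
#include <vector>

// Convert a packed RGB24 frame to contiguous I420 planes.
bool Rgb24ToI420(const uint8* rgb24, int width, int height,
                 std::vector<uint8>* i420) {
  const int y_size = width * height;
  const int uv_size = ((width + 1) / 2) * ((height + 1) / 2);
  i420->resize(y_size + 2 * uv_size);
  uint8* y = &(*i420)[0];
  uint8* u = y + y_size;
  uint8* v = u + uv_size;
  return libyuv::RGB24ToI420(rgb24, width * 3,
                             y, width,
                             u, (width + 1) / 2,
                             v, (width + 1) / 2,
                             width, height) == 0;
}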
@ -931,22 +931,20 @@ int RAWToI420(const uint8* src_raw, int src_stride_raw,
|
||||
src_stride_raw = -src_stride_raw;
|
||||
}
|
||||
|
||||
// Neon version does direct RAW to YUV.
|
||||
#if defined(HAS_RAWTOYROW_NEON)
|
||||
if (TestCpuFlag(kCpuHasNEON)) {
|
||||
RAWToUVRow = RAWToUVRow_Any_NEON;
|
||||
RAWToYRow = RAWToYRow_Any_NEON;
|
||||
if (IS_ALIGNED(width, 8)) {
|
||||
RAWToYRow = RAWToYRow_NEON;
|
||||
if (IS_ALIGNED(width, 16)) {
|
||||
RAWToUVRow = RAWToUVRow_NEON;
|
||||
}
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_RAWTOUVROW_NEON)
|
||||
if (TestCpuFlag(kCpuHasNEON)) {
|
||||
RAWToUVRow = RAWToUVRow_Any_NEON;
|
||||
if (IS_ALIGNED(width, 16)) {
|
||||
RAWToUVRow = RAWToUVRow_NEON;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
// Other platforms do intermediate conversion from RAW to ARGB.
|
||||
#else
|
||||
#if defined(HAS_RAWTOARGBROW_SSSE3)
|
||||
if (TestCpuFlag(kCpuHasSSSE3)) {
|
||||
RAWToARGBRow = RAWToARGBRow_Any_SSSE3;
|
||||
@ -955,59 +953,63 @@ int RAWToI420(const uint8* src_raw, int src_stride_raw,
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_ARGBTOUVROW_SSSE3)
|
||||
#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3)
|
||||
if (TestCpuFlag(kCpuHasSSSE3)) {
|
||||
ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
|
||||
if (IS_ALIGNED(width, 16)) {
|
||||
ARGBToUVRow = ARGBToUVRow_SSSE3;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_ARGBTOUVROW_SSSE3)
|
||||
if (TestCpuFlag(kCpuHasSSSE3)) {
|
||||
ARGBToYRow = ARGBToYRow_Any_SSSE3;
|
||||
if (IS_ALIGNED(width, 16)) {
|
||||
ARGBToUVRow = ARGBToUVRow_SSSE3;
|
||||
ARGBToYRow = ARGBToYRow_SSSE3;
|
||||
}
|
||||
}
|
||||
#endif // HAS_ARGBTOUVROW_SSSE3
|
||||
|
||||
#endif
|
||||
#if defined(HAS_ARGBTOYROW_AVX2) && defined(HAS_ARGBTOUVROW_AVX2)
|
||||
if (TestCpuFlag(kCpuHasAVX2)) {
|
||||
ARGBToUVRow = ARGBToUVRow_Any_AVX2;
|
||||
ARGBToYRow = ARGBToYRow_Any_AVX2;
|
||||
if (IS_ALIGNED(width, 32)) {
|
||||
ARGBToUVRow = ARGBToUVRow_AVX2;
|
||||
ARGBToYRow = ARGBToYRow_AVX2;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
{
|
||||
// Allocate 2 rows of ARGB.
|
||||
const int kRowSize = (width * 4 + 15) & ~15;
|
||||
const int kRowSize = (width * 4 + 31) & ~31;
|
||||
align_buffer_64(row, kRowSize * 2);
|
||||
#endif
|
||||
|
||||
for (y = 0; y < height - 1; y += 2) {
|
||||
#if defined(HAS_RAWTOYROW_NEON)
|
||||
#if defined(HAS_RAWTOYROW_NEON)
|
||||
RAWToUVRow(src_raw, src_stride_raw, dst_u, dst_v, width);
|
||||
RAWToYRow(src_raw, dst_y, width);
|
||||
RAWToYRow(src_raw + src_stride_raw, dst_y + dst_stride_y, width);
|
||||
#else
|
||||
#else
|
||||
RAWToARGBRow(src_raw, row, width);
|
||||
RAWToARGBRow(src_raw + src_stride_raw, row + kRowSize, width);
|
||||
ARGBToUVRow(row, kRowSize, dst_u, dst_v, width);
|
||||
ARGBToYRow(row, dst_y, width);
|
||||
ARGBToYRow(row + kRowSize, dst_y + dst_stride_y, width);
|
||||
#endif
|
||||
#endif
|
||||
src_raw += src_stride_raw * 2;
|
||||
dst_y += dst_stride_y * 2;
|
||||
dst_u += dst_stride_u;
|
||||
dst_v += dst_stride_v;
|
||||
}
|
||||
if (height & 1) {
|
||||
#if defined(HAS_RAWTOYROW_NEON)
|
||||
#if defined(HAS_RAWTOYROW_NEON)
|
||||
RAWToUVRow(src_raw, 0, dst_u, dst_v, width);
|
||||
RAWToYRow(src_raw, dst_y, width);
|
||||
#else
|
||||
#else
|
||||
RAWToARGBRow(src_raw, row, width);
|
||||
ARGBToUVRow(row, 0, dst_u, dst_v, width);
|
||||
ARGBToYRow(row, dst_y, width);
|
||||
#endif
|
||||
#endif
|
||||
}
|
||||
#if !defined(HAS_RAWTOYROW_NEON)
|
||||
#if !defined(HAS_RAWTOYROW_NEON)
|
||||
free_aligned_buffer_64(row);
|
||||
#endif
|
||||
}
|
||||
#endif
|
||||
return 0;
|
||||
}
|
||||
|
||||
@ -1043,19 +1045,20 @@ int RGB565ToI420(const uint8* src_rgb565, int src_stride_rgb565,
|
||||
src_stride_rgb565 = -src_stride_rgb565;
|
||||
}
|
||||
|
||||
// Neon version does direct RGB565 to YUV.
|
||||
#if defined(HAS_RGB565TOYROW_NEON)
|
||||
if (TestCpuFlag(kCpuHasNEON)) {
|
||||
RGB565ToUVRow = RGB565ToUVRow_Any_NEON;
|
||||
RGB565ToYRow = RGB565ToYRow_Any_NEON;
|
||||
if (IS_ALIGNED(width, 8)) {
|
||||
RGB565ToYRow = RGB565ToYRow_NEON;
|
||||
}
|
||||
RGB565ToUVRow = RGB565ToUVRow_Any_NEON;
|
||||
if (IS_ALIGNED(width, 16)) {
|
||||
RGB565ToUVRow = RGB565ToUVRow_NEON;
|
||||
if (IS_ALIGNED(width, 16)) {
|
||||
RGB565ToUVRow = RGB565ToUVRow_NEON;
|
||||
}
|
||||
}
|
||||
}
|
||||
#else // HAS_RGB565TOYROW_NEON
|
||||
|
||||
// Other platforms do intermediate conversion from RGB565 to ARGB.
|
||||
#else
|
||||
#if defined(HAS_RGB565TOARGBROW_SSE2)
|
||||
if (TestCpuFlag(kCpuHasSSE2)) {
|
||||
RGB565ToARGBRow = RGB565ToARGBRow_Any_SSE2;
|
||||
@ -1064,28 +1067,37 @@ int RGB565ToI420(const uint8* src_rgb565, int src_stride_rgb565,
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_ARGBTOUVROW_SSSE3)
|
||||
if (TestCpuFlag(kCpuHasSSSE3)) {
|
||||
ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
|
||||
#if defined(HAS_RGB565TOARGBROW_AVX2)
|
||||
if (TestCpuFlag(kCpuHasAVX2)) {
|
||||
RGB565ToARGBRow = RGB565ToARGBRow_Any_AVX2;
|
||||
if (IS_ALIGNED(width, 16)) {
|
||||
ARGBToUVRow = ARGBToUVRow_SSSE3;
|
||||
RGB565ToARGBRow = RGB565ToARGBRow_AVX2;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_ARGBTOUVROW_SSSE3)
|
||||
#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3)
|
||||
if (TestCpuFlag(kCpuHasSSSE3)) {
|
||||
ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
|
||||
ARGBToYRow = ARGBToYRow_Any_SSSE3;
|
||||
if (IS_ALIGNED(width, 16)) {
|
||||
ARGBToUVRow = ARGBToUVRow_SSSE3;
|
||||
ARGBToYRow = ARGBToYRow_SSSE3;
|
||||
}
|
||||
}
|
||||
#endif // HAS_ARGBTOUVROW_SSSE3
|
||||
#endif // HAS_RGB565TOYROW_NEON
|
||||
|
||||
#endif
|
||||
#if defined(HAS_ARGBTOYROW_AVX2) && defined(HAS_ARGBTOUVROW_AVX2)
|
||||
if (TestCpuFlag(kCpuHasAVX2)) {
|
||||
ARGBToUVRow = ARGBToUVRow_Any_AVX2;
|
||||
ARGBToYRow = ARGBToYRow_Any_AVX2;
|
||||
if (IS_ALIGNED(width, 32)) {
|
||||
ARGBToUVRow = ARGBToUVRow_AVX2;
|
||||
ARGBToYRow = ARGBToYRow_AVX2;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
{
|
||||
#if !defined(HAS_RGB565TOYROW_NEON)
|
||||
// Allocate 2 rows of ARGB.
|
||||
const int kRowSize = (width * 4 + 15) & ~15;
|
||||
const int kRowSize = (width * 4 + 31) & ~31;
|
||||
align_buffer_64(row, kRowSize * 2);
|
||||
#endif
|
||||
|
||||
@ -1118,8 +1130,8 @@ int RGB565ToI420(const uint8* src_rgb565, int src_stride_rgb565,
|
||||
}
|
||||
#if !defined(HAS_RGB565TOYROW_NEON)
|
||||
free_aligned_buffer_64(row);
|
||||
#endif
|
||||
}
|
||||
#endif
|
||||
return 0;
|
||||
}
|
||||
|
||||
@ -1155,19 +1167,20 @@ int ARGB1555ToI420(const uint8* src_argb1555, int src_stride_argb1555,
|
||||
src_stride_argb1555 = -src_stride_argb1555;
|
||||
}
|
||||
|
||||
// Neon version does direct ARGB1555 to YUV.
|
||||
#if defined(HAS_ARGB1555TOYROW_NEON)
|
||||
if (TestCpuFlag(kCpuHasNEON)) {
|
||||
ARGB1555ToUVRow = ARGB1555ToUVRow_Any_NEON;
|
||||
ARGB1555ToYRow = ARGB1555ToYRow_Any_NEON;
|
||||
if (IS_ALIGNED(width, 8)) {
|
||||
ARGB1555ToYRow = ARGB1555ToYRow_NEON;
|
||||
}
|
||||
ARGB1555ToUVRow = ARGB1555ToUVRow_Any_NEON;
|
||||
if (IS_ALIGNED(width, 16)) {
|
||||
ARGB1555ToUVRow = ARGB1555ToUVRow_NEON;
|
||||
if (IS_ALIGNED(width, 16)) {
|
||||
ARGB1555ToUVRow = ARGB1555ToUVRow_NEON;
|
||||
}
|
||||
}
|
||||
}
|
||||
#else // HAS_ARGB1555TOYROW_NEON
|
||||
|
||||
// Other platforms do intermediate conversion from ARGB1555 to ARGB.
|
||||
#else
|
||||
#if defined(HAS_ARGB1555TOARGBROW_SSE2)
|
||||
if (TestCpuFlag(kCpuHasSSE2)) {
|
||||
ARGB1555ToARGBRow = ARGB1555ToARGBRow_Any_SSE2;
|
||||
@ -1176,30 +1189,40 @@ int ARGB1555ToI420(const uint8* src_argb1555, int src_stride_argb1555,
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_ARGBTOUVROW_SSSE3)
|
||||
if (TestCpuFlag(kCpuHasSSSE3)) {
|
||||
ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
|
||||
#if defined(HAS_ARGB1555TOARGBROW_AVX2)
|
||||
if (TestCpuFlag(kCpuHasAVX2)) {
|
||||
ARGB1555ToARGBRow = ARGB1555ToARGBRow_Any_AVX2;
|
||||
if (IS_ALIGNED(width, 16)) {
|
||||
ARGBToUVRow = ARGBToUVRow_SSSE3;
|
||||
ARGB1555ToARGBRow = ARGB1555ToARGBRow_AVX2;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_ARGBTOUVROW_SSSE3)
|
||||
#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3)
|
||||
if (TestCpuFlag(kCpuHasSSSE3)) {
|
||||
ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
|
||||
ARGBToYRow = ARGBToYRow_Any_SSSE3;
|
||||
if (IS_ALIGNED(width, 16)) {
|
||||
ARGBToUVRow = ARGBToUVRow_SSSE3;
|
||||
ARGBToYRow = ARGBToYRow_SSSE3;
|
||||
}
|
||||
}
|
||||
#endif // HAS_ARGBTOUVROW_SSSE3
|
||||
#endif // HAS_ARGB1555TOYROW_NEON
|
||||
|
||||
#endif
|
||||
#if defined(HAS_ARGBTOYROW_AVX2) && defined(HAS_ARGBTOUVROW_AVX2)
|
||||
if (TestCpuFlag(kCpuHasAVX2)) {
|
||||
ARGBToUVRow = ARGBToUVRow_Any_AVX2;
|
||||
ARGBToYRow = ARGBToYRow_Any_AVX2;
|
||||
if (IS_ALIGNED(width, 32)) {
|
||||
ARGBToUVRow = ARGBToUVRow_AVX2;
|
||||
ARGBToYRow = ARGBToYRow_AVX2;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
{
|
||||
#if !defined(HAS_ARGB1555TOYROW_NEON)
|
||||
// Allocate 2 rows of ARGB.
|
||||
const int kRowSize = (width * 4 + 15) & ~15;
|
||||
const int kRowSize = (width * 4 + 31) & ~31;
|
||||
align_buffer_64(row, kRowSize * 2);
|
||||
#endif
|
||||
|
||||
for (y = 0; y < height - 1; y += 2) {
|
||||
#if defined(HAS_ARGB1555TOYROW_NEON)
|
||||
ARGB1555ToUVRow(src_argb1555, src_stride_argb1555, dst_u, dst_v, width);
|
||||
@ -1230,9 +1253,9 @@ int ARGB1555ToI420(const uint8* src_argb1555, int src_stride_argb1555,
|
||||
#endif
|
||||
}
|
||||
#if !defined(HAS_ARGB1555TOYROW_NEON)
|
||||
free_aligned_buffer_64(row);
|
||||
#endif
|
||||
free_aligned_buffer_64(row);
|
||||
}
|
||||
#endif
|
||||
return 0;
|
||||
}
|
||||
|
||||
@ -1268,19 +1291,20 @@ int ARGB4444ToI420(const uint8* src_argb4444, int src_stride_argb4444,
|
||||
src_stride_argb4444 = -src_stride_argb4444;
|
||||
}
|
||||
|
||||
// Neon version does direct ARGB4444 to YUV.
|
||||
#if defined(HAS_ARGB4444TOYROW_NEON)
|
||||
if (TestCpuFlag(kCpuHasNEON)) {
|
||||
ARGB4444ToUVRow = ARGB4444ToUVRow_Any_NEON;
|
||||
ARGB4444ToYRow = ARGB4444ToYRow_Any_NEON;
|
||||
if (IS_ALIGNED(width, 8)) {
|
||||
ARGB4444ToYRow = ARGB4444ToYRow_NEON;
|
||||
}
|
||||
ARGB4444ToUVRow = ARGB4444ToUVRow_Any_NEON;
|
||||
if (IS_ALIGNED(width, 16)) {
|
||||
ARGB4444ToUVRow = ARGB4444ToUVRow_NEON;
|
||||
if (IS_ALIGNED(width, 16)) {
|
||||
ARGB4444ToUVRow = ARGB4444ToUVRow_NEON;
|
||||
}
|
||||
}
|
||||
}
|
||||
#else // HAS_ARGB4444TOYROW_NEON
|
||||
|
||||
// Other platforms do intermediate conversion from ARGB4444 to ARGB.
|
||||
#else
|
||||
#if defined(HAS_ARGB4444TOARGBROW_SSE2)
|
||||
if (TestCpuFlag(kCpuHasSSE2)) {
|
||||
ARGB4444ToARGBRow = ARGB4444ToARGBRow_Any_SSE2;
|
||||
@ -1289,28 +1313,37 @@ int ARGB4444ToI420(const uint8* src_argb4444, int src_stride_argb4444,
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_ARGBTOUVROW_SSSE3)
|
||||
if (TestCpuFlag(kCpuHasSSSE3)) {
|
||||
ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
|
||||
#if defined(HAS_ARGB4444TOARGBROW_AVX2)
|
||||
if (TestCpuFlag(kCpuHasAVX2)) {
|
||||
ARGB4444ToARGBRow = ARGB4444ToARGBRow_Any_AVX2;
|
||||
if (IS_ALIGNED(width, 16)) {
|
||||
ARGBToUVRow = ARGBToUVRow_SSSE3;
|
||||
ARGB4444ToARGBRow = ARGB4444ToARGBRow_AVX2;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_ARGBTOUVROW_SSSE3)
|
||||
#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3)
|
||||
if (TestCpuFlag(kCpuHasSSSE3)) {
|
||||
ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
|
||||
ARGBToYRow = ARGBToYRow_Any_SSSE3;
|
||||
if (IS_ALIGNED(width, 16)) {
|
||||
ARGBToUVRow = ARGBToUVRow_SSSE3;
|
||||
ARGBToYRow = ARGBToYRow_SSSE3;
|
||||
}
|
||||
}
|
||||
#endif // HAS_ARGBTOUVROW_SSSE3
|
||||
#endif // HAS_ARGB4444TOYROW_NEON
|
||||
|
||||
#endif
|
||||
#if defined(HAS_ARGBTOYROW_AVX2) && defined(HAS_ARGBTOUVROW_AVX2)
|
||||
if (TestCpuFlag(kCpuHasAVX2)) {
|
||||
ARGBToUVRow = ARGBToUVRow_Any_AVX2;
|
||||
ARGBToYRow = ARGBToYRow_Any_AVX2;
|
||||
if (IS_ALIGNED(width, 32)) {
|
||||
ARGBToUVRow = ARGBToUVRow_AVX2;
|
||||
ARGBToYRow = ARGBToYRow_AVX2;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
{
|
||||
#if !defined(HAS_ARGB4444TOYROW_NEON)
|
||||
// Allocate 2 rows of ARGB.
|
||||
const int kRowSize = (width * 4 + 15) & ~15;
|
||||
const int kRowSize = (width * 4 + 31) & ~31;
|
||||
align_buffer_64(row, kRowSize * 2);
|
||||
#endif
|
||||
|
||||
@ -1345,8 +1378,8 @@ int ARGB4444ToI420(const uint8* src_argb4444, int src_stride_argb4444,
|
||||
}
|
||||
#if !defined(HAS_ARGB4444TOYROW_NEON)
|
||||
free_aligned_buffer_64(row);
|
||||
#endif
|
||||
}
|
||||
#endif
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
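Across this file the temporary ARGB row is now rounded up to a multiple of 32 bytes instead of 16, so the scratch buffer stays wide enough for the 32-pixel AVX2 row functions that can now be selected. A small sketch of the rounding used by the kRowSize expressions above (illustrative only):

// Round a byte count up to a multiple of 32, matching (width * 4 + 31) & ~31.
static int RoundUpTo32(int bytes) {
  return (bytes + 31) & ~31;
}
// Example: width 100 -> (100 * 4 + 31) & ~31 == 416 bytes (13 * 32).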
100
third_party/libyuv/source/convert_argb.cc
vendored
100
third_party/libyuv/source/convert_argb.cc
vendored
@ -85,6 +85,14 @@ int I444ToARGB(const uint8* src_y, int src_stride_y,
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_I444TOARGBROW_AVX2)
|
||||
if (TestCpuFlag(kCpuHasAVX2)) {
|
||||
I444ToARGBRow = I444ToARGBRow_Any_AVX2;
|
||||
if (IS_ALIGNED(width, 16)) {
|
||||
I444ToARGBRow = I444ToARGBRow_AVX2;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_I444TOARGBROW_NEON)
|
||||
if (TestCpuFlag(kCpuHasNEON)) {
|
||||
I444ToARGBRow = I444ToARGBRow_Any_NEON;
|
||||
@ -222,6 +230,14 @@ int I411ToARGB(const uint8* src_y, int src_stride_y,
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_I411TOARGBROW_AVX2)
|
||||
if (TestCpuFlag(kCpuHasAVX2)) {
|
||||
I411ToARGBRow = I411ToARGBRow_Any_AVX2;
|
||||
if (IS_ALIGNED(width, 16)) {
|
||||
I411ToARGBRow = I411ToARGBRow_AVX2;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_I411TOARGBROW_NEON)
|
||||
if (TestCpuFlag(kCpuHasNEON)) {
|
||||
I411ToARGBRow = I411ToARGBRow_Any_NEON;
|
||||
@ -243,13 +259,13 @@ int I411ToARGB(const uint8* src_y, int src_stride_y,
|
||||
|
||||
// Convert I400 to ARGB.
|
||||
LIBYUV_API
|
||||
int I400ToARGB_Reference(const uint8* src_y, int src_stride_y,
|
||||
uint8* dst_argb, int dst_stride_argb,
|
||||
int width, int height) {
|
||||
int I400ToARGB(const uint8* src_y, int src_stride_y,
|
||||
uint8* dst_argb, int dst_stride_argb,
|
||||
int width, int height) {
|
||||
int y;
|
||||
void (*YToARGBRow)(const uint8* y_buf,
|
||||
void (*I400ToARGBRow)(const uint8* y_buf,
|
||||
uint8* rgb_buf,
|
||||
int width) = YToARGBRow_C;
|
||||
int width) = I400ToARGBRow_C;
|
||||
if (!src_y || !dst_argb ||
|
||||
width <= 0 || height == 0) {
|
||||
return -1;
|
||||
@ -267,47 +283,47 @@ int I400ToARGB_Reference(const uint8* src_y, int src_stride_y,
|
||||
height = 1;
|
||||
src_stride_y = dst_stride_argb = 0;
|
||||
}
|
||||
#if defined(HAS_YTOARGBROW_SSE2)
|
||||
#if defined(HAS_I400TOARGBROW_SSE2)
|
||||
if (TestCpuFlag(kCpuHasSSE2)) {
|
||||
YToARGBRow = YToARGBRow_Any_SSE2;
|
||||
I400ToARGBRow = I400ToARGBRow_Any_SSE2;
|
||||
if (IS_ALIGNED(width, 8)) {
|
||||
YToARGBRow = YToARGBRow_SSE2;
|
||||
I400ToARGBRow = I400ToARGBRow_SSE2;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_YTOARGBROW_AVX2)
|
||||
#if defined(HAS_I400TOARGBROW_AVX2)
|
||||
if (TestCpuFlag(kCpuHasAVX2)) {
|
||||
YToARGBRow = YToARGBRow_Any_AVX2;
|
||||
I400ToARGBRow = I400ToARGBRow_Any_AVX2;
|
||||
if (IS_ALIGNED(width, 16)) {
|
||||
YToARGBRow = YToARGBRow_AVX2;
|
||||
I400ToARGBRow = I400ToARGBRow_AVX2;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_YTOARGBROW_NEON)
|
||||
#if defined(HAS_I400TOARGBROW_NEON)
|
||||
if (TestCpuFlag(kCpuHasNEON)) {
|
||||
YToARGBRow = YToARGBRow_Any_NEON;
|
||||
I400ToARGBRow = I400ToARGBRow_Any_NEON;
|
||||
if (IS_ALIGNED(width, 8)) {
|
||||
YToARGBRow = YToARGBRow_NEON;
|
||||
I400ToARGBRow = I400ToARGBRow_NEON;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
for (y = 0; y < height; ++y) {
|
||||
YToARGBRow(src_y, dst_argb, width);
|
||||
I400ToARGBRow(src_y, dst_argb, width);
|
||||
dst_argb += dst_stride_argb;
|
||||
src_y += src_stride_y;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
// Convert I400 to ARGB.
|
||||
// Convert J400 to ARGB.
|
||||
LIBYUV_API
|
||||
int I400ToARGB(const uint8* src_y, int src_stride_y,
|
||||
int J400ToARGB(const uint8* src_y, int src_stride_y,
|
||||
uint8* dst_argb, int dst_stride_argb,
|
||||
int width, int height) {
|
||||
int y;
|
||||
void (*I400ToARGBRow)(const uint8* src_y, uint8* dst_argb, int pix) =
|
||||
I400ToARGBRow_C;
|
||||
void (*J400ToARGBRow)(const uint8* src_y, uint8* dst_argb, int pix) =
|
||||
J400ToARGBRow_C;
|
||||
if (!src_y || !dst_argb ||
|
||||
width <= 0 || height == 0) {
|
||||
return -1;
|
||||
@ -325,24 +341,32 @@ int I400ToARGB(const uint8* src_y, int src_stride_y,
|
||||
height = 1;
|
||||
src_stride_y = dst_stride_argb = 0;
|
||||
}
|
||||
#if defined(HAS_I400TOARGBROW_SSE2)
|
||||
#if defined(HAS_J400TOARGBROW_SSE2)
|
||||
if (TestCpuFlag(kCpuHasSSE2)) {
|
||||
I400ToARGBRow = I400ToARGBRow_Any_SSE2;
|
||||
J400ToARGBRow = J400ToARGBRow_Any_SSE2;
|
||||
if (IS_ALIGNED(width, 8)) {
|
||||
I400ToARGBRow = I400ToARGBRow_SSE2;
|
||||
J400ToARGBRow = J400ToARGBRow_SSE2;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_I400TOARGBROW_NEON)
|
||||
#if defined(HAS_J400TOARGBROW_AVX2)
|
||||
if (TestCpuFlag(kCpuHasAVX2)) {
|
||||
J400ToARGBRow = J400ToARGBRow_Any_AVX2;
|
||||
if (IS_ALIGNED(width, 16)) {
|
||||
J400ToARGBRow = J400ToARGBRow_AVX2;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_J400TOARGBROW_NEON)
|
||||
if (TestCpuFlag(kCpuHasNEON)) {
|
||||
I400ToARGBRow = I400ToARGBRow_Any_NEON;
|
||||
J400ToARGBRow = J400ToARGBRow_Any_NEON;
|
||||
if (IS_ALIGNED(width, 8)) {
|
||||
I400ToARGBRow = I400ToARGBRow_NEON;
|
||||
J400ToARGBRow = J400ToARGBRow_NEON;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
for (y = 0; y < height; ++y) {
|
||||
I400ToARGBRow(src_y, dst_argb, width);
|
||||
J400ToARGBRow(src_y, dst_argb, width);
|
||||
src_y += src_stride_y;
|
||||
dst_argb += dst_stride_argb;
|
||||
}
|
||||
@ -552,6 +576,14 @@ int RGB565ToARGB(const uint8* src_rgb565, int src_stride_rgb565,
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_RGB565TOARGBROW_AVX2)
|
||||
if (TestCpuFlag(kCpuHasAVX2)) {
|
||||
RGB565ToARGBRow = RGB565ToARGBRow_Any_AVX2;
|
||||
if (IS_ALIGNED(width, 16)) {
|
||||
RGB565ToARGBRow = RGB565ToARGBRow_AVX2;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_RGB565TOARGBROW_NEON)
|
||||
if (TestCpuFlag(kCpuHasNEON)) {
|
||||
RGB565ToARGBRow = RGB565ToARGBRow_Any_NEON;
|
||||
@ -602,6 +634,14 @@ int ARGB1555ToARGB(const uint8* src_argb1555, int src_stride_argb1555,
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_ARGB1555TOARGBROW_AVX2)
|
||||
if (TestCpuFlag(kCpuHasAVX2)) {
|
||||
ARGB1555ToARGBRow = ARGB1555ToARGBRow_Any_AVX2;
|
||||
if (IS_ALIGNED(width, 16)) {
|
||||
ARGB1555ToARGBRow = ARGB1555ToARGBRow_AVX2;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_ARGB1555TOARGBROW_NEON)
|
||||
if (TestCpuFlag(kCpuHasNEON)) {
|
||||
ARGB1555ToARGBRow = ARGB1555ToARGBRow_Any_NEON;
|
||||
@ -652,6 +692,14 @@ int ARGB4444ToARGB(const uint8* src_argb4444, int src_stride_argb4444,
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_ARGB4444TOARGBROW_AVX2)
|
||||
if (TestCpuFlag(kCpuHasAVX2)) {
|
||||
ARGB4444ToARGBRow = ARGB4444ToARGBRow_Any_AVX2;
|
||||
if (IS_ALIGNED(width, 16)) {
|
||||
ARGB4444ToARGBRow = ARGB4444ToARGBRow_AVX2;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_ARGB4444TOARGBROW_NEON)
|
||||
if (TestCpuFlag(kCpuHasNEON)) {
|
||||
ARGB4444ToARGBRow = ARGB4444ToARGBRow_Any_NEON;
|
||||
|
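The convert_argb.cc changes above rename the reference grey converter to I400ToARGB and the old full-range grey converter to J400ToARGB, mirroring the I400/J400 row function rename. A hedged usage sketch of the new name, using the signature shown in this diff:

#include "libyuv/convert_argb.h"
#include <vector>

// Expand an 8-bit grey (J400) plane to 32-bit ARGB.
void GreyToArgb(const uint8* grey, int width, int height,
                std::vector<uint8>* argb) {
  argb->resize(width * height * 4);
  // Assumption: J400 is treated as full-range grey, so each output pixel
  // becomes B = G = R = grey with opaque alpha.
  libyuv::J400ToARGB(grey, width, &(*argb)[0], width * 4, width, height);
}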
127
third_party/libyuv/source/convert_from.cc
vendored
127
third_party/libyuv/source/convert_from.cc
vendored
@ -739,6 +739,14 @@ int I420ToRGB24(const uint8* src_y, int src_stride_y,
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_I422TORGB24ROW_AVX2)
|
||||
if (TestCpuFlag(kCpuHasAVX2)) {
|
||||
I422ToRGB24Row = I422ToRGB24Row_Any_AVX2;
|
||||
if (IS_ALIGNED(width, 16)) {
|
||||
I422ToRGB24Row = I422ToRGB24Row_AVX2;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_I422TORGB24ROW_NEON)
|
||||
if (TestCpuFlag(kCpuHasNEON)) {
|
||||
I422ToRGB24Row = I422ToRGB24Row_Any_NEON;
|
||||
@ -791,6 +799,14 @@ int I420ToRAW(const uint8* src_y, int src_stride_y,
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_I422TORAWROW_AVX2)
|
||||
if (TestCpuFlag(kCpuHasAVX2)) {
|
||||
I422ToRAWRow = I422ToRAWRow_Any_AVX2;
|
||||
if (IS_ALIGNED(width, 16)) {
|
||||
I422ToRAWRow = I422ToRAWRow_AVX2;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_I422TORAWROW_NEON)
|
||||
if (TestCpuFlag(kCpuHasNEON)) {
|
||||
I422ToRAWRow = I422ToRAWRow_Any_NEON;
|
||||
@ -993,6 +1009,117 @@ int I420ToRGB565(const uint8* src_y, int src_stride_y,
|
||||
return 0;
|
||||
}
|
||||
|
||||
// Ordered 8x8 dither for 888 to 565. Values from 0 to 7.
|
||||
static const uint8 kDither565_4x4[16] = {
|
||||
0, 4, 1, 5,
|
||||
6, 2, 7, 3,
|
||||
1, 5, 0, 4,
|
||||
7, 3, 6, 2,
|
||||
};
|
||||
|
||||
// Convert I420 to RGB565 with dithering.
|
||||
LIBYUV_API
|
||||
int I420ToRGB565Dither(const uint8* src_y, int src_stride_y,
|
||||
const uint8* src_u, int src_stride_u,
|
||||
const uint8* src_v, int src_stride_v,
|
||||
uint8* dst_rgb565, int dst_stride_rgb565,
|
||||
const uint8* dither4x4, int width, int height) {
|
||||
int y;
|
||||
void (*I422ToARGBRow)(const uint8* y_buf,
|
||||
const uint8* u_buf,
|
||||
const uint8* v_buf,
|
||||
uint8* rgb_buf,
|
||||
int width) = I422ToARGBRow_C;
|
||||
void (*ARGBToRGB565DitherRow)(const uint8* src_argb, uint8* dst_rgb,
|
||||
const uint32 dither4, int pix) = ARGBToRGB565DitherRow_C;
|
||||
if (!src_y || !src_u || !src_v || !dst_rgb565 ||
|
||||
width <= 0 || height == 0) {
|
||||
return -1;
|
||||
}
|
||||
// Negative height means invert the image.
|
||||
if (height < 0) {
|
||||
height = -height;
|
||||
dst_rgb565 = dst_rgb565 + (height - 1) * dst_stride_rgb565;
|
||||
dst_stride_rgb565 = -dst_stride_rgb565;
|
||||
}
|
||||
if (!dither4x4) {
|
||||
dither4x4 = kDither565_4x4;
|
||||
}
|
||||
#if defined(HAS_I422TOARGBROW_SSSE3)
|
||||
if (TestCpuFlag(kCpuHasSSSE3)) {
|
||||
I422ToARGBRow = I422ToARGBRow_Any_SSSE3;
|
||||
if (IS_ALIGNED(width, 8)) {
|
||||
I422ToARGBRow = I422ToARGBRow_SSSE3;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_I422TOARGBROW_AVX2)
|
||||
if (TestCpuFlag(kCpuHasAVX2)) {
|
||||
I422ToARGBRow = I422ToARGBRow_Any_AVX2;
|
||||
if (IS_ALIGNED(width, 16)) {
|
||||
I422ToARGBRow = I422ToARGBRow_AVX2;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_I422TOARGBROW_NEON)
|
||||
if (TestCpuFlag(kCpuHasNEON)) {
|
||||
I422ToARGBRow = I422ToARGBRow_Any_NEON;
|
||||
if (IS_ALIGNED(width, 8)) {
|
||||
I422ToARGBRow = I422ToARGBRow_NEON;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_I422TOARGBROW_MIPS_DSPR2)
|
||||
if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(width, 4) &&
|
||||
IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) &&
|
||||
IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) &&
|
||||
IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2)) {
|
||||
I422ToARGBRow = I422ToARGBRow_MIPS_DSPR2;
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_ARGBTORGB565DITHERROW_SSE2)
|
||||
if (TestCpuFlag(kCpuHasSSE2)) {
|
||||
ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_SSE2;
|
||||
if (IS_ALIGNED(width, 4)) {
|
||||
ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_SSE2;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_ARGBTORGB565DITHERROW_AVX2)
|
||||
if (TestCpuFlag(kCpuHasAVX2)) {
|
||||
ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_AVX2;
|
||||
if (IS_ALIGNED(width, 8)) {
|
||||
ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_AVX2;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_ARGBTORGB565DITHERROW_NEON)
|
||||
if (TestCpuFlag(kCpuHasNEON)) {
|
||||
ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_NEON;
|
||||
if (IS_ALIGNED(width, 8)) {
|
||||
ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_NEON;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
{
|
||||
// Allocate a row of argb.
|
||||
align_buffer_64(row_argb, width * 4);
|
||||
for (y = 0; y < height; ++y) {
|
||||
I422ToARGBRow(src_y, src_u, src_v, row_argb, width);
|
||||
ARGBToRGB565DitherRow(row_argb, dst_rgb565,
|
||||
*(uint32*)(dither4x4 + ((y & 3) << 2)), width);
|
||||
dst_rgb565 += dst_stride_rgb565;
|
||||
src_y += src_stride_y;
|
||||
if (y & 1) {
|
||||
src_u += src_stride_u;
|
||||
src_v += src_stride_v;
|
||||
}
|
||||
}
|
||||
free_aligned_buffer_64(row_argb);
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
// Convert I420 to specified format
|
||||
LIBYUV_API
|
||||
int ConvertFromI420(const uint8* y, int y_stride,
|
||||
|
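The new I420ToRGB565Dither above converts one row at a time through a temporary ARGB buffer and applies a 4x4 ordered dither; passing NULL for dither4x4 falls back to the built-in kDither565_4x4 table. A hedged usage sketch with the signature shown in this diff, assuming the prototype lives in libyuv/convert_from.h:

#include "libyuv/convert_from.h"
#include <vector>

void I420ToRgb565WithDither(const uint8* y, int y_stride,
                            const uint8* u, int u_stride,
                            const uint8* v, int v_stride,
                            int width, int height,
                            std::vector<uint8>* rgb565) {
  rgb565->resize(width * height * 2);
  libyuv::I420ToRGB565Dither(y, y_stride, u, u_stride, v, v_stride,
                             &(*rgb565)[0], width * 2,
                             NULL,  // NULL selects the default 4x4 table.
                             width, height);
}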
127
third_party/libyuv/source/convert_from_argb.cc
vendored
127
third_party/libyuv/source/convert_from_argb.cc
vendored
@ -72,7 +72,14 @@ int ARGBToI444(const uint8* src_argb, int src_stride_argb,
|
||||
ARGBToYRow = ARGBToYRow_SSSE3;
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
||||
#if defined(HAS_ARGBTOYROW_AVX2)
|
||||
if (TestCpuFlag(kCpuHasAVX2)) {
|
||||
ARGBToYRow = ARGBToYRow_Any_AVX2;
|
||||
if (IS_ALIGNED(width, 32)) {
|
||||
ARGBToYRow = ARGBToYRow_AVX2;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_ARGBTOYROW_NEON)
|
||||
if (TestCpuFlag(kCpuHasNEON)) {
|
||||
@ -139,7 +146,6 @@ int ARGBToI422(const uint8* src_argb, int src_stride_argb,
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
#if defined(HAS_ARGBTOYROW_SSSE3)
|
||||
if (TestCpuFlag(kCpuHasSSSE3)) {
|
||||
ARGBToYRow = ARGBToYRow_Any_SSSE3;
|
||||
@ -148,6 +154,14 @@ int ARGBToI422(const uint8* src_argb, int src_stride_argb,
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_ARGBTOYROW_AVX2)
|
||||
if (TestCpuFlag(kCpuHasAVX2)) {
|
||||
ARGBToYRow = ARGBToYRow_Any_AVX2;
|
||||
if (IS_ALIGNED(width, 32)) {
|
||||
ARGBToYRow = ARGBToYRow_AVX2;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_ARGBTOYROW_NEON)
|
||||
if (TestCpuFlag(kCpuHasNEON)) {
|
||||
ARGBToYRow = ARGBToYRow_Any_NEON;
|
||||
@ -275,6 +289,16 @@ int ARGBToNV12(const uint8* src_argb, int src_stride_argb,
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_ARGBTOYROW_AVX2) && defined(HAS_ARGBTOUVROW_AVX2)
|
||||
if (TestCpuFlag(kCpuHasAVX2)) {
|
||||
ARGBToUVRow = ARGBToUVRow_Any_AVX2;
|
||||
ARGBToYRow = ARGBToYRow_Any_AVX2;
|
||||
if (IS_ALIGNED(width, 32)) {
|
||||
ARGBToUVRow = ARGBToUVRow_AVX2;
|
||||
ARGBToYRow = ARGBToYRow_AVX2;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_ARGBTOYROW_NEON)
|
||||
if (TestCpuFlag(kCpuHasNEON)) {
|
||||
ARGBToYRow = ARGBToYRow_Any_NEON;
|
||||
@ -317,8 +341,8 @@ int ARGBToNV12(const uint8* src_argb, int src_stride_argb,
|
||||
#endif
|
||||
{
|
||||
// Allocate a rows of uv.
|
||||
align_buffer_64(row_u, ((halfwidth + 15) & ~15) * 2);
|
||||
uint8* row_v = row_u + ((halfwidth + 15) & ~15);
|
||||
align_buffer_64(row_u, ((halfwidth + 31) & ~31) * 2);
|
||||
uint8* row_v = row_u + ((halfwidth + 31) & ~31);
|
||||
|
||||
for (y = 0; y < height - 1; y += 2) {
|
||||
ARGBToUVRow(src_argb, src_stride_argb, row_u, row_v, width);
|
||||
@ -374,6 +398,16 @@ int ARGBToNV21(const uint8* src_argb, int src_stride_argb,
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_ARGBTOYROW_AVX2) && defined(HAS_ARGBTOUVROW_AVX2)
|
||||
if (TestCpuFlag(kCpuHasAVX2)) {
|
||||
ARGBToUVRow = ARGBToUVRow_Any_AVX2;
|
||||
ARGBToYRow = ARGBToYRow_Any_AVX2;
|
||||
if (IS_ALIGNED(width, 32)) {
|
||||
ARGBToUVRow = ARGBToUVRow_AVX2;
|
||||
ARGBToYRow = ARGBToYRow_AVX2;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_ARGBTOYROW_NEON)
|
||||
if (TestCpuFlag(kCpuHasNEON)) {
|
||||
ARGBToYRow = ARGBToYRow_Any_NEON;
|
||||
@ -416,8 +450,8 @@ int ARGBToNV21(const uint8* src_argb, int src_stride_argb,
|
||||
#endif
|
||||
{
|
||||
// Allocate a rows of uv.
|
||||
align_buffer_64(row_u, ((halfwidth + 15) & ~15) * 2);
|
||||
uint8* row_v = row_u + ((halfwidth + 15) & ~15);
|
||||
align_buffer_64(row_u, ((halfwidth + 31) & ~31) * 2);
|
||||
uint8* row_v = row_u + ((halfwidth + 31) & ~31);
|
||||
|
||||
for (y = 0; y < height - 1; y += 2) {
|
||||
ARGBToUVRow(src_argb, src_stride_argb, row_u, row_v, width);
|
||||
@ -492,6 +526,14 @@ int ARGBToYUY2(const uint8* src_argb, int src_stride_argb,
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_ARGBTOYROW_AVX2)
|
||||
if (TestCpuFlag(kCpuHasAVX2)) {
|
||||
ARGBToYRow = ARGBToYRow_Any_AVX2;
|
||||
if (IS_ALIGNED(width, 32)) {
|
||||
ARGBToYRow = ARGBToYRow_AVX2;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_ARGBTOYROW_NEON)
|
||||
if (TestCpuFlag(kCpuHasNEON)) {
|
||||
ARGBToYRow = ARGBToYRow_Any_NEON;
|
||||
@ -591,6 +633,14 @@ int ARGBToUYVY(const uint8* src_argb, int src_stride_argb,
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_ARGBTOYROW_AVX2)
|
||||
if (TestCpuFlag(kCpuHasAVX2)) {
|
||||
ARGBToYRow = ARGBToYRow_Any_AVX2;
|
||||
if (IS_ALIGNED(width, 32)) {
|
||||
ARGBToYRow = ARGBToYRow_AVX2;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_ARGBTOYROW_NEON)
|
||||
if (TestCpuFlag(kCpuHasNEON)) {
|
||||
ARGBToYRow = ARGBToYRow_Any_NEON;
|
||||
@ -804,25 +854,22 @@ int ARGBToRAW(const uint8* src_argb, int src_stride_argb,
|
||||
return 0;
|
||||
}
|
||||
|
||||
static const uint8 kDither8x8[64] = {
|
||||
0, 128, 32, 160, 8, 136, 40, 168,
|
||||
192, 64, 224, 96, 200, 72, 232, 104,
|
||||
48, 176, 16, 144, 56, 184, 24, 152,
|
||||
240, 112, 208, 80, 248, 120, 216, 88,
|
||||
12, 140, 44, 172, 4, 132, 36, 164,
|
||||
204, 76, 236, 108, 196, 68, 228, 100,
|
||||
60, 188, 28, 156, 52, 180, 20, 148,
|
||||
252, 124, 220, 92, 244, 116, 212, 84,
|
||||
// Ordered 8x8 dither for 888 to 565. Values from 0 to 7.
|
||||
static const uint8 kDither565_4x4[16] = {
|
||||
0, 4, 1, 5,
|
||||
6, 2, 7, 3,
|
||||
1, 5, 0, 4,
|
||||
7, 3, 6, 2,
|
||||
};
|
||||
|
||||
// Convert ARGB To RGB565 with 8x8 dither matrix (64 bytes).
|
||||
// Convert ARGB To RGB565 with 4x4 dither matrix (16 bytes).
|
||||
LIBYUV_API
|
||||
int ARGBToRGB565Dither(const uint8* src_argb, int src_stride_argb,
|
||||
uint8* dst_rgb565, int dst_stride_rgb565,
|
||||
const uint8* dither8x8, int width, int height) {
|
||||
const uint8* dither4x4, int width, int height) {
|
||||
int y;
|
||||
void (*ARGBToRGB565DitherRow)(const uint8* src_argb, uint8* dst_rgb,
|
||||
const uint8* dither8x8, int pix) = ARGBToRGB565DitherRow_C;
|
||||
const uint32 dither4, int pix) = ARGBToRGB565DitherRow_C;
|
||||
if (!src_argb || !dst_rgb565 || width <= 0 || height == 0) {
|
||||
return -1;
|
||||
}
|
||||
@ -831,13 +878,36 @@ int ARGBToRGB565Dither(const uint8* src_argb, int src_stride_argb,
|
||||
src_argb = src_argb + (height - 1) * src_stride_argb;
|
||||
src_stride_argb = -src_stride_argb;
|
||||
}
|
||||
if (!dither8x8) {
|
||||
dither8x8 = kDither8x8;
|
||||
|
||||
if (!dither4x4) {
|
||||
dither4x4 = kDither565_4x4;
|
||||
}
|
||||
#if defined(HAS_ARGBTORGB565DITHERROW_SSE2)
|
||||
if (TestCpuFlag(kCpuHasSSE2)) {
|
||||
ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_SSE2;
|
||||
if (IS_ALIGNED(width, 4)) {
|
||||
ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_SSE2;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_ARGBTORGB565DITHERROW_AVX2)
|
||||
if (TestCpuFlag(kCpuHasAVX2)) {
|
||||
ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_AVX2;
|
||||
if (IS_ALIGNED(width, 8)) {
|
||||
ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_AVX2;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_ARGBTORGB565DITHERROW_NEON)
|
||||
if (TestCpuFlag(kCpuHasNEON)) {
|
||||
ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_NEON;
|
||||
if (IS_ALIGNED(width, 8)) {
|
||||
ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_NEON;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
for (y = 0; y < height; ++y) {
|
||||
ARGBToRGB565DitherRow(src_argb, dst_rgb565,
|
||||
dither8x8 + ((y & 7) << 3), width);
|
||||
*(uint32*)(dither4x4 + ((y & 3) << 2)), width);
|
||||
src_argb += src_stride_argb;
|
||||
dst_rgb565 += dst_stride_rgb565;
|
||||
}
|
||||
@ -845,6 +915,7 @@ int ARGBToRGB565Dither(const uint8* src_argb, int src_stride_argb,
|
||||
}
|
||||
|
||||
// Convert ARGB To RGB565.
|
||||
// TODO(fbarchard): Consider using dither function low level with zeros.
|
||||
LIBYUV_API
|
||||
int ARGBToRGB565(const uint8* src_argb, int src_stride_argb,
|
||||
uint8* dst_rgb565, int dst_stride_rgb565,
|
||||
@ -1021,7 +1092,7 @@ int ARGBToJ420(const uint8* src_argb, int src_stride_argb,
|
||||
int width, int height) {
|
||||
int y;
|
||||
void (*ARGBToUVJRow)(const uint8* src_argb0, int src_stride_argb,
|
||||
uint8* dst_u, uint8* dst_v, int width) = ARGBToUVJRow_C;
|
||||
uint8* dst_u, uint8* dst_v, int width) = ARGBToUVJRow_C;
|
||||
void (*ARGBToYJRow)(const uint8* src_argb, uint8* dst_yj, int pix) =
|
||||
ARGBToYJRow_C;
|
||||
if (!src_argb ||
|
||||
@ -1045,7 +1116,7 @@ int ARGBToJ420(const uint8* src_argb, int src_stride_argb,
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_ARGBTOYJROW_AVX2) && defined(HAS_ARGBTOUVJROW_AVX2)
|
||||
#if defined(HAS_ARGBTOYJROW_AVX2)
|
||||
if (TestCpuFlag(kCpuHasAVX2)) {
|
||||
ARGBToYJRow = ARGBToYJRow_Any_AVX2;
|
||||
if (IS_ALIGNED(width, 32)) {
|
||||
@ -1140,6 +1211,14 @@ int ARGBToJ422(const uint8* src_argb, int src_stride_argb,
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_ARGBTOYJROW_AVX2)
|
||||
if (TestCpuFlag(kCpuHasAVX2)) {
|
||||
ARGBToYJRow = ARGBToYJRow_Any_AVX2;
|
||||
if (IS_ALIGNED(width, 32)) {
|
||||
ARGBToYJRow = ARGBToYJRow_AVX2;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_ARGBTOYJROW_NEON)
|
||||
if (TestCpuFlag(kCpuHasNEON)) {
|
||||
ARGBToYJRow = ARGBToYJRow_Any_NEON;
|
||||
|
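ARGBToRGB565Dither above switches from a 64-byte 8x8 dither table to a 16-byte 4x4 table with values 0..7, and the row functions now receive one table row packed into a uint32. A hedged usage sketch with the new signature, assuming it is declared in libyuv/convert_from_argb.h:

#include "libyuv/convert_from_argb.h"

// 4x4 ordered dither, values 0..7, same layout as kDither565_4x4 above.
static const uint8 kDither4x4[16] = {
  0, 4, 1, 5,
  6, 2, 7, 3,
  1, 5, 0, 4,
  7, 3, 6, 2,
};

int ArgbToRgb565Dithered(const uint8* argb, int width, int height,
                         uint8* dst_rgb565) {
  return libyuv::ARGBToRGB565Dither(argb, width * 4,
                                    dst_rgb565, width * 2,
                                    kDither4x4, width, height);
}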
75
third_party/libyuv/source/cpu_id.cc
vendored
75
third_party/libyuv/source/cpu_id.cc
vendored
@ -10,13 +10,12 @@

#include "libyuv/cpu_id.h"

#if defined(_MSC_VER) && !defined(__clang__)
#if (defined(_MSC_VER) && !defined(__clang__)) && !defined(__clang__)
#include <intrin.h> // For __cpuidex()
#endif
#if !defined(__pnacl__) && !defined(__CLR_VER) && \
!defined(__native_client__) && \
defined(_MSC_VER) && (_MSC_FULL_VER >= 160040219) && \
(defined(_M_IX86) || defined(_M_X64))
!defined(__native_client__) && (defined(_M_IX86) || defined(_M_X64)) && \
defined(_MSC_VER) && !defined(__clang__) && (_MSC_FULL_VER >= 160040219)
#include <immintrin.h> // For _xgetbv()
#endif

@ -37,23 +36,23 @@ extern "C" {

// For functions that use the stack and have runtime checks for overflow,
// use SAFEBUFFERS to avoid additional check.
#if defined(_MSC_VER) && (_MSC_FULL_VER >= 160040219)
#if (defined(_MSC_VER) && !defined(__clang__)) && (_MSC_FULL_VER >= 160040219)
#define SAFEBUFFERS __declspec(safebuffers)
#else
#define SAFEBUFFERS
#endif

// Low level cpuid for X86. Returns zeros on other CPUs.
#if !defined(__pnacl__) && !defined(__CLR_VER) && \
(defined(_M_IX86) || defined(_M_X64) || \
defined(__i386__) || defined(__x86_64__))
// Low level cpuid for X86.
#if (defined(_M_IX86) || defined(_M_X64) || \
defined(__i386__) || defined(__x86_64__)) && \
!defined(__pnacl__) && !defined(__CLR_VER)
LIBYUV_API
void CpuId(uint32 info_eax, uint32 info_ecx, uint32* cpu_info) {
#if defined(_MSC_VER) && !defined(__clang__)
#if (defined(_MSC_VER) && !defined(__clang__)) && !defined(__clang__)
// Visual C version uses intrinsic or inline x86 assembly.
#if (_MSC_FULL_VER >= 160040219)
__cpuidex((int*)(cpu_info), info_eax, info_ecx);
#endif
#if defined(_M_IX86)
#elif defined(_M_IX86)
__asm {
mov eax, info_eax
mov ecx, info_ecx
@ -71,7 +70,8 @@ void CpuId(uint32 info_eax, uint32 info_ecx, uint32* cpu_info) {
cpu_info[3] = cpu_info[2] = cpu_info[1] = cpu_info[0] = 0;
}
#endif
#else // defined(_MSC_VER)
// GCC version uses inline x86 assembly.
#else // (defined(_MSC_VER) && !defined(__clang__)) && !defined(__clang__)
uint32 info_ebx, info_edx;
asm volatile ( // NOLINT
#if defined( __i386__) && defined(__PIC__)
@ -89,37 +89,38 @@ void CpuId(uint32 info_eax, uint32 info_ecx, uint32* cpu_info) {
cpu_info[1] = info_ebx;
cpu_info[2] = info_ecx;
cpu_info[3] = info_edx;
#endif // defined(_MSC_VER)
#endif // (defined(_MSC_VER) && !defined(__clang__)) && !defined(__clang__)
}

#if !defined(__native_client__)
#define HAS_XGETBV
// X86 CPUs have xgetbv to detect OS saves high parts of ymm registers.
int TestOsSaveYmm() {
uint32 xcr0 = 0u;
#if defined(_MSC_VER) && (_MSC_FULL_VER >= 160040219)
xcr0 = (uint32)(_xgetbv(0)); // VS2010 SP1 required.
#endif
#if defined(_M_IX86) && defined(_MSC_VER)
__asm {
xor ecx, ecx // xcr 0
_asm _emit 0x0f _asm _emit 0x01 _asm _emit 0xd0 // For VS2010 and earlier.
mov xcr0, eax
}
#endif
#if defined(__i386__) || defined(__x86_64__)
asm(".byte 0x0f, 0x01, 0xd0" : "=a" (xcr0) : "c" (0) : "%edx");
#endif // defined(_MSC_VER)
return((xcr0 & 6) == 6); // Is ymm saved?
}
#endif // !defined(__native_client__)
#else
#else // (defined(_M_IX86) || defined(_M_X64) ...
LIBYUV_API
void CpuId(uint32 eax, uint32 ecx, uint32* cpu_info) {
cpu_info[0] = cpu_info[1] = cpu_info[2] = cpu_info[3] = 0;
}
#endif

// TODO(fbarchard): Enable xgetbv when validator supports it.
#if (defined(_M_IX86) || defined(_M_X64) || \
defined(__i386__) || defined(__x86_64__)) && \
!defined(__pnacl__) && !defined(__CLR_VER) && !defined(__native_client__)
#define HAS_XGETBV
// X86 CPUs have xgetbv to detect OS saves high parts of ymm registers.
int TestOsSaveYmm() {
uint32 xcr0 = 0u;
#if (defined(_MSC_VER) && !defined(__clang__)) && (_MSC_FULL_VER >= 160040219)
xcr0 = (uint32)(_xgetbv(0)); // VS2010 SP1 required.
#elif defined(_M_IX86) && defined(_MSC_VER) && !defined(__clang__)
__asm {
xor ecx, ecx // xcr 0
_asm _emit 0x0f _asm _emit 0x01 _asm _emit 0xd0 // For VS2010 and earlier.
mov xcr0, eax
}
#elif defined(__i386__) || defined(__x86_64__)
asm(".byte 0x0f, 0x01, 0xd0" : "=a" (xcr0) : "c" (0) : "%edx");
#endif // defined(__i386__) || defined(__x86_64__)
return((xcr0 & 6) == 6); // Is ymm saved?
}
#endif // defined(_M_IX86) || defined(_M_X64) ..

// based on libvpx arm_cpudetect.c
// For Arm, but public to allow testing on any CPU
LIBYUV_API SAFEBUFFERS
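The cpu_id.cc changes above keep the MSVC-only intrinsics and inline assembly out of clang-cl builds by adding !defined(__clang__) to the _MSC_VER guards; runtime dispatch is unchanged. A hedged sketch of how callers query the same flags, assuming libyuv/cpu_id.h:

#include "libyuv/cpu_id.h"
#include <cstdio>

int main() {
  // TestCpuFlag caches the result of InitCpuFlags on first use; this is the
  // same mechanism the row-function selection code in this patch relies on.
  if (libyuv::TestCpuFlag(libyuv::kCpuHasAVX2)) {
    printf("AVX2 row functions will be selected\n");
  } else if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3)) {
    printf("SSSE3 row functions will be selected\n");
  } else {
    printf("C row functions will be selected\n");
  }
  return 0;
}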
6
third_party/libyuv/source/mjpeg_decoder.cc
vendored
6
third_party/libyuv/source/mjpeg_decoder.cc
vendored
@ -18,6 +18,12 @@
// Must be included before jpeglib.
#include <setjmp.h>
#define HAVE_SETJMP

#if defined(_MSC_VER)
// disable warning 4324: structure was padded due to __declspec(align())
#pragma warning(disable:4324)
#endif

#endif
struct FILE; // For jpeglib.h.

2
third_party/libyuv/source/mjpeg_validate.cc
vendored
2
third_party/libyuv/source/mjpeg_validate.cc
vendored
@ -23,7 +23,7 @@ extern "C" {
#ifdef ENABLE_SCASB

// Multiple of 1.
__declspec(naked) __declspec(align(16))
__declspec(naked)
const uint8* ScanRow_ERMS(const uint8* src, uint32 val, int count) {
__asm {
mov edx, edi
303
third_party/libyuv/source/planar_functions.cc
vendored
303
third_party/libyuv/source/planar_functions.cc
vendored
@ -528,7 +528,7 @@ int ARGBMirror(const uint8* src_argb, int src_stride_argb,
|
||||
return 0;
|
||||
}
|
||||
|
||||
// Get a blender that optimized for the CPU, alignment and pixel count.
|
||||
// Get a blender that optimized for the CPU and pixel count.
|
||||
// As there are 6 blenders to choose from, the caller should try to use
|
||||
// the same blend function for all pixels if possible.
|
||||
LIBYUV_API
|
||||
@ -677,12 +677,12 @@ int ARGBAdd(const uint8* src_argb0, int src_stride_argb0,
|
||||
height = 1;
|
||||
src_stride_argb0 = src_stride_argb1 = dst_stride_argb = 0;
|
||||
}
|
||||
#if defined(HAS_ARGBADDROW_SSE2) && defined(_MSC_VER)
|
||||
#if defined(HAS_ARGBADDROW_SSE2) && (defined(_MSC_VER) && !defined(__clang__))
|
||||
if (TestCpuFlag(kCpuHasSSE2)) {
|
||||
ARGBAddRow = ARGBAddRow_SSE2;
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_ARGBADDROW_SSE2) && !defined(_MSC_VER)
|
||||
#if defined(HAS_ARGBADDROW_SSE2) && !(defined(_MSC_VER) && !defined(__clang__))
|
||||
if (TestCpuFlag(kCpuHasSSE2)) {
|
||||
ARGBAddRow = ARGBAddRow_Any_SSE2;
|
||||
if (IS_ALIGNED(width, 4)) {
|
||||
@ -1976,8 +1976,8 @@ static int ARGBSobelize(const uint8* src_argb, int src_stride_argb,
|
||||
const uint8* src_sobely,
|
||||
uint8* dst, int width)) {
|
||||
int y;
|
||||
void (*ARGBToBayerRow)(const uint8* src_argb, uint8* dst_bayer,
|
||||
uint32 selector, int pix) = ARGBToBayerGGRow_C;
|
||||
void (*ARGBToYJRow)(const uint8* src_argb, uint8* dst_g, int pix) =
|
||||
ARGBToYJRow_C;
|
||||
void (*SobelYRow)(const uint8* src_y0, const uint8* src_y1,
|
||||
uint8* dst_sobely, int width) = SobelYRow_C;
|
||||
void (*SobelXRow)(const uint8* src_y0, const uint8* src_y1,
|
||||
@ -1993,31 +1993,32 @@ static int ARGBSobelize(const uint8* src_argb, int src_stride_argb,
|
||||
src_argb = src_argb + (height - 1) * src_stride_argb;
|
||||
src_stride_argb = -src_stride_argb;
|
||||
}
|
||||
// ARGBToBayer used to select G channel from ARGB.
|
||||
#if defined(HAS_ARGBTOBAYERGGROW_SSE2)
|
||||
if (TestCpuFlag(kCpuHasSSE2)) {
|
||||
ARGBToBayerRow = ARGBToBayerGGRow_Any_SSE2;
|
||||
if (IS_ALIGNED(width, 8)) {
|
||||
ARGBToBayerRow = ARGBToBayerGGRow_SSE2;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_ARGBTOBAYERROW_SSSE3)
|
||||
|
||||
#if defined(HAS_ARGBTOYJROW_SSSE3)
|
||||
if (TestCpuFlag(kCpuHasSSSE3)) {
|
||||
ARGBToBayerRow = ARGBToBayerRow_Any_SSSE3;
|
||||
if (IS_ALIGNED(width, 8)) {
|
||||
ARGBToBayerRow = ARGBToBayerRow_SSSE3;
|
||||
ARGBToYJRow = ARGBToYJRow_Any_SSSE3;
|
||||
if (IS_ALIGNED(width, 16)) {
|
||||
ARGBToYJRow = ARGBToYJRow_SSSE3;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_ARGBTOBAYERGGROW_NEON)
|
||||
#if defined(HAS_ARGBTOYJROW_AVX2)
|
||||
if (TestCpuFlag(kCpuHasAVX2)) {
|
||||
ARGBToYJRow = ARGBToYJRow_Any_AVX2;
|
||||
if (IS_ALIGNED(width, 32)) {
|
||||
ARGBToYJRow = ARGBToYJRow_AVX2;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_ARGBTOYJROW_NEON)
|
||||
if (TestCpuFlag(kCpuHasNEON)) {
|
||||
ARGBToBayerRow = ARGBToBayerGGRow_Any_NEON;
|
||||
ARGBToYJRow = ARGBToYJRow_Any_NEON;
|
||||
if (IS_ALIGNED(width, 8)) {
|
||||
ARGBToBayerRow = ARGBToBayerGGRow_NEON;
|
||||
ARGBToYJRow = ARGBToYJRow_NEON;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
#if defined(HAS_SOBELYROW_SSE2)
|
||||
if (TestCpuFlag(kCpuHasSSE2)) {
|
||||
SobelYRow = SobelYRow_SSE2;
|
||||
@ -2040,7 +2041,7 @@ static int ARGBSobelize(const uint8* src_argb, int src_stride_argb,
|
||||
#endif
|
||||
{
|
||||
// 3 rows with edges before/after.
|
||||
const int kRowSize = (width + kEdge + 15) & ~15;
|
||||
const int kRowSize = (width + kEdge + 31) & ~31;
|
||||
align_buffer_64(rows, kRowSize * 2 + (kEdge + kRowSize * 3 + kEdge));
|
||||
uint8* row_sobelx = rows;
|
||||
uint8* row_sobely = rows + kRowSize;
|
||||
@ -2050,20 +2051,20 @@ static int ARGBSobelize(const uint8* src_argb, int src_stride_argb,
|
||||
uint8* row_y0 = row_y + kEdge;
|
||||
uint8* row_y1 = row_y0 + kRowSize;
|
||||
uint8* row_y2 = row_y1 + kRowSize;
|
||||
ARGBToBayerRow(src_argb, row_y0, 0x0d090501, width);
|
||||
ARGBToYJRow(src_argb, row_y0, width);
|
||||
row_y0[-1] = row_y0[0];
|
||||
memset(row_y0 + width, row_y0[width - 1], 16); // Extrude 16 for valgrind.
|
||||
ARGBToBayerRow(src_argb, row_y1, 0x0d090501, width);
|
||||
ARGBToYJRow(src_argb, row_y1, width);
|
||||
row_y1[-1] = row_y1[0];
|
||||
memset(row_y1 + width, row_y1[width - 1], 16);
|
||||
memset(row_y2 + width, 0, 16);
|
||||
|
||||
for (y = 0; y < height; ++y) {
|
||||
// Convert next row of ARGB to Y.
|
||||
// Convert next row of ARGB to G.
|
||||
if (y < (height - 1)) {
|
||||
src_argb += src_stride_argb;
|
||||
}
|
||||
ARGBToBayerRow(src_argb, row_y2, 0x0d090501, width);
|
||||
ARGBToYJRow(src_argb, row_y2, width);
|
||||
row_y2[-1] = row_y2[0];
|
||||
row_y2[width] = row_y2[width - 1];
|
||||
|
||||
@ -2094,13 +2095,19 @@ int ARGBSobel(const uint8* src_argb, int src_stride_argb,
|
||||
void (*SobelRow)(const uint8* src_sobelx, const uint8* src_sobely,
|
||||
uint8* dst_argb, int width) = SobelRow_C;
|
||||
#if defined(HAS_SOBELROW_SSE2)
|
||||
if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 16)) {
|
||||
SobelRow = SobelRow_SSE2;
|
||||
if (TestCpuFlag(kCpuHasSSE2)) {
|
||||
SobelRow = SobelRow_Any_SSE2;
|
||||
if (IS_ALIGNED(width, 16)) {
|
||||
SobelRow = SobelRow_SSE2;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_SOBELROW_NEON)
|
||||
if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) {
|
||||
SobelRow = SobelRow_NEON;
|
||||
if (TestCpuFlag(kCpuHasNEON)) {
|
||||
SobelRow = SobelRow_Any_NEON;
|
||||
if (IS_ALIGNED(width, 8)) {
|
||||
SobelRow = SobelRow_NEON;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
return ARGBSobelize(src_argb, src_stride_argb, dst_argb, dst_stride_argb,
|
||||
@ -2115,13 +2122,19 @@ int ARGBSobelToPlane(const uint8* src_argb, int src_stride_argb,
|
||||
void (*SobelToPlaneRow)(const uint8* src_sobelx, const uint8* src_sobely,
|
||||
uint8* dst_, int width) = SobelToPlaneRow_C;
|
||||
#if defined(HAS_SOBELTOPLANEROW_SSE2)
|
||||
if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 16)) {
|
||||
SobelToPlaneRow = SobelToPlaneRow_SSE2;
|
||||
if (TestCpuFlag(kCpuHasSSE2)) {
|
||||
SobelToPlaneRow = SobelToPlaneRow_Any_SSE2;
|
||||
if (IS_ALIGNED(width, 16)) {
|
||||
SobelToPlaneRow = SobelToPlaneRow_SSE2;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_SOBELTOPLANEROW_NEON)
|
||||
if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 16)) {
|
||||
SobelToPlaneRow = SobelToPlaneRow_NEON;
|
||||
if (TestCpuFlag(kCpuHasNEON)) {
|
||||
SobelToPlaneRow = SobelToPlaneRow_Any_NEON;
|
||||
if (IS_ALIGNED(width, 16)) {
|
||||
SobelToPlaneRow = SobelToPlaneRow_NEON;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
return ARGBSobelize(src_argb, src_stride_argb, dst_y, dst_stride_y,
|
||||
@ -2137,13 +2150,19 @@ int ARGBSobelXY(const uint8* src_argb, int src_stride_argb,
|
||||
void (*SobelXYRow)(const uint8* src_sobelx, const uint8* src_sobely,
|
||||
uint8* dst_argb, int width) = SobelXYRow_C;
|
||||
#if defined(HAS_SOBELXYROW_SSE2)
|
||||
if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 16)) {
|
||||
SobelXYRow = SobelXYRow_SSE2;
|
||||
if (TestCpuFlag(kCpuHasSSE2)) {
|
||||
SobelXYRow = SobelXYRow_Any_SSE2;
|
||||
if (IS_ALIGNED(width, 16)) {
|
||||
SobelXYRow = SobelXYRow_SSE2;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_SOBELXYROW_NEON)
|
||||
if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) {
|
||||
SobelXYRow = SobelXYRow_NEON;
|
||||
if (TestCpuFlag(kCpuHasNEON)) {
|
||||
SobelXYRow = SobelXYRow_Any_NEON;
|
||||
if (IS_ALIGNED(width, 8)) {
|
||||
SobelXYRow = SobelXYRow_NEON;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
return ARGBSobelize(src_argb, src_stride_argb, dst_argb, dst_stride_argb,
|
||||
@ -2322,6 +2341,214 @@ int ARGBCopyYToAlpha(const uint8* src_y, int src_stride_y,
|
||||
return 0;
|
||||
}
|
||||
|
||||
LIBYUV_API
|
||||
int YUY2ToNV12(const uint8* src_yuy2, int src_stride_yuy2,
|
||||
uint8* dst_y, int dst_stride_y,
|
||||
uint8* dst_uv, int dst_stride_uv,
|
||||
int width, int height) {
|
||||
int y;
|
||||
int halfwidth = (width + 1) >> 1;
|
||||
void (*SplitUVRow)(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) =
|
||||
SplitUVRow_C;
|
||||
void (*InterpolateRow)(uint8* dst_ptr, const uint8* src_ptr,
|
||||
ptrdiff_t src_stride, int dst_width,
|
||||
int source_y_fraction) = InterpolateRow_C;
|
||||
if (!src_yuy2 ||
|
||||
!dst_y || !dst_uv ||
|
||||
width <= 0 || height == 0) {
|
||||
return -1;
|
||||
}
|
||||
// Negative height means invert the image.
|
||||
if (height < 0) {
|
||||
height = -height;
|
||||
src_yuy2 = src_yuy2 + (height - 1) * src_stride_yuy2;
|
||||
src_stride_yuy2 = -src_stride_yuy2;
|
||||
}
|
||||
#if defined(HAS_SPLITUVROW_SSE2)
|
||||
if (TestCpuFlag(kCpuHasSSE2)) {
|
||||
SplitUVRow = SplitUVRow_Any_SSE2;
|
||||
if (IS_ALIGNED(width, 16)) {
|
||||
SplitUVRow = SplitUVRow_SSE2;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_SPLITUVROW_AVX2)
|
||||
if (TestCpuFlag(kCpuHasAVX2)) {
|
||||
SplitUVRow = SplitUVRow_Any_AVX2;
|
||||
if (IS_ALIGNED(width, 32)) {
|
||||
SplitUVRow = SplitUVRow_AVX2;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_SPLITUVROW_NEON)
|
||||
if (TestCpuFlag(kCpuHasNEON)) {
|
||||
SplitUVRow = SplitUVRow_Any_NEON;
|
||||
if (IS_ALIGNED(width, 16)) {
|
||||
SplitUVRow = SplitUVRow_NEON;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_INTERPOLATEROW_SSE2)
|
||||
if (TestCpuFlag(kCpuHasSSE2)) {
|
||||
InterpolateRow = InterpolateRow_Any_SSE2;
|
||||
if (IS_ALIGNED(width, 16)) {
|
||||
InterpolateRow = InterpolateRow_SSE2;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_INTERPOLATEROW_SSSE3)
|
||||
if (TestCpuFlag(kCpuHasSSSE3)) {
|
||||
InterpolateRow = InterpolateRow_Any_SSSE3;
|
||||
if (IS_ALIGNED(width, 16)) {
|
||||
InterpolateRow = InterpolateRow_SSSE3;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_INTERPOLATEROW_AVX2)
|
||||
if (TestCpuFlag(kCpuHasAVX2)) {
|
||||
InterpolateRow = InterpolateRow_Any_AVX2;
|
||||
if (IS_ALIGNED(width, 32)) {
|
||||
InterpolateRow = InterpolateRow_AVX2;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_INTERPOLATEROW_NEON)
|
||||
if (TestCpuFlag(kCpuHasNEON)) {
|
||||
InterpolateRow = InterpolateRow_Any_NEON;
|
||||
if (IS_ALIGNED(width, 16)) {
|
||||
InterpolateRow = InterpolateRow_NEON;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
{
|
||||
int awidth = halfwidth * 2;
|
||||
// 2 rows of uv
|
||||
align_buffer_64(rows, awidth * 2);
|
||||
|
||||
for (y = 0; y < height - 1; y += 2) {
|
||||
// Split Y from UV.
|
||||
SplitUVRow(src_yuy2, dst_y, rows, awidth);
|
||||
SplitUVRow(src_yuy2 + src_stride_yuy2, dst_y + dst_stride_y,
|
||||
rows + awidth, awidth);
|
||||
InterpolateRow(dst_uv, rows, awidth, awidth, 128);
|
||||
src_yuy2 += src_stride_yuy2 * 2;
|
||||
dst_y += dst_stride_y * 2;
|
||||
dst_uv += dst_stride_uv;
|
||||
}
|
||||
if (height & 1) {
|
||||
// Split Y from UV.
|
||||
SplitUVRow(src_yuy2, dst_y, dst_uv, width);
|
||||
}
|
||||
free_aligned_buffer_64(rows);
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
LIBYUV_API
|
||||
int UYVYToNV12(const uint8* src_uyvy, int src_stride_uyvy,
|
||||
uint8* dst_y, int dst_stride_y,
|
||||
uint8* dst_uv, int dst_stride_uv,
|
||||
int width, int height) {
|
||||
int y;
|
||||
int halfwidth = (width + 1) >> 1;
|
||||
void (*SplitUVRow)(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) =
|
||||
SplitUVRow_C;
|
||||
void (*InterpolateRow)(uint8* dst_ptr, const uint8* src_ptr,
|
||||
ptrdiff_t src_stride, int dst_width,
|
||||
int source_y_fraction) = InterpolateRow_C;
|
||||
if (!src_uyvy ||
|
||||
!dst_y || !dst_uv ||
|
||||
width <= 0 || height == 0) {
|
||||
return -1;
|
||||
}
|
||||
// Negative height means invert the image.
|
||||
if (height < 0) {
|
||||
height = -height;
|
||||
src_uyvy = src_uyvy + (height - 1) * src_stride_uyvy;
|
||||
src_stride_uyvy = -src_stride_uyvy;
|
||||
}
|
||||
#if defined(HAS_SPLITUVROW_SSE2)
|
||||
if (TestCpuFlag(kCpuHasSSE2)) {
|
||||
SplitUVRow = SplitUVRow_Any_SSE2;
|
||||
if (IS_ALIGNED(width, 16)) {
|
||||
SplitUVRow = SplitUVRow_SSE2;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_SPLITUVROW_AVX2)
|
||||
if (TestCpuFlag(kCpuHasAVX2)) {
|
||||
SplitUVRow = SplitUVRow_Any_AVX2;
|
||||
if (IS_ALIGNED(width, 32)) {
|
||||
SplitUVRow = SplitUVRow_AVX2;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_SPLITUVROW_NEON)
|
||||
if (TestCpuFlag(kCpuHasNEON)) {
|
||||
SplitUVRow = SplitUVRow_Any_NEON;
|
||||
if (IS_ALIGNED(width, 16)) {
|
||||
SplitUVRow = SplitUVRow_NEON;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_INTERPOLATEROW_SSE2)
|
||||
if (TestCpuFlag(kCpuHasSSE2)) {
|
||||
InterpolateRow = InterpolateRow_Any_SSE2;
|
||||
if (IS_ALIGNED(width, 16)) {
|
||||
InterpolateRow = InterpolateRow_SSE2;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_INTERPOLATEROW_SSSE3)
|
||||
if (TestCpuFlag(kCpuHasSSSE3)) {
|
||||
InterpolateRow = InterpolateRow_Any_SSSE3;
|
||||
if (IS_ALIGNED(width, 16)) {
|
||||
InterpolateRow = InterpolateRow_SSSE3;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_INTERPOLATEROW_AVX2)
|
||||
if (TestCpuFlag(kCpuHasAVX2)) {
|
||||
InterpolateRow = InterpolateRow_Any_AVX2;
|
||||
if (IS_ALIGNED(width, 32)) {
|
||||
InterpolateRow = InterpolateRow_AVX2;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_INTERPOLATEROW_NEON)
|
||||
if (TestCpuFlag(kCpuHasNEON)) {
|
||||
InterpolateRow = InterpolateRow_Any_NEON;
|
||||
if (IS_ALIGNED(width, 16)) {
|
||||
InterpolateRow = InterpolateRow_NEON;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
{
|
||||
int awidth = halfwidth * 2;
|
||||
// 2 rows of uv
|
||||
align_buffer_64(rows, awidth * 2);
|
||||
|
||||
for (y = 0; y < height - 1; y += 2) {
|
||||
// Split Y from UV.
|
||||
SplitUVRow(src_uyvy, rows, dst_y, awidth);
|
||||
SplitUVRow(src_uyvy + src_stride_uyvy, rows + awidth,
|
||||
dst_y + dst_stride_y, awidth);
|
||||
InterpolateRow(dst_uv, rows, awidth, awidth, 128);
|
||||
src_uyvy += src_stride_uyvy * 2;
|
||||
dst_y += dst_stride_y * 2;
|
||||
dst_uv += dst_stride_uv;
|
||||
}
|
||||
if (height & 1) {
|
||||
// Split Y from UV.
|
||||
SplitUVRow(src_uyvy, dst_y, dst_uv, width);
|
||||
}
|
||||
free_aligned_buffer_64(rows);
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
#ifdef __cplusplus
|
||||
} // extern "C"
|
||||
} // namespace libyuv
|
||||
|
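The new YUY2ToNV12 and UYVYToNV12 functions above split each packed row into a Y plane plus an interleaved UV plane, averaging the chroma of each row pair with InterpolateRow at a 128 (half and half) blend. A hedged usage sketch; the header that exports YUY2ToNV12 is assumed here to be libyuv/planar_functions.h:

#include "libyuv/planar_functions.h"
#include <vector>

void Yuy2ToNv12(const uint8* yuy2, int width, int height,
                std::vector<uint8>* nv12) {
  const int uv_stride = ((width + 1) / 2) * 2;  // interleaved U+V bytes per row
  const int halfheight = (height + 1) / 2;
  nv12->resize(width * height + uv_stride * halfheight);
  uint8* y = &(*nv12)[0];
  uint8* uv = y + width * height;
  libyuv::YUY2ToNV12(yuy2, width * 2,  // YUY2 is 2 bytes per pixel
                     y, width,
                     uv, uv_stride,
                     width, height);
}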
865
third_party/libyuv/source/rotate.cc
vendored
865
third_party/libyuv/source/rotate.cc
vendored
@ -13,6 +13,7 @@
|
||||
#include "libyuv/cpu_id.h"
|
||||
#include "libyuv/convert.h"
|
||||
#include "libyuv/planar_functions.h"
|
||||
#include "libyuv/rotate_row.h"
|
||||
#include "libyuv/row.h"
|
||||
|
||||
#ifdef __cplusplus
|
||||
@ -20,809 +21,39 @@ namespace libyuv {
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
#if !defined(LIBYUV_DISABLE_X86) && \
|
||||
(defined(_M_IX86) || defined(__x86_64__) || defined(__i386__))
|
||||
#if defined(__APPLE__) && defined(__i386__)
|
||||
#define DECLARE_FUNCTION(name) \
|
||||
".text \n" \
|
||||
".private_extern _" #name " \n" \
|
||||
".align 4,0x90 \n" \
|
||||
"_" #name ": \n"
|
||||
#elif defined(__MINGW32__) || defined(__CYGWIN__) && defined(__i386__)
|
||||
#define DECLARE_FUNCTION(name) \
|
||||
".text \n" \
|
||||
".align 4,0x90 \n" \
|
||||
"_" #name ": \n"
|
||||
#else
|
||||
#define DECLARE_FUNCTION(name) \
|
||||
".text \n" \
|
||||
".align 4,0x90 \n" \
|
||||
#name ": \n"
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#if !defined(LIBYUV_DISABLE_NEON) && !defined(__native_client__) && \
|
||||
(defined(__ARM_NEON__) || defined(LIBYUV_NEON) || defined(__aarch64__))
|
||||
#define HAS_TRANSPOSE_WX8_NEON
|
||||
void TransposeWx8_NEON(const uint8* src, int src_stride,
|
||||
uint8* dst, int dst_stride, int width);
|
||||
#define HAS_TRANSPOSE_UVWX8_NEON
|
||||
void TransposeUVWx8_NEON(const uint8* src, int src_stride,
|
||||
uint8* dst_a, int dst_stride_a,
|
||||
uint8* dst_b, int dst_stride_b,
|
||||
int width);
|
||||
#endif
|
||||
|
||||
#if !defined(LIBYUV_DISABLE_MIPS) && !defined(__native_client__) && \
|
||||
defined(__mips__) && \
|
||||
defined(__mips_dsp) && (__mips_dsp_rev >= 2)
|
||||
#define HAS_TRANSPOSE_WX8_MIPS_DSPR2
|
||||
void TransposeWx8_MIPS_DSPR2(const uint8* src, int src_stride,
|
||||
uint8* dst, int dst_stride, int width);
|
||||
|
||||
void TransposeWx8_FAST_MIPS_DSPR2(const uint8* src, int src_stride,
|
||||
uint8* dst, int dst_stride, int width);
|
||||
#define HAS_TRANSPOSE_UVWx8_MIPS_DSPR2
|
||||
void TransposeUVWx8_MIPS_DSPR2(const uint8* src, int src_stride,
|
||||
uint8* dst_a, int dst_stride_a,
|
||||
uint8* dst_b, int dst_stride_b,
|
||||
int width);
|
||||
#endif // defined(__mips__)
|
||||
|
||||
#if !defined(LIBYUV_DISABLE_X86) && \
|
||||
defined(_M_IX86) && defined(_MSC_VER)
|
||||
#define HAS_TRANSPOSE_WX8_SSSE3
|
||||
__declspec(naked) __declspec(align(16))
|
||||
static void TransposeWx8_SSSE3(const uint8* src, int src_stride,
|
||||
uint8* dst, int dst_stride, int width) {
|
||||
__asm {
|
||||
push edi
|
||||
push esi
|
||||
push ebp
|
||||
mov eax, [esp + 12 + 4] // src
|
||||
mov edi, [esp + 12 + 8] // src_stride
|
||||
mov edx, [esp + 12 + 12] // dst
|
||||
mov esi, [esp + 12 + 16] // dst_stride
|
||||
mov ecx, [esp + 12 + 20] // width
|
||||
|
||||
// Read in the data from the source pointer.
|
||||
// First round of bit swap.
|
||||
align 4
|
||||
convertloop:
|
||||
movq xmm0, qword ptr [eax]
|
||||
lea ebp, [eax + 8]
|
||||
movq xmm1, qword ptr [eax + edi]
|
||||
lea eax, [eax + 2 * edi]
|
||||
punpcklbw xmm0, xmm1
|
||||
movq xmm2, qword ptr [eax]
|
||||
movdqa xmm1, xmm0
|
||||
palignr xmm1, xmm1, 8
|
||||
movq xmm3, qword ptr [eax + edi]
|
||||
lea eax, [eax + 2 * edi]
|
||||
punpcklbw xmm2, xmm3
|
||||
movdqa xmm3, xmm2
|
||||
movq xmm4, qword ptr [eax]
|
||||
palignr xmm3, xmm3, 8
|
||||
movq xmm5, qword ptr [eax + edi]
|
||||
punpcklbw xmm4, xmm5
|
||||
lea eax, [eax + 2 * edi]
|
||||
movdqa xmm5, xmm4
|
||||
movq xmm6, qword ptr [eax]
|
||||
palignr xmm5, xmm5, 8
|
||||
movq xmm7, qword ptr [eax + edi]
|
||||
punpcklbw xmm6, xmm7
|
||||
mov eax, ebp
|
||||
movdqa xmm7, xmm6
|
||||
palignr xmm7, xmm7, 8
|
||||
// Second round of bit swap.
|
||||
punpcklwd xmm0, xmm2
|
||||
punpcklwd xmm1, xmm3
|
||||
movdqa xmm2, xmm0
|
||||
movdqa xmm3, xmm1
|
||||
palignr xmm2, xmm2, 8
|
||||
palignr xmm3, xmm3, 8
|
||||
punpcklwd xmm4, xmm6
|
||||
punpcklwd xmm5, xmm7
|
||||
movdqa xmm6, xmm4
|
||||
movdqa xmm7, xmm5
|
||||
palignr xmm6, xmm6, 8
|
||||
palignr xmm7, xmm7, 8
|
||||
// Third round of bit swap.
|
||||
// Write to the destination pointer.
|
||||
punpckldq xmm0, xmm4
|
||||
movq qword ptr [edx], xmm0
|
||||
movdqa xmm4, xmm0
|
||||
palignr xmm4, xmm4, 8
|
||||
movq qword ptr [edx + esi], xmm4
|
||||
lea edx, [edx + 2 * esi]
|
||||
punpckldq xmm2, xmm6
|
||||
movdqa xmm6, xmm2
|
||||
palignr xmm6, xmm6, 8
|
||||
movq qword ptr [edx], xmm2
|
||||
punpckldq xmm1, xmm5
|
||||
movq qword ptr [edx + esi], xmm6
|
||||
lea edx, [edx + 2 * esi]
|
||||
movdqa xmm5, xmm1
|
||||
movq qword ptr [edx], xmm1
|
||||
palignr xmm5, xmm5, 8
|
||||
punpckldq xmm3, xmm7
|
||||
movq qword ptr [edx + esi], xmm5
|
||||
lea edx, [edx + 2 * esi]
|
||||
movq qword ptr [edx], xmm3
|
||||
movdqa xmm7, xmm3
|
||||
palignr xmm7, xmm7, 8
|
||||
sub ecx, 8
|
||||
movq qword ptr [edx + esi], xmm7
|
||||
lea edx, [edx + 2 * esi]
|
||||
jg convertloop
|
||||
|
||||
pop ebp
|
||||
pop esi
|
||||
pop edi
|
||||
ret
|
||||
}
|
||||
}
|
||||
|
||||
#define HAS_TRANSPOSE_UVWX8_SSE2
|
||||
__declspec(naked) __declspec(align(16))
|
||||
static void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
|
||||
uint8* dst_a, int dst_stride_a,
|
||||
uint8* dst_b, int dst_stride_b,
|
||||
int w) {
|
||||
__asm {
|
||||
push ebx
|
||||
push esi
|
||||
push edi
|
||||
push ebp
|
||||
mov eax, [esp + 16 + 4] // src
|
||||
mov edi, [esp + 16 + 8] // src_stride
|
||||
mov edx, [esp + 16 + 12] // dst_a
|
||||
mov esi, [esp + 16 + 16] // dst_stride_a
|
||||
mov ebx, [esp + 16 + 20] // dst_b
|
||||
mov ebp, [esp + 16 + 24] // dst_stride_b
|
||||
mov ecx, esp
|
||||
sub esp, 4 + 16
|
||||
and esp, ~15
|
||||
mov [esp + 16], ecx
|
||||
mov ecx, [ecx + 16 + 28] // w
|
||||
|
||||
align 4
|
||||
convertloop:
|
||||
// Read in the data from the source pointer.
|
||||
// First round of bit swap.
|
||||
movdqu xmm0, [eax]
|
||||
movdqu xmm1, [eax + edi]
|
||||
lea eax, [eax + 2 * edi]
|
||||
movdqa xmm7, xmm0 // use xmm7 as temp register.
|
||||
punpcklbw xmm0, xmm1
|
||||
punpckhbw xmm7, xmm1
|
||||
movdqa xmm1, xmm7
|
||||
movdqu xmm2, [eax]
|
||||
movdqu xmm3, [eax + edi]
|
||||
lea eax, [eax + 2 * edi]
|
||||
movdqa xmm7, xmm2
|
||||
punpcklbw xmm2, xmm3
|
||||
punpckhbw xmm7, xmm3
|
||||
movdqa xmm3, xmm7
|
||||
movdqu xmm4, [eax]
|
||||
movdqu xmm5, [eax + edi]
|
||||
lea eax, [eax + 2 * edi]
|
||||
movdqa xmm7, xmm4
|
||||
punpcklbw xmm4, xmm5
|
||||
punpckhbw xmm7, xmm5
|
||||
movdqa xmm5, xmm7
|
||||
movdqu xmm6, [eax]
|
||||
movdqu xmm7, [eax + edi]
|
||||
lea eax, [eax + 2 * edi]
|
||||
movdqu [esp], xmm5 // backup xmm5
|
||||
neg edi
|
||||
movdqa xmm5, xmm6 // use xmm5 as temp register.
|
||||
punpcklbw xmm6, xmm7
|
||||
punpckhbw xmm5, xmm7
|
||||
movdqa xmm7, xmm5
|
||||
lea eax, [eax + 8 * edi + 16]
|
||||
neg edi
|
||||
// Second round of bit swap.
|
||||
movdqa xmm5, xmm0
|
||||
punpcklwd xmm0, xmm2
|
||||
punpckhwd xmm5, xmm2
|
||||
movdqa xmm2, xmm5
|
||||
movdqa xmm5, xmm1
|
||||
punpcklwd xmm1, xmm3
|
||||
punpckhwd xmm5, xmm3
|
||||
movdqa xmm3, xmm5
|
||||
movdqa xmm5, xmm4
|
||||
punpcklwd xmm4, xmm6
|
||||
punpckhwd xmm5, xmm6
|
||||
movdqa xmm6, xmm5
|
||||
movdqu xmm5, [esp] // restore xmm5
|
||||
movdqu [esp], xmm6 // backup xmm6
|
||||
movdqa xmm6, xmm5 // use xmm6 as temp register.
|
||||
punpcklwd xmm5, xmm7
|
||||
punpckhwd xmm6, xmm7
|
||||
movdqa xmm7, xmm6
|
||||
// Third round of bit swap.
|
||||
// Write to the destination pointer.
|
||||
movdqa xmm6, xmm0
|
||||
punpckldq xmm0, xmm4
|
||||
punpckhdq xmm6, xmm4
|
||||
movdqa xmm4, xmm6
|
||||
movdqu xmm6, [esp] // restore xmm6
|
||||
movlpd qword ptr [edx], xmm0
|
||||
movhpd qword ptr [ebx], xmm0
|
||||
movlpd qword ptr [edx + esi], xmm4
|
||||
lea edx, [edx + 2 * esi]
|
||||
movhpd qword ptr [ebx + ebp], xmm4
|
||||
lea ebx, [ebx + 2 * ebp]
|
||||
movdqa xmm0, xmm2 // use xmm0 as the temp register.
|
||||
punpckldq xmm2, xmm6
|
||||
movlpd qword ptr [edx], xmm2
|
||||
movhpd qword ptr [ebx], xmm2
|
||||
punpckhdq xmm0, xmm6
|
||||
movlpd qword ptr [edx + esi], xmm0
|
||||
lea edx, [edx + 2 * esi]
|
||||
movhpd qword ptr [ebx + ebp], xmm0
|
||||
lea ebx, [ebx + 2 * ebp]
|
||||
movdqa xmm0, xmm1 // use xmm0 as the temp register.
|
||||
punpckldq xmm1, xmm5
|
||||
movlpd qword ptr [edx], xmm1
|
||||
movhpd qword ptr [ebx], xmm1
|
||||
punpckhdq xmm0, xmm5
|
||||
movlpd qword ptr [edx + esi], xmm0
|
||||
lea edx, [edx + 2 * esi]
|
||||
movhpd qword ptr [ebx + ebp], xmm0
|
||||
lea ebx, [ebx + 2 * ebp]
|
||||
movdqa xmm0, xmm3 // use xmm0 as the temp register.
|
||||
punpckldq xmm3, xmm7
|
||||
movlpd qword ptr [edx], xmm3
|
||||
movhpd qword ptr [ebx], xmm3
|
||||
punpckhdq xmm0, xmm7
|
||||
sub ecx, 8
|
||||
movlpd qword ptr [edx + esi], xmm0
|
||||
lea edx, [edx + 2 * esi]
|
||||
movhpd qword ptr [ebx + ebp], xmm0
|
||||
lea ebx, [ebx + 2 * ebp]
|
||||
jg convertloop
|
||||
|
||||
mov esp, [esp + 16]
|
||||
pop ebp
|
||||
pop edi
|
||||
pop esi
|
||||
pop ebx
|
||||
ret
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if !defined(LIBYUV_DISABLE_X86) && \
|
||||
(defined(__i386__) || (defined(__x86_64__) && !defined(__native_client__)))
|
||||
#define HAS_TRANSPOSE_WX8_SSSE3
|
||||
static void TransposeWx8_SSSE3(const uint8* src, int src_stride,
|
||||
uint8* dst, int dst_stride, int width) {
|
||||
asm volatile (
|
||||
// Read in the data from the source pointer.
|
||||
// First round of bit swap.
|
||||
".p2align 2 \n"
|
||||
"1: \n"
|
||||
"movq (%0),%%xmm0 \n"
|
||||
"movq (%0,%3),%%xmm1 \n"
|
||||
"lea (%0,%3,2),%0 \n"
|
||||
"punpcklbw %%xmm1,%%xmm0 \n"
|
||||
"movq (%0),%%xmm2 \n"
|
||||
"movdqa %%xmm0,%%xmm1 \n"
|
||||
"palignr $0x8,%%xmm1,%%xmm1 \n"
|
||||
"movq (%0,%3),%%xmm3 \n"
|
||||
"lea (%0,%3,2),%0 \n"
|
||||
"punpcklbw %%xmm3,%%xmm2 \n"
|
||||
"movdqa %%xmm2,%%xmm3 \n"
|
||||
"movq (%0),%%xmm4 \n"
|
||||
"palignr $0x8,%%xmm3,%%xmm3 \n"
|
||||
"movq (%0,%3),%%xmm5 \n"
|
||||
"lea (%0,%3,2),%0 \n"
|
||||
"punpcklbw %%xmm5,%%xmm4 \n"
|
||||
"movdqa %%xmm4,%%xmm5 \n"
|
||||
"movq (%0),%%xmm6 \n"
|
||||
"palignr $0x8,%%xmm5,%%xmm5 \n"
|
||||
"movq (%0,%3),%%xmm7 \n"
|
||||
"lea (%0,%3,2),%0 \n"
|
||||
"punpcklbw %%xmm7,%%xmm6 \n"
|
||||
"neg %3 \n"
|
||||
"movdqa %%xmm6,%%xmm7 \n"
|
||||
"lea 0x8(%0,%3,8),%0 \n"
|
||||
"palignr $0x8,%%xmm7,%%xmm7 \n"
|
||||
"neg %3 \n"
|
||||
// Second round of bit swap.
|
||||
"punpcklwd %%xmm2,%%xmm0 \n"
|
||||
"punpcklwd %%xmm3,%%xmm1 \n"
|
||||
"movdqa %%xmm0,%%xmm2 \n"
|
||||
"movdqa %%xmm1,%%xmm3 \n"
|
||||
"palignr $0x8,%%xmm2,%%xmm2 \n"
|
||||
"palignr $0x8,%%xmm3,%%xmm3 \n"
|
||||
"punpcklwd %%xmm6,%%xmm4 \n"
|
||||
"punpcklwd %%xmm7,%%xmm5 \n"
|
||||
"movdqa %%xmm4,%%xmm6 \n"
|
||||
"movdqa %%xmm5,%%xmm7 \n"
|
||||
"palignr $0x8,%%xmm6,%%xmm6 \n"
|
||||
"palignr $0x8,%%xmm7,%%xmm7 \n"
|
||||
// Third round of bit swap.
|
||||
// Write to the destination pointer.
|
||||
"punpckldq %%xmm4,%%xmm0 \n"
|
||||
"movq %%xmm0,(%1) \n"
|
||||
"movdqa %%xmm0,%%xmm4 \n"
|
||||
"palignr $0x8,%%xmm4,%%xmm4 \n"
|
||||
"movq %%xmm4,(%1,%4) \n"
|
||||
"lea (%1,%4,2),%1 \n"
|
||||
"punpckldq %%xmm6,%%xmm2 \n"
|
||||
"movdqa %%xmm2,%%xmm6 \n"
|
||||
"movq %%xmm2,(%1) \n"
|
||||
"palignr $0x8,%%xmm6,%%xmm6 \n"
|
||||
"punpckldq %%xmm5,%%xmm1 \n"
|
||||
"movq %%xmm6,(%1,%4) \n"
|
||||
"lea (%1,%4,2),%1 \n"
|
||||
"movdqa %%xmm1,%%xmm5 \n"
|
||||
"movq %%xmm1,(%1) \n"
|
||||
"palignr $0x8,%%xmm5,%%xmm5 \n"
|
||||
"movq %%xmm5,(%1,%4) \n"
|
||||
"lea (%1,%4,2),%1 \n"
|
||||
"punpckldq %%xmm7,%%xmm3 \n"
|
||||
"movq %%xmm3,(%1) \n"
|
||||
"movdqa %%xmm3,%%xmm7 \n"
|
||||
"palignr $0x8,%%xmm7,%%xmm7 \n"
|
||||
"sub $0x8,%2 \n"
|
||||
"movq %%xmm7,(%1,%4) \n"
|
||||
"lea (%1,%4,2),%1 \n"
|
||||
"jg 1b \n"
|
||||
: "+r"(src), // %0
|
||||
"+r"(dst), // %1
|
||||
"+r"(width) // %2
|
||||
: "r"((intptr_t)(src_stride)), // %3
|
||||
"r"((intptr_t)(dst_stride)) // %4
|
||||
: "memory", "cc",
|
||||
"xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
|
||||
);
|
||||
}
|
||||
|
||||
#if !defined(LIBYUV_DISABLE_X86) && defined(__i386__)
|
||||
#define HAS_TRANSPOSE_UVWX8_SSE2
|
||||
void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
|
||||
uint8* dst_a, int dst_stride_a,
|
||||
uint8* dst_b, int dst_stride_b,
|
||||
int w);
|
||||
asm (
|
||||
DECLARE_FUNCTION(TransposeUVWx8_SSE2)
|
||||
"push %ebx \n"
|
||||
"push %esi \n"
|
||||
"push %edi \n"
|
||||
"push %ebp \n"
|
||||
"mov 0x14(%esp),%eax \n"
|
||||
"mov 0x18(%esp),%edi \n"
|
||||
"mov 0x1c(%esp),%edx \n"
|
||||
"mov 0x20(%esp),%esi \n"
|
||||
"mov 0x24(%esp),%ebx \n"
|
||||
"mov 0x28(%esp),%ebp \n"
|
||||
"mov %esp,%ecx \n"
|
||||
"sub $0x14,%esp \n"
|
||||
"and $0xfffffff0,%esp \n"
|
||||
"mov %ecx,0x10(%esp) \n"
|
||||
"mov 0x2c(%ecx),%ecx \n"
|
||||
|
||||
"1: \n"
|
||||
"movdqu (%eax),%xmm0 \n"
|
||||
"movdqu (%eax,%edi,1),%xmm1 \n"
|
||||
"lea (%eax,%edi,2),%eax \n"
|
||||
"movdqa %xmm0,%xmm7 \n"
|
||||
"punpcklbw %xmm1,%xmm0 \n"
|
||||
"punpckhbw %xmm1,%xmm7 \n"
|
||||
"movdqa %xmm7,%xmm1 \n"
|
||||
"movdqu (%eax),%xmm2 \n"
|
||||
"movdqu (%eax,%edi,1),%xmm3 \n"
|
||||
"lea (%eax,%edi,2),%eax \n"
|
||||
"movdqa %xmm2,%xmm7 \n"
|
||||
"punpcklbw %xmm3,%xmm2 \n"
|
||||
"punpckhbw %xmm3,%xmm7 \n"
|
||||
"movdqa %xmm7,%xmm3 \n"
|
||||
"movdqu (%eax),%xmm4 \n"
|
||||
"movdqu (%eax,%edi,1),%xmm5 \n"
|
||||
"lea (%eax,%edi,2),%eax \n"
|
||||
"movdqa %xmm4,%xmm7 \n"
|
||||
"punpcklbw %xmm5,%xmm4 \n"
|
||||
"punpckhbw %xmm5,%xmm7 \n"
|
||||
"movdqa %xmm7,%xmm5 \n"
|
||||
"movdqu (%eax),%xmm6 \n"
|
||||
"movdqu (%eax,%edi,1),%xmm7 \n"
|
||||
"lea (%eax,%edi,2),%eax \n"
|
||||
"movdqu %xmm5,(%esp) \n"
|
||||
"neg %edi \n"
|
||||
"movdqa %xmm6,%xmm5 \n"
|
||||
"punpcklbw %xmm7,%xmm6 \n"
|
||||
"punpckhbw %xmm7,%xmm5 \n"
|
||||
"movdqa %xmm5,%xmm7 \n"
|
||||
"lea 0x10(%eax,%edi,8),%eax \n"
|
||||
"neg %edi \n"
|
||||
"movdqa %xmm0,%xmm5 \n"
|
||||
"punpcklwd %xmm2,%xmm0 \n"
|
||||
"punpckhwd %xmm2,%xmm5 \n"
|
||||
"movdqa %xmm5,%xmm2 \n"
|
||||
"movdqa %xmm1,%xmm5 \n"
|
||||
"punpcklwd %xmm3,%xmm1 \n"
|
||||
"punpckhwd %xmm3,%xmm5 \n"
|
||||
"movdqa %xmm5,%xmm3 \n"
|
||||
"movdqa %xmm4,%xmm5 \n"
|
||||
"punpcklwd %xmm6,%xmm4 \n"
|
||||
"punpckhwd %xmm6,%xmm5 \n"
|
||||
"movdqa %xmm5,%xmm6 \n"
|
||||
"movdqu (%esp),%xmm5 \n"
|
||||
"movdqu %xmm6,(%esp) \n"
|
||||
"movdqa %xmm5,%xmm6 \n"
|
||||
"punpcklwd %xmm7,%xmm5 \n"
|
||||
"punpckhwd %xmm7,%xmm6 \n"
|
||||
"movdqa %xmm6,%xmm7 \n"
|
||||
"movdqa %xmm0,%xmm6 \n"
|
||||
"punpckldq %xmm4,%xmm0 \n"
|
||||
"punpckhdq %xmm4,%xmm6 \n"
|
||||
"movdqa %xmm6,%xmm4 \n"
|
||||
"movdqu (%esp),%xmm6 \n"
|
||||
"movlpd %xmm0,(%edx) \n"
|
||||
"movhpd %xmm0,(%ebx) \n"
|
||||
"movlpd %xmm4,(%edx,%esi,1) \n"
|
||||
"lea (%edx,%esi,2),%edx \n"
|
||||
"movhpd %xmm4,(%ebx,%ebp,1) \n"
|
||||
"lea (%ebx,%ebp,2),%ebx \n"
|
||||
"movdqa %xmm2,%xmm0 \n"
|
||||
"punpckldq %xmm6,%xmm2 \n"
|
||||
"movlpd %xmm2,(%edx) \n"
|
||||
"movhpd %xmm2,(%ebx) \n"
|
||||
"punpckhdq %xmm6,%xmm0 \n"
|
||||
"movlpd %xmm0,(%edx,%esi,1) \n"
|
||||
"lea (%edx,%esi,2),%edx \n"
|
||||
"movhpd %xmm0,(%ebx,%ebp,1) \n"
|
||||
"lea (%ebx,%ebp,2),%ebx \n"
|
||||
"movdqa %xmm1,%xmm0 \n"
|
||||
"punpckldq %xmm5,%xmm1 \n"
|
||||
"movlpd %xmm1,(%edx) \n"
|
||||
"movhpd %xmm1,(%ebx) \n"
|
||||
"punpckhdq %xmm5,%xmm0 \n"
|
||||
"movlpd %xmm0,(%edx,%esi,1) \n"
|
||||
"lea (%edx,%esi,2),%edx \n"
|
||||
"movhpd %xmm0,(%ebx,%ebp,1) \n"
|
||||
"lea (%ebx,%ebp,2),%ebx \n"
|
||||
"movdqa %xmm3,%xmm0 \n"
|
||||
"punpckldq %xmm7,%xmm3 \n"
|
||||
"movlpd %xmm3,(%edx) \n"
|
||||
"movhpd %xmm3,(%ebx) \n"
|
||||
"punpckhdq %xmm7,%xmm0 \n"
|
||||
"sub $0x8,%ecx \n"
|
||||
"movlpd %xmm0,(%edx,%esi,1) \n"
|
||||
"lea (%edx,%esi,2),%edx \n"
|
||||
"movhpd %xmm0,(%ebx,%ebp,1) \n"
|
||||
"lea (%ebx,%ebp,2),%ebx \n"
|
||||
"jg 1b \n"
|
||||
"mov 0x10(%esp),%esp \n"
|
||||
"pop %ebp \n"
|
||||
"pop %edi \n"
|
||||
"pop %esi \n"
|
||||
"pop %ebx \n"
|
||||
#if defined(__native_client__)
|
||||
"pop %ecx \n"
|
||||
"and $0xffffffe0,%ecx \n"
|
||||
"jmp *%ecx \n"
|
||||
#else
|
||||
"ret \n"
|
||||
#endif
|
||||
);
|
||||
#endif
|
||||
#if !defined(LIBYUV_DISABLE_X86) && !defined(__native_client__) && \
|
||||
defined(__x86_64__)
|
||||
// 64 bit version has enough registers to do 16x8 to 8x16 at a time.
|
||||
#define HAS_TRANSPOSE_WX8_FAST_SSSE3
|
||||
static void TransposeWx8_FAST_SSSE3(const uint8* src, int src_stride,
|
||||
uint8* dst, int dst_stride, int width) {
|
||||
asm volatile (
|
||||
// Read in the data from the source pointer.
|
||||
// First round of bit swap.
|
||||
".p2align 2 \n"
|
||||
"1: \n"
|
||||
"movdqu (%0),%%xmm0 \n"
|
||||
"movdqu (%0,%3),%%xmm1 \n"
|
||||
"lea (%0,%3,2),%0 \n"
|
||||
"movdqa %%xmm0,%%xmm8 \n"
|
||||
"punpcklbw %%xmm1,%%xmm0 \n"
|
||||
"punpckhbw %%xmm1,%%xmm8 \n"
|
||||
"movdqu (%0),%%xmm2 \n"
|
||||
"movdqa %%xmm0,%%xmm1 \n"
|
||||
"movdqa %%xmm8,%%xmm9 \n"
|
||||
"palignr $0x8,%%xmm1,%%xmm1 \n"
|
||||
"palignr $0x8,%%xmm9,%%xmm9 \n"
|
||||
"movdqu (%0,%3),%%xmm3 \n"
|
||||
"lea (%0,%3,2),%0 \n"
|
||||
"movdqa %%xmm2,%%xmm10 \n"
|
||||
"punpcklbw %%xmm3,%%xmm2 \n"
|
||||
"punpckhbw %%xmm3,%%xmm10 \n"
|
||||
"movdqa %%xmm2,%%xmm3 \n"
|
||||
"movdqa %%xmm10,%%xmm11 \n"
|
||||
"movdqu (%0),%%xmm4 \n"
|
||||
"palignr $0x8,%%xmm3,%%xmm3 \n"
|
||||
"palignr $0x8,%%xmm11,%%xmm11 \n"
|
||||
"movdqu (%0,%3),%%xmm5 \n"
|
||||
"lea (%0,%3,2),%0 \n"
|
||||
"movdqa %%xmm4,%%xmm12 \n"
|
||||
"punpcklbw %%xmm5,%%xmm4 \n"
|
||||
"punpckhbw %%xmm5,%%xmm12 \n"
|
||||
"movdqa %%xmm4,%%xmm5 \n"
|
||||
"movdqa %%xmm12,%%xmm13 \n"
|
||||
"movdqu (%0),%%xmm6 \n"
|
||||
"palignr $0x8,%%xmm5,%%xmm5 \n"
|
||||
"palignr $0x8,%%xmm13,%%xmm13 \n"
|
||||
"movdqu (%0,%3),%%xmm7 \n"
|
||||
"lea (%0,%3,2),%0 \n"
|
||||
"movdqa %%xmm6,%%xmm14 \n"
|
||||
"punpcklbw %%xmm7,%%xmm6 \n"
|
||||
"punpckhbw %%xmm7,%%xmm14 \n"
|
||||
"neg %3 \n"
|
||||
"movdqa %%xmm6,%%xmm7 \n"
|
||||
"movdqa %%xmm14,%%xmm15 \n"
|
||||
"lea 0x10(%0,%3,8),%0 \n"
|
||||
"palignr $0x8,%%xmm7,%%xmm7 \n"
|
||||
"palignr $0x8,%%xmm15,%%xmm15 \n"
|
||||
"neg %3 \n"
|
||||
// Second round of bit swap.
|
||||
"punpcklwd %%xmm2,%%xmm0 \n"
|
||||
"punpcklwd %%xmm3,%%xmm1 \n"
|
||||
"movdqa %%xmm0,%%xmm2 \n"
|
||||
"movdqa %%xmm1,%%xmm3 \n"
|
||||
"palignr $0x8,%%xmm2,%%xmm2 \n"
|
||||
"palignr $0x8,%%xmm3,%%xmm3 \n"
|
||||
"punpcklwd %%xmm6,%%xmm4 \n"
|
||||
"punpcklwd %%xmm7,%%xmm5 \n"
|
||||
"movdqa %%xmm4,%%xmm6 \n"
|
||||
"movdqa %%xmm5,%%xmm7 \n"
|
||||
"palignr $0x8,%%xmm6,%%xmm6 \n"
|
||||
"palignr $0x8,%%xmm7,%%xmm7 \n"
|
||||
"punpcklwd %%xmm10,%%xmm8 \n"
|
||||
"punpcklwd %%xmm11,%%xmm9 \n"
|
||||
"movdqa %%xmm8,%%xmm10 \n"
|
||||
"movdqa %%xmm9,%%xmm11 \n"
|
||||
"palignr $0x8,%%xmm10,%%xmm10 \n"
|
||||
"palignr $0x8,%%xmm11,%%xmm11 \n"
|
||||
"punpcklwd %%xmm14,%%xmm12 \n"
|
||||
"punpcklwd %%xmm15,%%xmm13 \n"
|
||||
"movdqa %%xmm12,%%xmm14 \n"
|
||||
"movdqa %%xmm13,%%xmm15 \n"
|
||||
"palignr $0x8,%%xmm14,%%xmm14 \n"
|
||||
"palignr $0x8,%%xmm15,%%xmm15 \n"
|
||||
// Third round of bit swap.
|
||||
// Write to the destination pointer.
|
||||
"punpckldq %%xmm4,%%xmm0 \n"
|
||||
"movq %%xmm0,(%1) \n"
|
||||
"movdqa %%xmm0,%%xmm4 \n"
|
||||
"palignr $0x8,%%xmm4,%%xmm4 \n"
|
||||
"movq %%xmm4,(%1,%4) \n"
|
||||
"lea (%1,%4,2),%1 \n"
|
||||
"punpckldq %%xmm6,%%xmm2 \n"
|
||||
"movdqa %%xmm2,%%xmm6 \n"
|
||||
"movq %%xmm2,(%1) \n"
|
||||
"palignr $0x8,%%xmm6,%%xmm6 \n"
|
||||
"punpckldq %%xmm5,%%xmm1 \n"
|
||||
"movq %%xmm6,(%1,%4) \n"
|
||||
"lea (%1,%4,2),%1 \n"
|
||||
"movdqa %%xmm1,%%xmm5 \n"
|
||||
"movq %%xmm1,(%1) \n"
|
||||
"palignr $0x8,%%xmm5,%%xmm5 \n"
|
||||
"movq %%xmm5,(%1,%4) \n"
|
||||
"lea (%1,%4,2),%1 \n"
|
||||
"punpckldq %%xmm7,%%xmm3 \n"
|
||||
"movq %%xmm3,(%1) \n"
|
||||
"movdqa %%xmm3,%%xmm7 \n"
|
||||
"palignr $0x8,%%xmm7,%%xmm7 \n"
|
||||
"movq %%xmm7,(%1,%4) \n"
|
||||
"lea (%1,%4,2),%1 \n"
|
||||
"punpckldq %%xmm12,%%xmm8 \n"
|
||||
"movq %%xmm8,(%1) \n"
|
||||
"movdqa %%xmm8,%%xmm12 \n"
|
||||
"palignr $0x8,%%xmm12,%%xmm12 \n"
|
||||
"movq %%xmm12,(%1,%4) \n"
|
||||
"lea (%1,%4,2),%1 \n"
|
||||
"punpckldq %%xmm14,%%xmm10 \n"
|
||||
"movdqa %%xmm10,%%xmm14 \n"
|
||||
"movq %%xmm10,(%1) \n"
|
||||
"palignr $0x8,%%xmm14,%%xmm14 \n"
|
||||
"punpckldq %%xmm13,%%xmm9 \n"
|
||||
"movq %%xmm14,(%1,%4) \n"
|
||||
"lea (%1,%4,2),%1 \n"
|
||||
"movdqa %%xmm9,%%xmm13 \n"
|
||||
"movq %%xmm9,(%1) \n"
|
||||
"palignr $0x8,%%xmm13,%%xmm13 \n"
|
||||
"movq %%xmm13,(%1,%4) \n"
|
||||
"lea (%1,%4,2),%1 \n"
|
||||
"punpckldq %%xmm15,%%xmm11 \n"
|
||||
"movq %%xmm11,(%1) \n"
|
||||
"movdqa %%xmm11,%%xmm15 \n"
|
||||
"palignr $0x8,%%xmm15,%%xmm15 \n"
|
||||
"sub $0x10,%2 \n"
|
||||
"movq %%xmm15,(%1,%4) \n"
|
||||
"lea (%1,%4,2),%1 \n"
|
||||
"jg 1b \n"
|
||||
: "+r"(src), // %0
|
||||
"+r"(dst), // %1
|
||||
"+r"(width) // %2
|
||||
: "r"((intptr_t)(src_stride)), // %3
|
||||
"r"((intptr_t)(dst_stride)) // %4
|
||||
: "memory", "cc",
|
||||
"xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7",
|
||||
"xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15"
|
||||
);
|
||||
}
|
||||
|
||||
#define HAS_TRANSPOSE_UVWX8_SSE2
|
||||
static void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
|
||||
uint8* dst_a, int dst_stride_a,
|
||||
uint8* dst_b, int dst_stride_b,
|
||||
int w) {
|
||||
asm volatile (
|
||||
// Read in the data from the source pointer.
|
||||
// First round of bit swap.
|
||||
".p2align 2 \n"
|
||||
"1: \n"
|
||||
"movdqu (%0),%%xmm0 \n"
|
||||
"movdqu (%0,%4),%%xmm1 \n"
|
||||
"lea (%0,%4,2),%0 \n"
|
||||
"movdqa %%xmm0,%%xmm8 \n"
|
||||
"punpcklbw %%xmm1,%%xmm0 \n"
|
||||
"punpckhbw %%xmm1,%%xmm8 \n"
|
||||
"movdqa %%xmm8,%%xmm1 \n"
|
||||
"movdqu (%0),%%xmm2 \n"
|
||||
"movdqu (%0,%4),%%xmm3 \n"
|
||||
"lea (%0,%4,2),%0 \n"
|
||||
"movdqa %%xmm2,%%xmm8 \n"
|
||||
"punpcklbw %%xmm3,%%xmm2 \n"
|
||||
"punpckhbw %%xmm3,%%xmm8 \n"
|
||||
"movdqa %%xmm8,%%xmm3 \n"
|
||||
"movdqu (%0),%%xmm4 \n"
|
||||
"movdqu (%0,%4),%%xmm5 \n"
|
||||
"lea (%0,%4,2),%0 \n"
|
||||
"movdqa %%xmm4,%%xmm8 \n"
|
||||
"punpcklbw %%xmm5,%%xmm4 \n"
|
||||
"punpckhbw %%xmm5,%%xmm8 \n"
|
||||
"movdqa %%xmm8,%%xmm5 \n"
|
||||
"movdqu (%0),%%xmm6 \n"
|
||||
"movdqu (%0,%4),%%xmm7 \n"
|
||||
"lea (%0,%4,2),%0 \n"
|
||||
"movdqa %%xmm6,%%xmm8 \n"
|
||||
"punpcklbw %%xmm7,%%xmm6 \n"
|
||||
"neg %4 \n"
|
||||
"lea 0x10(%0,%4,8),%0 \n"
|
||||
"punpckhbw %%xmm7,%%xmm8 \n"
|
||||
"movdqa %%xmm8,%%xmm7 \n"
|
||||
"neg %4 \n"
|
||||
// Second round of bit swap.
|
||||
"movdqa %%xmm0,%%xmm8 \n"
|
||||
"movdqa %%xmm1,%%xmm9 \n"
|
||||
"punpckhwd %%xmm2,%%xmm8 \n"
|
||||
"punpckhwd %%xmm3,%%xmm9 \n"
|
||||
"punpcklwd %%xmm2,%%xmm0 \n"
|
||||
"punpcklwd %%xmm3,%%xmm1 \n"
|
||||
"movdqa %%xmm8,%%xmm2 \n"
|
||||
"movdqa %%xmm9,%%xmm3 \n"
|
||||
"movdqa %%xmm4,%%xmm8 \n"
|
||||
"movdqa %%xmm5,%%xmm9 \n"
|
||||
"punpckhwd %%xmm6,%%xmm8 \n"
|
||||
"punpckhwd %%xmm7,%%xmm9 \n"
|
||||
"punpcklwd %%xmm6,%%xmm4 \n"
|
||||
"punpcklwd %%xmm7,%%xmm5 \n"
|
||||
"movdqa %%xmm8,%%xmm6 \n"
|
||||
"movdqa %%xmm9,%%xmm7 \n"
|
||||
// Third round of bit swap.
|
||||
// Write to the destination pointer.
|
||||
"movdqa %%xmm0,%%xmm8 \n"
|
||||
"punpckldq %%xmm4,%%xmm0 \n"
|
||||
"movlpd %%xmm0,(%1) \n" // Write back U channel
|
||||
"movhpd %%xmm0,(%2) \n" // Write back V channel
|
||||
"punpckhdq %%xmm4,%%xmm8 \n"
|
||||
"movlpd %%xmm8,(%1,%5) \n"
|
||||
"lea (%1,%5,2),%1 \n"
|
||||
"movhpd %%xmm8,(%2,%6) \n"
|
||||
"lea (%2,%6,2),%2 \n"
|
||||
"movdqa %%xmm2,%%xmm8 \n"
|
||||
"punpckldq %%xmm6,%%xmm2 \n"
|
||||
"movlpd %%xmm2,(%1) \n"
|
||||
"movhpd %%xmm2,(%2) \n"
|
||||
"punpckhdq %%xmm6,%%xmm8 \n"
|
||||
"movlpd %%xmm8,(%1,%5) \n"
|
||||
"lea (%1,%5,2),%1 \n"
|
||||
"movhpd %%xmm8,(%2,%6) \n"
|
||||
"lea (%2,%6,2),%2 \n"
|
||||
"movdqa %%xmm1,%%xmm8 \n"
|
||||
"punpckldq %%xmm5,%%xmm1 \n"
|
||||
"movlpd %%xmm1,(%1) \n"
|
||||
"movhpd %%xmm1,(%2) \n"
|
||||
"punpckhdq %%xmm5,%%xmm8 \n"
|
||||
"movlpd %%xmm8,(%1,%5) \n"
|
||||
"lea (%1,%5,2),%1 \n"
|
||||
"movhpd %%xmm8,(%2,%6) \n"
|
||||
"lea (%2,%6,2),%2 \n"
|
||||
"movdqa %%xmm3,%%xmm8 \n"
|
||||
"punpckldq %%xmm7,%%xmm3 \n"
|
||||
"movlpd %%xmm3,(%1) \n"
|
||||
"movhpd %%xmm3,(%2) \n"
|
||||
"punpckhdq %%xmm7,%%xmm8 \n"
|
||||
"sub $0x8,%3 \n"
|
||||
"movlpd %%xmm8,(%1,%5) \n"
|
||||
"lea (%1,%5,2),%1 \n"
|
||||
"movhpd %%xmm8,(%2,%6) \n"
|
||||
"lea (%2,%6,2),%2 \n"
|
||||
"jg 1b \n"
|
||||
: "+r"(src), // %0
|
||||
"+r"(dst_a), // %1
|
||||
"+r"(dst_b), // %2
|
||||
"+r"(w) // %3
|
||||
: "r"((intptr_t)(src_stride)), // %4
|
||||
"r"((intptr_t)(dst_stride_a)), // %5
|
||||
"r"((intptr_t)(dst_stride_b)) // %6
|
||||
: "memory", "cc",
|
||||
"xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7",
|
||||
"xmm8", "xmm9"
|
||||
);
|
||||
}
|
||||
#endif
|
||||
#endif
|
||||
|
||||
static void TransposeWx8_C(const uint8* src, int src_stride,
|
||||
uint8* dst, int dst_stride,
|
||||
int width) {
|
||||
int i;
|
||||
for (i = 0; i < width; ++i) {
|
||||
dst[0] = src[0 * src_stride];
|
||||
dst[1] = src[1 * src_stride];
|
||||
dst[2] = src[2 * src_stride];
|
||||
dst[3] = src[3 * src_stride];
|
||||
dst[4] = src[4 * src_stride];
|
||||
dst[5] = src[5 * src_stride];
|
||||
dst[6] = src[6 * src_stride];
|
||||
dst[7] = src[7 * src_stride];
|
||||
++src;
|
||||
dst += dst_stride;
|
||||
}
|
||||
}
|
||||
|
||||
static void TransposeWxH_C(const uint8* src, int src_stride,
|
||||
uint8* dst, int dst_stride,
|
||||
int width, int height) {
|
||||
int i;
|
||||
for (i = 0; i < width; ++i) {
|
||||
int j;
|
||||
for (j = 0; j < height; ++j) {
|
||||
dst[i * dst_stride + j] = src[j * src_stride + i];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
LIBYUV_API
|
||||
void TransposePlane(const uint8* src, int src_stride,
|
||||
uint8* dst, int dst_stride,
|
||||
int width, int height) {
|
||||
int i = height;
|
||||
void (*TransposeWx8)(const uint8* src, int src_stride,
|
||||
uint8* dst, int dst_stride,
|
||||
int width) = TransposeWx8_C;
|
||||
#if defined(HAS_TRANSPOSE_WX8_NEON)
|
||||
uint8* dst, int dst_stride, int width) = TransposeWx8_C;
|
||||
#if defined(HAS_TRANSPOSEWX8_NEON)
|
||||
if (TestCpuFlag(kCpuHasNEON)) {
|
||||
TransposeWx8 = TransposeWx8_NEON;
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_TRANSPOSE_WX8_SSSE3)
|
||||
if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 8)) {
|
||||
TransposeWx8 = TransposeWx8_SSSE3;
|
||||
#if defined(HAS_TRANSPOSEWX8_SSSE3)
|
||||
if (TestCpuFlag(kCpuHasSSSE3)) {
|
||||
TransposeWx8 = TransposeWx8_Any_SSSE3;
|
||||
if (IS_ALIGNED(width, 8)) {
|
||||
TransposeWx8 = TransposeWx8_SSSE3;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_TRANSPOSE_WX8_FAST_SSSE3)
|
||||
if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 16)) {
|
||||
TransposeWx8 = TransposeWx8_FAST_SSSE3;
|
||||
#if defined(HAS_TRANSPOSEWX8_FAST_SSSE3)
|
||||
if (TestCpuFlag(kCpuHasSSSE3)) {
|
||||
TransposeWx8 = TransposeWx8_Fast_Any_SSSE3;
|
||||
if (IS_ALIGNED(width, 16)) {
|
||||
TransposeWx8 = TransposeWx8_Fast_SSSE3;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_TRANSPOSE_WX8_MIPS_DSPR2)
|
||||
#if defined(HAS_TRANSPOSEWX8_MIPS_DSPR2)
|
||||
if (TestCpuFlag(kCpuHasMIPS_DSPR2)) {
|
||||
if (IS_ALIGNED(width, 4) &&
|
||||
IS_ALIGNED(src, 4) && IS_ALIGNED(src_stride, 4)) {
|
||||
TransposeWx8 = TransposeWx8_FAST_MIPS_DSPR2;
|
||||
TransposeWx8 = TransposeWx8_Fast_MIPS_DSPR2;
|
||||
} else {
|
||||
TransposeWx8 = TransposeWx8_MIPS_DSPR2;
|
||||
}
|
||||
@ -837,7 +68,9 @@ void TransposePlane(const uint8* src, int src_stride,
|
||||
i -= 8;
|
||||
}
|
||||
|
||||
TransposeWxH_C(src, src_stride, dst, dst_stride, width, i);
|
||||
if (i > 0) {
|
||||
TransposeWxH_C(src, src_stride, dst, dst_stride, width, i);
|
||||
}
|
||||
}
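The hunk above wraps the leftover-rows call in if (i > 0) so the C fallback is skipped when the height is already a multiple of 8. For context, a hedged sketch of the surrounding strip-mining driver; the 8-row source advance and 8-column destination advance per strip are assumptions inferred from the Wx8 kernel contract, and the real TransposePlane also picks a SIMD kernel via TestCpuFlag instead of calling the C kernels directly.

#include <stdint.h>

void TransposeWx8_C(const uint8_t* src, int src_stride,
                    uint8_t* dst, int dst_stride, int width);
void TransposeWxH_C(const uint8_t* src, int src_stride,
                    uint8_t* dst, int dst_stride, int width, int height);

// Sketch only: process 8 rows at a time, then hand the remaining rows to C.
void TransposePlaneSketch(const uint8_t* src, int src_stride,
                          uint8_t* dst, int dst_stride,
                          int width, int height) {
  int i = height;
  while (i >= 8) {
    TransposeWx8_C(src, src_stride, dst, dst_stride, width);
    src += 8 * src_stride;  // down 8 source rows
    dst += 8;               // right 8 destination columns
    i -= 8;
  }
  if (i > 0) {  // height not a multiple of 8
    TransposeWxH_C(src, src_stride, dst, dst_stride, width, i);
  }
}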
|
||||
|
||||
LIBYUV_API
|
||||
@ -955,48 +188,6 @@ void RotatePlane180(const uint8* src, int src_stride,
|
||||
free_aligned_buffer_64(row);
|
||||
}
|
||||
|
||||
static void TransposeUVWx8_C(const uint8* src, int src_stride,
|
||||
uint8* dst_a, int dst_stride_a,
|
||||
uint8* dst_b, int dst_stride_b,
|
||||
int width) {
|
||||
int i;
|
||||
for (i = 0; i < width; ++i) {
|
||||
dst_a[0] = src[0 * src_stride + 0];
|
||||
dst_b[0] = src[0 * src_stride + 1];
|
||||
dst_a[1] = src[1 * src_stride + 0];
|
||||
dst_b[1] = src[1 * src_stride + 1];
|
||||
dst_a[2] = src[2 * src_stride + 0];
|
||||
dst_b[2] = src[2 * src_stride + 1];
|
||||
dst_a[3] = src[3 * src_stride + 0];
|
||||
dst_b[3] = src[3 * src_stride + 1];
|
||||
dst_a[4] = src[4 * src_stride + 0];
|
||||
dst_b[4] = src[4 * src_stride + 1];
|
||||
dst_a[5] = src[5 * src_stride + 0];
|
||||
dst_b[5] = src[5 * src_stride + 1];
|
||||
dst_a[6] = src[6 * src_stride + 0];
|
||||
dst_b[6] = src[6 * src_stride + 1];
|
||||
dst_a[7] = src[7 * src_stride + 0];
|
||||
dst_b[7] = src[7 * src_stride + 1];
|
||||
src += 2;
|
||||
dst_a += dst_stride_a;
|
||||
dst_b += dst_stride_b;
|
||||
}
|
||||
}
|
||||
|
||||
static void TransposeUVWxH_C(const uint8* src, int src_stride,
|
||||
uint8* dst_a, int dst_stride_a,
|
||||
uint8* dst_b, int dst_stride_b,
|
||||
int width, int height) {
|
||||
int i;
|
||||
for (i = 0; i < width * 2; i += 2) {
|
||||
int j;
|
||||
for (j = 0; j < height; ++j) {
|
||||
dst_a[j + ((i >> 1) * dst_stride_a)] = src[i + (j * src_stride)];
|
||||
dst_b[j + ((i >> 1) * dst_stride_b)] = src[i + (j * src_stride) + 1];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
LIBYUV_API
|
||||
void TransposeUV(const uint8* src, int src_stride,
|
||||
uint8* dst_a, int dst_stride_a,
|
||||
@ -1007,17 +198,17 @@ void TransposeUV(const uint8* src, int src_stride,
|
||||
uint8* dst_a, int dst_stride_a,
|
||||
uint8* dst_b, int dst_stride_b,
|
||||
int width) = TransposeUVWx8_C;
|
||||
#if defined(HAS_TRANSPOSE_UVWX8_NEON)
|
||||
#if defined(HAS_TRANSPOSEUVWX8_NEON)
|
||||
if (TestCpuFlag(kCpuHasNEON)) {
|
||||
TransposeUVWx8 = TransposeUVWx8_NEON;
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_TRANSPOSE_UVWX8_SSE2)
|
||||
#if defined(HAS_TRANSPOSEUVWX8_SSE2)
|
||||
if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 8)) {
|
||||
TransposeUVWx8 = TransposeUVWx8_SSE2;
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_TRANSPOSE_UVWx8_MIPS_DSPR2)
|
||||
#if defined(HAS_TRANSPOSEUVWx8_MIPS_DSPR2)
|
||||
if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(width, 2) &&
|
||||
IS_ALIGNED(src, 4) && IS_ALIGNED(src_stride, 4)) {
|
||||
TransposeUVWx8 = TransposeUVWx8_MIPS_DSPR2;
|
||||
@ -1036,10 +227,12 @@ void TransposeUV(const uint8* src, int src_stride,
|
||||
i -= 8;
|
||||
}
|
||||
|
||||
TransposeUVWxH_C(src, src_stride,
|
||||
dst_a, dst_stride_a,
|
||||
dst_b, dst_stride_b,
|
||||
width, i);
|
||||
if (i > 0) {
|
||||
TransposeUVWxH_C(src, src_stride,
|
||||
dst_a, dst_stride_a,
|
||||
dst_b, dst_stride_b,
|
||||
width, i);
|
||||
}
|
||||
}
|
||||
|
||||
LIBYUV_API
|
||||
|
55  third_party/libyuv/source/rotate_any.cc  vendored  Normal file
@ -0,0 +1,55 @@
/*
 * Copyright 2015 The LibYuv Project Authors. All rights reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */

#include "libyuv/rotate.h"
#include "libyuv/rotate_row.h"

#include "libyuv/basic_types.h"

#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif

#define TANY(NAMEANY, TPOS_SIMD, TPOS_C, MASK)                                \
    void NAMEANY(const uint8* src, int src_stride,                            \
                 uint8* dst, int dst_stride, int width) {                     \
      int r = width & MASK;                                                   \
      int n = width - r;                                                      \
      if (n > 0) {                                                            \
        TPOS_SIMD(src, src_stride, dst, dst_stride, n);                       \
      }                                                                       \
      TPOS_C(src + n, src_stride, dst + n * dst_stride, dst_stride, r);       \
    }

#ifdef HAS_TRANSPOSEWX8_NEON
TANY(TransposeWx8_Any_NEON, TransposeWx8_NEON, TransposeWx8_C, 7)
#endif
#ifdef HAS_TRANSPOSEWX8_SSSE3
TANY(TransposeWx8_Any_SSSE3, TransposeWx8_SSSE3, TransposeWx8_C, 7)
#endif
#ifdef HAS_TRANSPOSEWX8_FAST_SSSE3
TANY(TransposeWx8_Fast_Any_SSSE3, TransposeWx8_Fast_SSSE3, TransposeWx8_C, 15)
#endif
#ifdef HAS_TRANSPOSEWX8_MIPS_DSPR2
TANY(TransposeWx8_Any_MIPS_DSPR2, TransposeWx8_MIPS_DSPR2, TransposeWx8_C, 7)
#endif

#undef TANY

#ifdef __cplusplus
}  // extern "C"
}  // namespace libyuv
#endif
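The TANY macro is how the new _Any_ entry points handle widths that are not a multiple of the SIMD block: run the SIMD kernel on the aligned part, then let the C kernel finish the remainder. Expanded by hand for the NEON case it is roughly the following; the prototypes are spelled out here only so the snippet stands alone, in the real file they come from the libyuv headers.

#include <stdint.h>

void TransposeWx8_NEON(const uint8_t* src, int src_stride,
                       uint8_t* dst, int dst_stride, int width);
void TransposeWx8_C(const uint8_t* src, int src_stride,
                    uint8_t* dst, int dst_stride, int width);

// Rough expansion of TANY(TransposeWx8_Any_NEON, TransposeWx8_NEON,
// TransposeWx8_C, 7).
void TransposeWx8_Any_NEON(const uint8_t* src, int src_stride,
                           uint8_t* dst, int dst_stride, int width) {
  int r = width & 7;   // leftover columns
  int n = width - r;   // columns the SIMD kernel can handle
  if (n > 0) {
    TransposeWx8_NEON(src, src_stride, dst, dst_stride, n);
  }
  // The tail starts n columns into the source, which is n rows into the
  // transposed destination.
  TransposeWx8_C(src + n, src_stride, dst + n * dst_stride, dst_stride, r);
}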
24  third_party/libyuv/source/rotate_argb.cc  vendored
@ -27,24 +27,20 @@ extern "C" {
|
||||
(defined(__x86_64__) && !defined(__native_client__)) || defined(__i386__))
|
||||
#define HAS_SCALEARGBROWDOWNEVEN_SSE2
|
||||
void ScaleARGBRowDownEven_SSE2(const uint8* src_ptr, int src_stride,
|
||||
int src_stepx,
|
||||
uint8* dst_ptr, int dst_width);
|
||||
int src_stepx, uint8* dst_ptr, int dst_width);
|
||||
#endif
|
||||
#if !defined(LIBYUV_DISABLE_NEON) && !defined(__native_client__) && \
|
||||
(defined(__ARM_NEON__) || defined(LIBYUV_NEON) || defined(__aarch64__))
|
||||
#define HAS_SCALEARGBROWDOWNEVEN_NEON
|
||||
void ScaleARGBRowDownEven_NEON(const uint8* src_ptr, int src_stride,
|
||||
int src_stepx,
|
||||
uint8* dst_ptr, int dst_width);
|
||||
int src_stepx, uint8* dst_ptr, int dst_width);
|
||||
#endif
|
||||
|
||||
void ScaleARGBRowDownEven_C(const uint8* src_ptr, int,
|
||||
int src_stepx,
|
||||
uint8* dst_ptr, int dst_width);
|
||||
int src_stepx, uint8* dst_ptr, int dst_width);
|
||||
|
||||
static void ARGBTranspose(const uint8* src, int src_stride,
|
||||
uint8* dst, int dst_stride,
|
||||
int width, int height) {
|
||||
uint8* dst, int dst_stride, int width, int height) {
|
||||
int i;
|
||||
int src_pixel_step = src_stride >> 2;
|
||||
void (*ScaleARGBRowDownEven)(const uint8* src_ptr, int src_stride,
|
||||
@ -68,8 +64,7 @@ static void ARGBTranspose(const uint8* src, int src_stride,
|
||||
}
|
||||
|
||||
void ARGBRotate90(const uint8* src, int src_stride,
|
||||
uint8* dst, int dst_stride,
|
||||
int width, int height) {
|
||||
uint8* dst, int dst_stride, int width, int height) {
|
||||
// Rotate by 90 is a ARGBTranspose with the source read
|
||||
// from bottom to top. So set the source pointer to the end
|
||||
// of the buffer and flip the sign of the source stride.
|
||||
@ -79,8 +74,7 @@ void ARGBRotate90(const uint8* src, int src_stride,
|
||||
}
|
||||
|
||||
void ARGBRotate270(const uint8* src, int src_stride,
|
||||
uint8* dst, int dst_stride,
|
||||
int width, int height) {
|
||||
uint8* dst, int dst_stride, int width, int height) {
|
||||
// Rotate by 270 is a ARGBTranspose with the destination written
|
||||
// from bottom to top. So set the destination pointer to the end
|
||||
// of the buffer and flip the sign of the destination stride.
|
||||
@ -90,8 +84,7 @@ void ARGBRotate270(const uint8* src, int src_stride,
|
||||
}
|
||||
|
||||
void ARGBRotate180(const uint8* src, int src_stride,
|
||||
uint8* dst, int dst_stride,
|
||||
int width, int height) {
|
||||
uint8* dst, int dst_stride, int width, int height) {
|
||||
// Swap first and last row and mirror the content. Uses a temporary row.
|
||||
align_buffer_64(row, width * 4);
|
||||
const uint8* src_bot = src + src_stride * (height - 1);
|
||||
@ -166,8 +159,7 @@ void ARGBRotate180(const uint8* src, int src_stride,
|
||||
|
||||
LIBYUV_API
|
||||
int ARGBRotate(const uint8* src_argb, int src_stride_argb,
|
||||
uint8* dst_argb, int dst_stride_argb,
|
||||
int width, int height,
|
||||
uint8* dst_argb, int dst_stride_argb, int width, int height,
|
||||
enum RotationMode mode) {
|
||||
if (!src_argb || width <= 0 || height == 0 || !dst_argb) {
|
||||
return -1;
|
||||
|
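The rotate_argb.cc hunks above keep the existing scheme in which a 90 or 270 degree rotation is just a transpose with one stride flipped, as the inline comments describe. A naive sketch of the same trick follows; the helper names are made up for illustration and the per-pixel copy stands in for the real, row-kernel-based ARGBTranspose.

#include <stdint.h>
#include <string.h>

// Naive ARGB transpose: dst is height x width, 4 bytes per pixel.
static void ArgbTransposeNaive(const uint8_t* src, int src_stride,
                               uint8_t* dst, int dst_stride,
                               int width, int height) {
  int i, j;
  for (i = 0; i < width; ++i) {
    for (j = 0; j < height; ++j) {
      memcpy(dst + i * dst_stride + j * 4, src + j * src_stride + i * 4, 4);
    }
  }
}

// Rotate 90 degrees clockwise: transpose while reading the source bottom-up,
// i.e. start at the last row and negate the source stride.
static void ArgbRotate90Naive(const uint8_t* src, int src_stride,
                              uint8_t* dst, int dst_stride,
                              int width, int height) {
  ArgbTransposeNaive(src + src_stride * (height - 1), -src_stride,
                     dst, dst_stride, width, height);
}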
92  third_party/libyuv/source/rotate_common.cc  vendored  Normal file
@ -0,0 +1,92 @@
|
||||
/*
|
||||
* Copyright 2011 The LibYuv Project Authors. All rights reserved.
|
||||
*
|
||||
* Use of this source code is governed by a BSD-style license
|
||||
* that can be found in the LICENSE file in the root of the source
|
||||
* tree. An additional intellectual property rights grant can be found
|
||||
* in the file PATENTS. All contributing project authors may
|
||||
* be found in the AUTHORS file in the root of the source tree.
|
||||
*/
|
||||
|
||||
#include "libyuv/row.h"
|
||||
#include "libyuv/rotate_row.h"
|
||||
|
||||
#ifdef __cplusplus
|
||||
namespace libyuv {
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
void TransposeWx8_C(const uint8* src, int src_stride,
|
||||
uint8* dst, int dst_stride, int width) {
|
||||
int i;
|
||||
for (i = 0; i < width; ++i) {
|
||||
dst[0] = src[0 * src_stride];
|
||||
dst[1] = src[1 * src_stride];
|
||||
dst[2] = src[2 * src_stride];
|
||||
dst[3] = src[3 * src_stride];
|
||||
dst[4] = src[4 * src_stride];
|
||||
dst[5] = src[5 * src_stride];
|
||||
dst[6] = src[6 * src_stride];
|
||||
dst[7] = src[7 * src_stride];
|
||||
++src;
|
||||
dst += dst_stride;
|
||||
}
|
||||
}
|
||||
|
||||
void TransposeUVWx8_C(const uint8* src, int src_stride,
|
||||
uint8* dst_a, int dst_stride_a,
|
||||
uint8* dst_b, int dst_stride_b, int width) {
|
||||
int i;
|
||||
for (i = 0; i < width; ++i) {
|
||||
dst_a[0] = src[0 * src_stride + 0];
|
||||
dst_b[0] = src[0 * src_stride + 1];
|
||||
dst_a[1] = src[1 * src_stride + 0];
|
||||
dst_b[1] = src[1 * src_stride + 1];
|
||||
dst_a[2] = src[2 * src_stride + 0];
|
||||
dst_b[2] = src[2 * src_stride + 1];
|
||||
dst_a[3] = src[3 * src_stride + 0];
|
||||
dst_b[3] = src[3 * src_stride + 1];
|
||||
dst_a[4] = src[4 * src_stride + 0];
|
||||
dst_b[4] = src[4 * src_stride + 1];
|
||||
dst_a[5] = src[5 * src_stride + 0];
|
||||
dst_b[5] = src[5 * src_stride + 1];
|
||||
dst_a[6] = src[6 * src_stride + 0];
|
||||
dst_b[6] = src[6 * src_stride + 1];
|
||||
dst_a[7] = src[7 * src_stride + 0];
|
||||
dst_b[7] = src[7 * src_stride + 1];
|
||||
src += 2;
|
||||
dst_a += dst_stride_a;
|
||||
dst_b += dst_stride_b;
|
||||
}
|
||||
}
|
||||
|
||||
void TransposeWxH_C(const uint8* src, int src_stride,
|
||||
uint8* dst, int dst_stride,
|
||||
int width, int height) {
|
||||
int i;
|
||||
for (i = 0; i < width; ++i) {
|
||||
int j;
|
||||
for (j = 0; j < height; ++j) {
|
||||
dst[i * dst_stride + j] = src[j * src_stride + i];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void TransposeUVWxH_C(const uint8* src, int src_stride,
|
||||
uint8* dst_a, int dst_stride_a,
|
||||
uint8* dst_b, int dst_stride_b,
|
||||
int width, int height) {
|
||||
int i;
|
||||
for (i = 0; i < width * 2; i += 2) {
|
||||
int j;
|
||||
for (j = 0; j < height; ++j) {
|
||||
dst_a[j + ((i >> 1) * dst_stride_a)] = src[i + (j * src_stride)];
|
||||
dst_b[j + ((i >> 1) * dst_stride_b)] = src[i + (j * src_stride) + 1];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#ifdef __cplusplus
|
||||
} // extern "C"
|
||||
} // namespace libyuv
|
||||
#endif
|
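rotate_common.cc collects the portable C kernels the dispatchers fall back to. As a quick sanity check of the Wx8 contract shown above (an 8-row strip of width columns becomes width rows of 8 bytes), here is a tiny standalone program using the same logic under a local, non-libyuv name.

#include <stdint.h>
#include <stdio.h>

// Same logic as TransposeWx8_C above, under a local name so this builds alone.
static void TransposeWx8Local(const uint8_t* src, int src_stride,
                              uint8_t* dst, int dst_stride, int width) {
  int i, j;
  for (i = 0; i < width; ++i) {
    for (j = 0; j < 8; ++j) {
      dst[j] = src[j * src_stride];
    }
    ++src;
    dst += dst_stride;
  }
}

int main(void) {
  uint8_t src[8 * 4];  /* 8 rows x 4 columns, row-major */
  uint8_t dst[4 * 8];  /* 4 rows x 8 columns after transposing */
  int i;
  for (i = 0; i < 8 * 4; ++i) src[i] = (uint8_t)i;
  TransposeWx8Local(src, 4, dst, 8, 4);
  printf("%d %d %d\n", dst[0], dst[1], dst[2]);  /* 0 4 8: first source column */
  return 0;
}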
493  third_party/libyuv/source/rotate_gcc.cc  vendored  Normal file
@ -0,0 +1,493 @@
|
||||
/*
|
||||
* Copyright 2015 The LibYuv Project Authors. All rights reserved.
|
||||
*
|
||||
* Use of this source code is governed by a BSD-style license
|
||||
* that can be found in the LICENSE file in the root of the source
|
||||
* tree. An additional intellectual property rights grant can be found
|
||||
* in the file PATENTS. All contributing project authors may
|
||||
* be found in the AUTHORS file in the root of the source tree.
|
||||
*/
|
||||
|
||||
#include "libyuv/row.h"
|
||||
#include "libyuv/rotate_row.h"
|
||||
|
||||
#ifdef __cplusplus
|
||||
namespace libyuv {
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
// This module is for GCC x86 and x64.
|
||||
#if !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__))
|
||||
|
||||
#if !defined(LIBYUV_DISABLE_X86) && \
|
||||
(defined(__i386__) || (defined(__x86_64__) && !defined(__native_client__)))
|
||||
void TransposeWx8_SSSE3(const uint8* src, int src_stride,
|
||||
uint8* dst, int dst_stride, int width) {
|
||||
asm volatile (
|
||||
// Read in the data from the source pointer.
|
||||
// First round of bit swap.
|
||||
".p2align 2 \n"
|
||||
"1: \n"
|
||||
"movq (%0),%%xmm0 \n"
|
||||
"movq (%0,%3),%%xmm1 \n"
|
||||
"lea (%0,%3,2),%0 \n"
|
||||
"punpcklbw %%xmm1,%%xmm0 \n"
|
||||
"movq (%0),%%xmm2 \n"
|
||||
"movdqa %%xmm0,%%xmm1 \n"
|
||||
"palignr $0x8,%%xmm1,%%xmm1 \n"
|
||||
"movq (%0,%3),%%xmm3 \n"
|
||||
"lea (%0,%3,2),%0 \n"
|
||||
"punpcklbw %%xmm3,%%xmm2 \n"
|
||||
"movdqa %%xmm2,%%xmm3 \n"
|
||||
"movq (%0),%%xmm4 \n"
|
||||
"palignr $0x8,%%xmm3,%%xmm3 \n"
|
||||
"movq (%0,%3),%%xmm5 \n"
|
||||
"lea (%0,%3,2),%0 \n"
|
||||
"punpcklbw %%xmm5,%%xmm4 \n"
|
||||
"movdqa %%xmm4,%%xmm5 \n"
|
||||
"movq (%0),%%xmm6 \n"
|
||||
"palignr $0x8,%%xmm5,%%xmm5 \n"
|
||||
"movq (%0,%3),%%xmm7 \n"
|
||||
"lea (%0,%3,2),%0 \n"
|
||||
"punpcklbw %%xmm7,%%xmm6 \n"
|
||||
"neg %3 \n"
|
||||
"movdqa %%xmm6,%%xmm7 \n"
|
||||
"lea 0x8(%0,%3,8),%0 \n"
|
||||
"palignr $0x8,%%xmm7,%%xmm7 \n"
|
||||
"neg %3 \n"
|
||||
// Second round of bit swap.
|
||||
"punpcklwd %%xmm2,%%xmm0 \n"
|
||||
"punpcklwd %%xmm3,%%xmm1 \n"
|
||||
"movdqa %%xmm0,%%xmm2 \n"
|
||||
"movdqa %%xmm1,%%xmm3 \n"
|
||||
"palignr $0x8,%%xmm2,%%xmm2 \n"
|
||||
"palignr $0x8,%%xmm3,%%xmm3 \n"
|
||||
"punpcklwd %%xmm6,%%xmm4 \n"
|
||||
"punpcklwd %%xmm7,%%xmm5 \n"
|
||||
"movdqa %%xmm4,%%xmm6 \n"
|
||||
"movdqa %%xmm5,%%xmm7 \n"
|
||||
"palignr $0x8,%%xmm6,%%xmm6 \n"
|
||||
"palignr $0x8,%%xmm7,%%xmm7 \n"
|
||||
// Third round of bit swap.
|
||||
// Write to the destination pointer.
|
||||
"punpckldq %%xmm4,%%xmm0 \n"
|
||||
"movq %%xmm0,(%1) \n"
|
||||
"movdqa %%xmm0,%%xmm4 \n"
|
||||
"palignr $0x8,%%xmm4,%%xmm4 \n"
|
||||
"movq %%xmm4,(%1,%4) \n"
|
||||
"lea (%1,%4,2),%1 \n"
|
||||
"punpckldq %%xmm6,%%xmm2 \n"
|
||||
"movdqa %%xmm2,%%xmm6 \n"
|
||||
"movq %%xmm2,(%1) \n"
|
||||
"palignr $0x8,%%xmm6,%%xmm6 \n"
|
||||
"punpckldq %%xmm5,%%xmm1 \n"
|
||||
"movq %%xmm6,(%1,%4) \n"
|
||||
"lea (%1,%4,2),%1 \n"
|
||||
"movdqa %%xmm1,%%xmm5 \n"
|
||||
"movq %%xmm1,(%1) \n"
|
||||
"palignr $0x8,%%xmm5,%%xmm5 \n"
|
||||
"movq %%xmm5,(%1,%4) \n"
|
||||
"lea (%1,%4,2),%1 \n"
|
||||
"punpckldq %%xmm7,%%xmm3 \n"
|
||||
"movq %%xmm3,(%1) \n"
|
||||
"movdqa %%xmm3,%%xmm7 \n"
|
||||
"palignr $0x8,%%xmm7,%%xmm7 \n"
|
||||
"sub $0x8,%2 \n"
|
||||
"movq %%xmm7,(%1,%4) \n"
|
||||
"lea (%1,%4,2),%1 \n"
|
||||
"jg 1b \n"
|
||||
: "+r"(src), // %0
|
||||
"+r"(dst), // %1
|
||||
"+r"(width) // %2
|
||||
: "r"((intptr_t)(src_stride)), // %3
|
||||
"r"((intptr_t)(dst_stride)) // %4
|
||||
: "memory", "cc",
|
||||
"xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
|
||||
);
|
||||
}
|
||||
|
||||
#if !defined(LIBYUV_DISABLE_X86) && defined(__i386__) && !defined(__clang__)
|
||||
void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
|
||||
uint8* dst_a, int dst_stride_a,
|
||||
uint8* dst_b, int dst_stride_b, int width);
|
||||
asm (
|
||||
DECLARE_FUNCTION(TransposeUVWx8_SSE2)
|
||||
"push %ebx \n"
|
||||
"push %esi \n"
|
||||
"push %edi \n"
|
||||
"push %ebp \n"
|
||||
"mov 0x14(%esp),%eax \n"
|
||||
"mov 0x18(%esp),%edi \n"
|
||||
"mov 0x1c(%esp),%edx \n"
|
||||
"mov 0x20(%esp),%esi \n"
|
||||
"mov 0x24(%esp),%ebx \n"
|
||||
"mov 0x28(%esp),%ebp \n"
|
||||
"mov %esp,%ecx \n"
|
||||
"sub $0x14,%esp \n"
|
||||
"and $0xfffffff0,%esp \n"
|
||||
"mov %ecx,0x10(%esp) \n"
|
||||
"mov 0x2c(%ecx),%ecx \n"
|
||||
|
||||
"1: \n"
|
||||
"movdqu (%eax),%xmm0 \n"
|
||||
"movdqu (%eax,%edi,1),%xmm1 \n"
|
||||
"lea (%eax,%edi,2),%eax \n"
|
||||
"movdqa %xmm0,%xmm7 \n"
|
||||
"punpcklbw %xmm1,%xmm0 \n"
|
||||
"punpckhbw %xmm1,%xmm7 \n"
|
||||
"movdqa %xmm7,%xmm1 \n"
|
||||
"movdqu (%eax),%xmm2 \n"
|
||||
"movdqu (%eax,%edi,1),%xmm3 \n"
|
||||
"lea (%eax,%edi,2),%eax \n"
|
||||
"movdqa %xmm2,%xmm7 \n"
|
||||
"punpcklbw %xmm3,%xmm2 \n"
|
||||
"punpckhbw %xmm3,%xmm7 \n"
|
||||
"movdqa %xmm7,%xmm3 \n"
|
||||
"movdqu (%eax),%xmm4 \n"
|
||||
"movdqu (%eax,%edi,1),%xmm5 \n"
|
||||
"lea (%eax,%edi,2),%eax \n"
|
||||
"movdqa %xmm4,%xmm7 \n"
|
||||
"punpcklbw %xmm5,%xmm4 \n"
|
||||
"punpckhbw %xmm5,%xmm7 \n"
|
||||
"movdqa %xmm7,%xmm5 \n"
|
||||
"movdqu (%eax),%xmm6 \n"
|
||||
"movdqu (%eax,%edi,1),%xmm7 \n"
|
||||
"lea (%eax,%edi,2),%eax \n"
|
||||
"movdqu %xmm5,(%esp) \n"
|
||||
"neg %edi \n"
|
||||
"movdqa %xmm6,%xmm5 \n"
|
||||
"punpcklbw %xmm7,%xmm6 \n"
|
||||
"punpckhbw %xmm7,%xmm5 \n"
|
||||
"movdqa %xmm5,%xmm7 \n"
|
||||
"lea 0x10(%eax,%edi,8),%eax \n"
|
||||
"neg %edi \n"
|
||||
"movdqa %xmm0,%xmm5 \n"
|
||||
"punpcklwd %xmm2,%xmm0 \n"
|
||||
"punpckhwd %xmm2,%xmm5 \n"
|
||||
"movdqa %xmm5,%xmm2 \n"
|
||||
"movdqa %xmm1,%xmm5 \n"
|
||||
"punpcklwd %xmm3,%xmm1 \n"
|
||||
"punpckhwd %xmm3,%xmm5 \n"
|
||||
"movdqa %xmm5,%xmm3 \n"
|
||||
"movdqa %xmm4,%xmm5 \n"
|
||||
"punpcklwd %xmm6,%xmm4 \n"
|
||||
"punpckhwd %xmm6,%xmm5 \n"
|
||||
"movdqa %xmm5,%xmm6 \n"
|
||||
"movdqu (%esp),%xmm5 \n"
|
||||
"movdqu %xmm6,(%esp) \n"
|
||||
"movdqa %xmm5,%xmm6 \n"
|
||||
"punpcklwd %xmm7,%xmm5 \n"
|
||||
"punpckhwd %xmm7,%xmm6 \n"
|
||||
"movdqa %xmm6,%xmm7 \n"
|
||||
"movdqa %xmm0,%xmm6 \n"
|
||||
"punpckldq %xmm4,%xmm0 \n"
|
||||
"punpckhdq %xmm4,%xmm6 \n"
|
||||
"movdqa %xmm6,%xmm4 \n"
|
||||
"movdqu (%esp),%xmm6 \n"
|
||||
"movlpd %xmm0,(%edx) \n"
|
||||
"movhpd %xmm0,(%ebx) \n"
|
||||
"movlpd %xmm4,(%edx,%esi,1) \n"
|
||||
"lea (%edx,%esi,2),%edx \n"
|
||||
"movhpd %xmm4,(%ebx,%ebp,1) \n"
|
||||
"lea (%ebx,%ebp,2),%ebx \n"
|
||||
"movdqa %xmm2,%xmm0 \n"
|
||||
"punpckldq %xmm6,%xmm2 \n"
|
||||
"movlpd %xmm2,(%edx) \n"
|
||||
"movhpd %xmm2,(%ebx) \n"
|
||||
"punpckhdq %xmm6,%xmm0 \n"
|
||||
"movlpd %xmm0,(%edx,%esi,1) \n"
|
||||
"lea (%edx,%esi,2),%edx \n"
|
||||
"movhpd %xmm0,(%ebx,%ebp,1) \n"
|
||||
"lea (%ebx,%ebp,2),%ebx \n"
|
||||
"movdqa %xmm1,%xmm0 \n"
|
||||
"punpckldq %xmm5,%xmm1 \n"
|
||||
"movlpd %xmm1,(%edx) \n"
|
||||
"movhpd %xmm1,(%ebx) \n"
|
||||
"punpckhdq %xmm5,%xmm0 \n"
|
||||
"movlpd %xmm0,(%edx,%esi,1) \n"
|
||||
"lea (%edx,%esi,2),%edx \n"
|
||||
"movhpd %xmm0,(%ebx,%ebp,1) \n"
|
||||
"lea (%ebx,%ebp,2),%ebx \n"
|
||||
"movdqa %xmm3,%xmm0 \n"
|
||||
"punpckldq %xmm7,%xmm3 \n"
|
||||
"movlpd %xmm3,(%edx) \n"
|
||||
"movhpd %xmm3,(%ebx) \n"
|
||||
"punpckhdq %xmm7,%xmm0 \n"
|
||||
"sub $0x8,%ecx \n"
|
||||
"movlpd %xmm0,(%edx,%esi,1) \n"
|
||||
"lea (%edx,%esi,2),%edx \n"
|
||||
"movhpd %xmm0,(%ebx,%ebp,1) \n"
|
||||
"lea (%ebx,%ebp,2),%ebx \n"
|
||||
"jg 1b \n"
|
||||
"mov 0x10(%esp),%esp \n"
|
||||
"pop %ebp \n"
|
||||
"pop %edi \n"
|
||||
"pop %esi \n"
|
||||
"pop %ebx \n"
|
||||
#if defined(__native_client__)
|
||||
"pop %ecx \n"
|
||||
"and $0xffffffe0,%ecx \n"
|
||||
"jmp *%ecx \n"
|
||||
#else
|
||||
"ret \n"
|
||||
#endif
|
||||
);
|
||||
#endif
|
||||
#if !defined(LIBYUV_DISABLE_X86) && !defined(__native_client__) && \
|
||||
defined(__x86_64__)
|
||||
// 64 bit version has enough registers to do 16x8 to 8x16 at a time.
|
||||
void TransposeWx8_Fast_SSSE3(const uint8* src, int src_stride,
|
||||
uint8* dst, int dst_stride, int width) {
|
||||
asm volatile (
|
||||
// Read in the data from the source pointer.
|
||||
// First round of bit swap.
|
||||
".p2align 2 \n"
|
||||
"1: \n"
|
||||
"movdqu (%0),%%xmm0 \n"
|
||||
"movdqu (%0,%3),%%xmm1 \n"
|
||||
"lea (%0,%3,2),%0 \n"
|
||||
"movdqa %%xmm0,%%xmm8 \n"
|
||||
"punpcklbw %%xmm1,%%xmm0 \n"
|
||||
"punpckhbw %%xmm1,%%xmm8 \n"
|
||||
"movdqu (%0),%%xmm2 \n"
|
||||
"movdqa %%xmm0,%%xmm1 \n"
|
||||
"movdqa %%xmm8,%%xmm9 \n"
|
||||
"palignr $0x8,%%xmm1,%%xmm1 \n"
|
||||
"palignr $0x8,%%xmm9,%%xmm9 \n"
|
||||
"movdqu (%0,%3),%%xmm3 \n"
|
||||
"lea (%0,%3,2),%0 \n"
|
||||
"movdqa %%xmm2,%%xmm10 \n"
|
||||
"punpcklbw %%xmm3,%%xmm2 \n"
|
||||
"punpckhbw %%xmm3,%%xmm10 \n"
|
||||
"movdqa %%xmm2,%%xmm3 \n"
|
||||
"movdqa %%xmm10,%%xmm11 \n"
|
||||
"movdqu (%0),%%xmm4 \n"
|
||||
"palignr $0x8,%%xmm3,%%xmm3 \n"
|
||||
"palignr $0x8,%%xmm11,%%xmm11 \n"
|
||||
"movdqu (%0,%3),%%xmm5 \n"
|
||||
"lea (%0,%3,2),%0 \n"
|
||||
"movdqa %%xmm4,%%xmm12 \n"
|
||||
"punpcklbw %%xmm5,%%xmm4 \n"
|
||||
"punpckhbw %%xmm5,%%xmm12 \n"
|
||||
"movdqa %%xmm4,%%xmm5 \n"
|
||||
"movdqa %%xmm12,%%xmm13 \n"
|
||||
"movdqu (%0),%%xmm6 \n"
|
||||
"palignr $0x8,%%xmm5,%%xmm5 \n"
|
||||
"palignr $0x8,%%xmm13,%%xmm13 \n"
|
||||
"movdqu (%0,%3),%%xmm7 \n"
|
||||
"lea (%0,%3,2),%0 \n"
|
||||
"movdqa %%xmm6,%%xmm14 \n"
|
||||
"punpcklbw %%xmm7,%%xmm6 \n"
|
||||
"punpckhbw %%xmm7,%%xmm14 \n"
|
||||
"neg %3 \n"
|
||||
"movdqa %%xmm6,%%xmm7 \n"
|
||||
"movdqa %%xmm14,%%xmm15 \n"
|
||||
"lea 0x10(%0,%3,8),%0 \n"
|
||||
"palignr $0x8,%%xmm7,%%xmm7 \n"
|
||||
"palignr $0x8,%%xmm15,%%xmm15 \n"
|
||||
"neg %3 \n"
|
||||
// Second round of bit swap.
|
||||
"punpcklwd %%xmm2,%%xmm0 \n"
|
||||
"punpcklwd %%xmm3,%%xmm1 \n"
|
||||
"movdqa %%xmm0,%%xmm2 \n"
|
||||
"movdqa %%xmm1,%%xmm3 \n"
|
||||
"palignr $0x8,%%xmm2,%%xmm2 \n"
|
||||
"palignr $0x8,%%xmm3,%%xmm3 \n"
|
||||
"punpcklwd %%xmm6,%%xmm4 \n"
|
||||
"punpcklwd %%xmm7,%%xmm5 \n"
|
||||
"movdqa %%xmm4,%%xmm6 \n"
|
||||
"movdqa %%xmm5,%%xmm7 \n"
|
||||
"palignr $0x8,%%xmm6,%%xmm6 \n"
|
||||
"palignr $0x8,%%xmm7,%%xmm7 \n"
|
||||
"punpcklwd %%xmm10,%%xmm8 \n"
|
||||
"punpcklwd %%xmm11,%%xmm9 \n"
|
||||
"movdqa %%xmm8,%%xmm10 \n"
|
||||
"movdqa %%xmm9,%%xmm11 \n"
|
||||
"palignr $0x8,%%xmm10,%%xmm10 \n"
|
||||
"palignr $0x8,%%xmm11,%%xmm11 \n"
|
||||
"punpcklwd %%xmm14,%%xmm12 \n"
|
||||
"punpcklwd %%xmm15,%%xmm13 \n"
|
||||
"movdqa %%xmm12,%%xmm14 \n"
|
||||
"movdqa %%xmm13,%%xmm15 \n"
|
||||
"palignr $0x8,%%xmm14,%%xmm14 \n"
|
||||
"palignr $0x8,%%xmm15,%%xmm15 \n"
|
||||
// Third round of bit swap.
|
||||
// Write to the destination pointer.
|
||||
"punpckldq %%xmm4,%%xmm0 \n"
|
||||
"movq %%xmm0,(%1) \n"
|
||||
"movdqa %%xmm0,%%xmm4 \n"
|
||||
"palignr $0x8,%%xmm4,%%xmm4 \n"
|
||||
"movq %%xmm4,(%1,%4) \n"
|
||||
"lea (%1,%4,2),%1 \n"
|
||||
"punpckldq %%xmm6,%%xmm2 \n"
|
||||
"movdqa %%xmm2,%%xmm6 \n"
|
||||
"movq %%xmm2,(%1) \n"
|
||||
"palignr $0x8,%%xmm6,%%xmm6 \n"
|
||||
"punpckldq %%xmm5,%%xmm1 \n"
|
||||
"movq %%xmm6,(%1,%4) \n"
|
||||
"lea (%1,%4,2),%1 \n"
|
||||
"movdqa %%xmm1,%%xmm5 \n"
|
||||
"movq %%xmm1,(%1) \n"
|
||||
"palignr $0x8,%%xmm5,%%xmm5 \n"
|
||||
"movq %%xmm5,(%1,%4) \n"
|
||||
"lea (%1,%4,2),%1 \n"
|
||||
"punpckldq %%xmm7,%%xmm3 \n"
|
||||
"movq %%xmm3,(%1) \n"
|
||||
"movdqa %%xmm3,%%xmm7 \n"
|
||||
"palignr $0x8,%%xmm7,%%xmm7 \n"
|
||||
"movq %%xmm7,(%1,%4) \n"
|
||||
"lea (%1,%4,2),%1 \n"
|
||||
"punpckldq %%xmm12,%%xmm8 \n"
|
||||
"movq %%xmm8,(%1) \n"
|
||||
"movdqa %%xmm8,%%xmm12 \n"
|
||||
"palignr $0x8,%%xmm12,%%xmm12 \n"
|
||||
"movq %%xmm12,(%1,%4) \n"
|
||||
"lea (%1,%4,2),%1 \n"
|
||||
"punpckldq %%xmm14,%%xmm10 \n"
|
||||
"movdqa %%xmm10,%%xmm14 \n"
|
||||
"movq %%xmm10,(%1) \n"
|
||||
"palignr $0x8,%%xmm14,%%xmm14 \n"
|
||||
"punpckldq %%xmm13,%%xmm9 \n"
|
||||
"movq %%xmm14,(%1,%4) \n"
|
||||
"lea (%1,%4,2),%1 \n"
|
||||
"movdqa %%xmm9,%%xmm13 \n"
|
||||
"movq %%xmm9,(%1) \n"
|
||||
"palignr $0x8,%%xmm13,%%xmm13 \n"
|
||||
"movq %%xmm13,(%1,%4) \n"
|
||||
"lea (%1,%4,2),%1 \n"
|
||||
"punpckldq %%xmm15,%%xmm11 \n"
|
||||
"movq %%xmm11,(%1) \n"
|
||||
"movdqa %%xmm11,%%xmm15 \n"
|
||||
"palignr $0x8,%%xmm15,%%xmm15 \n"
|
||||
"sub $0x10,%2 \n"
|
||||
"movq %%xmm15,(%1,%4) \n"
|
||||
"lea (%1,%4,2),%1 \n"
|
||||
"jg 1b \n"
|
||||
: "+r"(src), // %0
|
||||
"+r"(dst), // %1
|
||||
"+r"(width) // %2
|
||||
: "r"((intptr_t)(src_stride)), // %3
|
||||
"r"((intptr_t)(dst_stride)) // %4
|
||||
: "memory", "cc",
|
||||
"xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7",
|
||||
"xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15"
|
||||
);
|
||||
}
|
||||
|
||||
void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
|
||||
uint8* dst_a, int dst_stride_a,
|
||||
uint8* dst_b, int dst_stride_b, int width) {
|
||||
asm volatile (
|
||||
// Read in the data from the source pointer.
|
||||
// First round of bit swap.
|
||||
".p2align 2 \n"
|
||||
"1: \n"
|
||||
"movdqu (%0),%%xmm0 \n"
|
||||
"movdqu (%0,%4),%%xmm1 \n"
|
||||
"lea (%0,%4,2),%0 \n"
|
||||
"movdqa %%xmm0,%%xmm8 \n"
|
||||
"punpcklbw %%xmm1,%%xmm0 \n"
|
||||
"punpckhbw %%xmm1,%%xmm8 \n"
|
||||
"movdqa %%xmm8,%%xmm1 \n"
|
||||
"movdqu (%0),%%xmm2 \n"
|
||||
"movdqu (%0,%4),%%xmm3 \n"
|
||||
"lea (%0,%4,2),%0 \n"
|
||||
"movdqa %%xmm2,%%xmm8 \n"
|
||||
"punpcklbw %%xmm3,%%xmm2 \n"
|
||||
"punpckhbw %%xmm3,%%xmm8 \n"
|
||||
"movdqa %%xmm8,%%xmm3 \n"
|
||||
"movdqu (%0),%%xmm4 \n"
|
||||
"movdqu (%0,%4),%%xmm5 \n"
|
||||
"lea (%0,%4,2),%0 \n"
|
||||
"movdqa %%xmm4,%%xmm8 \n"
|
||||
"punpcklbw %%xmm5,%%xmm4 \n"
|
||||
"punpckhbw %%xmm5,%%xmm8 \n"
|
||||
"movdqa %%xmm8,%%xmm5 \n"
|
||||
"movdqu (%0),%%xmm6 \n"
|
||||
"movdqu (%0,%4),%%xmm7 \n"
|
||||
"lea (%0,%4,2),%0 \n"
|
||||
"movdqa %%xmm6,%%xmm8 \n"
|
||||
"punpcklbw %%xmm7,%%xmm6 \n"
|
||||
"neg %4 \n"
|
||||
"lea 0x10(%0,%4,8),%0 \n"
|
||||
"punpckhbw %%xmm7,%%xmm8 \n"
|
||||
"movdqa %%xmm8,%%xmm7 \n"
|
||||
"neg %4 \n"
|
||||
// Second round of bit swap.
|
||||
"movdqa %%xmm0,%%xmm8 \n"
|
||||
"movdqa %%xmm1,%%xmm9 \n"
|
||||
"punpckhwd %%xmm2,%%xmm8 \n"
|
||||
"punpckhwd %%xmm3,%%xmm9 \n"
|
||||
"punpcklwd %%xmm2,%%xmm0 \n"
|
||||
"punpcklwd %%xmm3,%%xmm1 \n"
|
||||
"movdqa %%xmm8,%%xmm2 \n"
|
||||
"movdqa %%xmm9,%%xmm3 \n"
|
||||
"movdqa %%xmm4,%%xmm8 \n"
|
||||
"movdqa %%xmm5,%%xmm9 \n"
|
||||
"punpckhwd %%xmm6,%%xmm8 \n"
|
||||
"punpckhwd %%xmm7,%%xmm9 \n"
|
||||
"punpcklwd %%xmm6,%%xmm4 \n"
|
||||
"punpcklwd %%xmm7,%%xmm5 \n"
|
||||
"movdqa %%xmm8,%%xmm6 \n"
|
||||
"movdqa %%xmm9,%%xmm7 \n"
|
||||
// Third round of bit swap.
|
||||
// Write to the destination pointer.
|
||||
"movdqa %%xmm0,%%xmm8 \n"
|
||||
"punpckldq %%xmm4,%%xmm0 \n"
|
||||
"movlpd %%xmm0,(%1) \n" // Write back U channel
|
||||
"movhpd %%xmm0,(%2) \n" // Write back V channel
|
||||
"punpckhdq %%xmm4,%%xmm8 \n"
|
||||
"movlpd %%xmm8,(%1,%5) \n"
|
||||
"lea (%1,%5,2),%1 \n"
|
||||
"movhpd %%xmm8,(%2,%6) \n"
|
||||
"lea (%2,%6,2),%2 \n"
|
||||
"movdqa %%xmm2,%%xmm8 \n"
|
||||
"punpckldq %%xmm6,%%xmm2 \n"
|
||||
"movlpd %%xmm2,(%1) \n"
|
||||
"movhpd %%xmm2,(%2) \n"
|
||||
"punpckhdq %%xmm6,%%xmm8 \n"
|
||||
"movlpd %%xmm8,(%1,%5) \n"
|
||||
"lea (%1,%5,2),%1 \n"
|
||||
"movhpd %%xmm8,(%2,%6) \n"
|
||||
"lea (%2,%6,2),%2 \n"
|
||||
"movdqa %%xmm1,%%xmm8 \n"
|
||||
"punpckldq %%xmm5,%%xmm1 \n"
|
||||
"movlpd %%xmm1,(%1) \n"
|
||||
"movhpd %%xmm1,(%2) \n"
|
||||
"punpckhdq %%xmm5,%%xmm8 \n"
|
||||
"movlpd %%xmm8,(%1,%5) \n"
|
||||
"lea (%1,%5,2),%1 \n"
|
||||
"movhpd %%xmm8,(%2,%6) \n"
|
||||
"lea (%2,%6,2),%2 \n"
|
||||
"movdqa %%xmm3,%%xmm8 \n"
|
||||
"punpckldq %%xmm7,%%xmm3 \n"
|
||||
"movlpd %%xmm3,(%1) \n"
|
||||
"movhpd %%xmm3,(%2) \n"
|
||||
"punpckhdq %%xmm7,%%xmm8 \n"
|
||||
"sub $0x8,%3 \n"
|
||||
"movlpd %%xmm8,(%1,%5) \n"
|
||||
"lea (%1,%5,2),%1 \n"
|
||||
"movhpd %%xmm8,(%2,%6) \n"
|
||||
"lea (%2,%6,2),%2 \n"
|
||||
"jg 1b \n"
|
||||
: "+r"(src), // %0
|
||||
"+r"(dst_a), // %1
|
||||
"+r"(dst_b), // %2
|
||||
"+r"(width) // %3
|
||||
: "r"((intptr_t)(src_stride)), // %4
|
||||
"r"((intptr_t)(dst_stride_a)), // %5
|
||||
"r"((intptr_t)(dst_stride_b)) // %6
|
||||
: "memory", "cc",
|
||||
"xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7",
|
||||
"xmm8", "xmm9"
|
||||
);
|
||||
}
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#endif // defined(__x86_64__) || defined(__i386__)
|
||||
|
||||
#ifdef __cplusplus
|
||||
} // extern "C"
|
||||
} // namespace libyuv
|
||||
#endif
|
9  third_party/libyuv/source/rotate_mips.cc  vendored
@ -9,6 +9,7 @@
|
||||
*/
|
||||
|
||||
#include "libyuv/row.h"
|
||||
#include "libyuv/rotate_row.h"
|
||||
|
||||
#include "libyuv/basic_types.h"
|
||||
|
||||
@ -22,8 +23,7 @@ extern "C" {
|
||||
(_MIPS_SIM == _MIPS_SIM_ABI32)
|
||||
|
||||
void TransposeWx8_MIPS_DSPR2(const uint8* src, int src_stride,
|
||||
uint8* dst, int dst_stride,
|
||||
int width) {
|
||||
uint8* dst, int dst_stride, int width) {
|
||||
__asm__ __volatile__ (
|
||||
".set push \n"
|
||||
".set noreorder \n"
|
||||
@ -106,9 +106,8 @@ void TransposeWx8_MIPS_DSPR2(const uint8* src, int src_stride,
|
||||
);
|
||||
}
|
||||
|
||||
void TransposeWx8_FAST_MIPS_DSPR2(const uint8* src, int src_stride,
|
||||
uint8* dst, int dst_stride,
|
||||
int width) {
|
||||
void TransposeWx8_Fast_MIPS_DSPR2(const uint8* src, int src_stride,
|
||||
uint8* dst, int dst_stride, int width) {
|
||||
__asm__ __volatile__ (
|
||||
".set noat \n"
|
||||
".set push \n"
|
||||
|
1  third_party/libyuv/source/rotate_neon.cc  vendored
@ -9,6 +9,7 @@
|
||||
*/
|
||||
|
||||
#include "libyuv/row.h"
|
||||
#include "libyuv/rotate_row.h"
|
||||
|
||||
#include "libyuv/basic_types.h"
|
||||
|
||||
|
6  third_party/libyuv/source/rotate_neon64.cc  vendored
@ -9,6 +9,7 @@
|
||||
*/
|
||||
|
||||
#include "libyuv/row.h"
|
||||
#include "libyuv/rotate_row.h"
|
||||
|
||||
#include "libyuv/basic_types.h"
|
||||
|
||||
@ -21,11 +22,10 @@ extern "C" {
|
||||
#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
|
||||
|
||||
static uvec8 kVTbl4x4Transpose =
|
||||
{ 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15 };
|
||||
{ 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15 };
|
||||
|
||||
void TransposeWx8_NEON(const uint8* src, int src_stride,
|
||||
uint8* dst, int dst_stride,
|
||||
int width) {
|
||||
uint8* dst, int dst_stride, int width) {
|
||||
const uint8* src_temp = NULL;
|
||||
int64 width64 = (int64) width; // Work around clang 3.4 warning.
|
||||
asm volatile (
|
||||
|
248  third_party/libyuv/source/rotate_win.cc  vendored  Normal file
@ -0,0 +1,248 @@
|
||||
/*
|
||||
* Copyright 2013 The LibYuv Project Authors. All rights reserved.
|
||||
*
|
||||
* Use of this source code is governed by a BSD-style license
|
||||
* that can be found in the LICENSE file in the root of the source
|
||||
* tree. An additional intellectual property rights grant can be found
|
||||
* in the file PATENTS. All contributing project authors may
|
||||
* be found in the AUTHORS file in the root of the source tree.
|
||||
*/
|
||||
|
||||
#include "libyuv/row.h"
|
||||
#include "libyuv/rotate_row.h"
|
||||
|
||||
#ifdef __cplusplus
|
||||
namespace libyuv {
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
// This module is for Visual C x86.
|
||||
#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && \
|
||||
defined(_MSC_VER) && !defined(__clang__)
|
||||
|
||||
__declspec(naked)
|
||||
void TransposeWx8_SSSE3(const uint8* src, int src_stride,
|
||||
uint8* dst, int dst_stride, int width) {
|
||||
__asm {
|
||||
push edi
|
||||
push esi
|
||||
push ebp
|
||||
mov eax, [esp + 12 + 4] // src
|
||||
mov edi, [esp + 12 + 8] // src_stride
|
||||
mov edx, [esp + 12 + 12] // dst
|
||||
mov esi, [esp + 12 + 16] // dst_stride
|
||||
mov ecx, [esp + 12 + 20] // width
|
||||
|
||||
// Read in the data from the source pointer.
|
||||
// First round of bit swap.
|
||||
align 4
|
||||
convertloop:
|
||||
movq xmm0, qword ptr [eax]
|
||||
lea ebp, [eax + 8]
|
||||
movq xmm1, qword ptr [eax + edi]
|
||||
lea eax, [eax + 2 * edi]
|
||||
punpcklbw xmm0, xmm1
|
||||
movq xmm2, qword ptr [eax]
|
||||
movdqa xmm1, xmm0
|
||||
palignr xmm1, xmm1, 8
|
||||
movq xmm3, qword ptr [eax + edi]
|
||||
lea eax, [eax + 2 * edi]
|
||||
punpcklbw xmm2, xmm3
|
||||
movdqa xmm3, xmm2
|
||||
movq xmm4, qword ptr [eax]
|
||||
palignr xmm3, xmm3, 8
|
||||
movq xmm5, qword ptr [eax + edi]
|
||||
punpcklbw xmm4, xmm5
|
||||
lea eax, [eax + 2 * edi]
|
||||
movdqa xmm5, xmm4
|
||||
movq xmm6, qword ptr [eax]
|
||||
palignr xmm5, xmm5, 8
|
||||
movq xmm7, qword ptr [eax + edi]
|
||||
punpcklbw xmm6, xmm7
|
||||
mov eax, ebp
|
||||
movdqa xmm7, xmm6
|
||||
palignr xmm7, xmm7, 8
|
||||
// Second round of bit swap.
|
||||
punpcklwd xmm0, xmm2
|
||||
punpcklwd xmm1, xmm3
|
||||
movdqa xmm2, xmm0
|
||||
movdqa xmm3, xmm1
|
||||
palignr xmm2, xmm2, 8
|
||||
palignr xmm3, xmm3, 8
|
||||
punpcklwd xmm4, xmm6
|
||||
punpcklwd xmm5, xmm7
|
||||
movdqa xmm6, xmm4
|
||||
movdqa xmm7, xmm5
|
||||
palignr xmm6, xmm6, 8
|
||||
palignr xmm7, xmm7, 8
|
||||
// Third round of bit swap.
|
||||
// Write to the destination pointer.
|
||||
punpckldq xmm0, xmm4
|
||||
movq qword ptr [edx], xmm0
|
||||
movdqa xmm4, xmm0
|
||||
palignr xmm4, xmm4, 8
|
||||
movq qword ptr [edx + esi], xmm4
|
||||
lea edx, [edx + 2 * esi]
|
||||
punpckldq xmm2, xmm6
|
||||
movdqa xmm6, xmm2
|
||||
palignr xmm6, xmm6, 8
|
||||
movq qword ptr [edx], xmm2
|
||||
punpckldq xmm1, xmm5
|
||||
movq qword ptr [edx + esi], xmm6
|
||||
lea edx, [edx + 2 * esi]
|
||||
movdqa xmm5, xmm1
|
||||
movq qword ptr [edx], xmm1
|
||||
palignr xmm5, xmm5, 8
|
||||
punpckldq xmm3, xmm7
|
||||
movq qword ptr [edx + esi], xmm5
|
||||
lea edx, [edx + 2 * esi]
|
||||
movq qword ptr [edx], xmm3
|
||||
movdqa xmm7, xmm3
|
||||
palignr xmm7, xmm7, 8
|
||||
sub ecx, 8
|
||||
movq qword ptr [edx + esi], xmm7
|
||||
lea edx, [edx + 2 * esi]
|
||||
jg convertloop
|
||||
|
||||
pop ebp
|
||||
pop esi
|
||||
pop edi
|
||||
ret
|
||||
}
|
||||
}
|
||||
|
||||
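The routine above transposes an 8x8 tile of bytes with three rounds of interleaves (punpcklbw on bytes, then punpcklwd on 16-bit pairs, then punpckldq on 32-bit groups) before storing eight 8-byte rows. As a reference for what it computes, a plain-C sketch of the same src/dst/stride contract follows; the function name and the scalar loop are illustrative only, not the fallback libyuv actually ships.

#include <stdint.h>

// Scalar reference: each of the 8 source rows becomes a destination
// column, i.e. dst[x * dst_stride + y] = src[y * src_stride + x].
static void TransposeWx8_Reference(const uint8_t* src, int src_stride,
                                   uint8_t* dst, int dst_stride, int width) {
  int x, y;
  for (x = 0; x < width; ++x) {  // one 8-byte output row per source column
    for (y = 0; y < 8; ++y) {
      dst[x * dst_stride + y] = src[y * src_stride + x];
    }
  }
}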
__declspec(naked)
|
||||
void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
|
||||
uint8* dst_a, int dst_stride_a,
|
||||
uint8* dst_b, int dst_stride_b,
|
||||
int w) {
|
||||
__asm {
|
||||
push ebx
|
||||
push esi
|
||||
push edi
|
||||
push ebp
|
||||
mov eax, [esp + 16 + 4] // src
|
||||
mov edi, [esp + 16 + 8] // src_stride
|
||||
mov edx, [esp + 16 + 12] // dst_a
|
||||
mov esi, [esp + 16 + 16] // dst_stride_a
|
||||
mov ebx, [esp + 16 + 20] // dst_b
|
||||
mov ebp, [esp + 16 + 24] // dst_stride_b
|
||||
mov ecx, esp
|
||||
sub esp, 4 + 16
|
||||
and esp, ~15
|
||||
mov [esp + 16], ecx
|
||||
mov ecx, [ecx + 16 + 28] // w
|
||||
|
||||
align 4
|
||||
convertloop:
|
||||
// Read in the data from the source pointer.
|
||||
// First round of bit swap.
|
||||
movdqu xmm0, [eax]
|
||||
movdqu xmm1, [eax + edi]
|
||||
lea eax, [eax + 2 * edi]
|
||||
movdqa xmm7, xmm0 // use xmm7 as temp register.
|
||||
punpcklbw xmm0, xmm1
|
||||
punpckhbw xmm7, xmm1
|
||||
movdqa xmm1, xmm7
|
||||
movdqu xmm2, [eax]
|
||||
movdqu xmm3, [eax + edi]
|
||||
lea eax, [eax + 2 * edi]
|
||||
movdqa xmm7, xmm2
|
||||
punpcklbw xmm2, xmm3
|
||||
punpckhbw xmm7, xmm3
|
||||
movdqa xmm3, xmm7
|
||||
movdqu xmm4, [eax]
|
||||
movdqu xmm5, [eax + edi]
|
||||
lea eax, [eax + 2 * edi]
|
||||
movdqa xmm7, xmm4
|
||||
punpcklbw xmm4, xmm5
|
||||
punpckhbw xmm7, xmm5
|
||||
movdqa xmm5, xmm7
|
||||
movdqu xmm6, [eax]
|
||||
movdqu xmm7, [eax + edi]
|
||||
lea eax, [eax + 2 * edi]
|
||||
movdqu [esp], xmm5 // backup xmm5
|
||||
neg edi
|
||||
movdqa xmm5, xmm6 // use xmm5 as temp register.
|
||||
punpcklbw xmm6, xmm7
|
||||
punpckhbw xmm5, xmm7
|
||||
movdqa xmm7, xmm5
|
||||
lea eax, [eax + 8 * edi + 16]
|
||||
neg edi
|
||||
// Second round of bit swap.
|
||||
movdqa xmm5, xmm0
|
||||
punpcklwd xmm0, xmm2
|
||||
punpckhwd xmm5, xmm2
|
||||
movdqa xmm2, xmm5
|
||||
movdqa xmm5, xmm1
|
||||
punpcklwd xmm1, xmm3
|
||||
punpckhwd xmm5, xmm3
|
||||
movdqa xmm3, xmm5
|
||||
movdqa xmm5, xmm4
|
||||
punpcklwd xmm4, xmm6
|
||||
punpckhwd xmm5, xmm6
|
||||
movdqa xmm6, xmm5
|
||||
movdqu xmm5, [esp] // restore xmm5
|
||||
movdqu [esp], xmm6 // backup xmm6
|
||||
movdqa xmm6, xmm5 // use xmm6 as temp register.
|
||||
punpcklwd xmm5, xmm7
|
||||
punpckhwd xmm6, xmm7
|
||||
movdqa xmm7, xmm6
|
||||
// Third round of bit swap.
|
||||
// Write to the destination pointer.
|
||||
movdqa xmm6, xmm0
|
||||
punpckldq xmm0, xmm4
|
||||
punpckhdq xmm6, xmm4
|
||||
movdqa xmm4, xmm6
|
||||
movdqu xmm6, [esp] // restore xmm6
|
||||
movlpd qword ptr [edx], xmm0
|
||||
movhpd qword ptr [ebx], xmm0
|
||||
movlpd qword ptr [edx + esi], xmm4
|
||||
lea edx, [edx + 2 * esi]
|
||||
movhpd qword ptr [ebx + ebp], xmm4
|
||||
lea ebx, [ebx + 2 * ebp]
|
||||
movdqa xmm0, xmm2 // use xmm0 as the temp register.
|
||||
punpckldq xmm2, xmm6
|
||||
movlpd qword ptr [edx], xmm2
|
||||
movhpd qword ptr [ebx], xmm2
|
||||
punpckhdq xmm0, xmm6
|
||||
movlpd qword ptr [edx + esi], xmm0
|
||||
lea edx, [edx + 2 * esi]
|
||||
movhpd qword ptr [ebx + ebp], xmm0
|
||||
lea ebx, [ebx + 2 * ebp]
|
||||
movdqa xmm0, xmm1 // use xmm0 as the temp register.
|
||||
punpckldq xmm1, xmm5
|
||||
movlpd qword ptr [edx], xmm1
|
||||
movhpd qword ptr [ebx], xmm1
|
||||
punpckhdq xmm0, xmm5
|
||||
movlpd qword ptr [edx + esi], xmm0
|
||||
lea edx, [edx + 2 * esi]
|
||||
movhpd qword ptr [ebx + ebp], xmm0
|
||||
lea ebx, [ebx + 2 * ebp]
|
||||
movdqa xmm0, xmm3 // use xmm0 as the temp register.
|
||||
punpckldq xmm3, xmm7
|
||||
movlpd qword ptr [edx], xmm3
|
||||
movhpd qword ptr [ebx], xmm3
|
||||
punpckhdq xmm0, xmm7
|
||||
sub ecx, 8
|
||||
movlpd qword ptr [edx + esi], xmm0
|
||||
lea edx, [edx + 2 * esi]
|
||||
movhpd qword ptr [ebx + ebp], xmm0
|
||||
lea ebx, [ebx + 2 * ebp]
|
||||
jg convertloop
|
||||
|
||||
mov esp, [esp + 16]
|
||||
pop ebp
|
||||
pop edi
|
||||
pop esi
|
||||
pop ebx
|
||||
ret
|
||||
}
|
||||
}
|
||||
|
||||
#endif // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86)
|
||||
|
||||
#ifdef __cplusplus
|
||||
} // extern "C"
|
||||
} // namespace libyuv
|
||||
#endif
|
1145
third_party/libyuv/source/row_any.cc
vendored
File diff suppressed because it is too large
227
third_party/libyuv/source/row_common.cc
vendored
@ -199,28 +199,36 @@ void ARGBToRGB565Row_C(const uint8* src_argb, uint8* dst_rgb, int width) {
  }
}

// dither4 is a row of 4 values from 4x4 dither matrix.
// The 4x4 matrix contains values to increase RGB. When converting to
// fewer bits (565) this provides an ordered dither.
// The order in the 4x4 matrix in first byte is upper left.
// The 4 values are passed as an int, then referenced as an array, so
// endian will not affect order of the original matrix. But the dither4
// will containing the first pixel in the lower byte for little endian
// or the upper byte for big endian.
void ARGBToRGB565DitherRow_C(const uint8* src_argb, uint8* dst_rgb,
                             const uint8* dither8x8, int width) {
                             const uint32 dither4, int width) {
  int x;
  for (x = 0; x < width - 1; x += 2) {
    int dither0 = dither8x8[x & 7] - 128;
    int dither1 = dither8x8[(x & 7) + 1] - 128;
    uint8 b0 = Clamp(src_argb[0] + dither0) >> 3;
    uint8 g0 = Clamp(src_argb[1] + dither0) >> 2;
    uint8 r0 = Clamp(src_argb[2] + dither0) >> 3;
    uint8 b1 = Clamp(src_argb[4] + dither1) >> 3;
    uint8 g1 = Clamp(src_argb[5] + dither1) >> 2;
    uint8 r1 = Clamp(src_argb[6] + dither1) >> 3;
    int dither0 = ((const unsigned char*)(&dither4))[x & 3];
    int dither1 = ((const unsigned char*)(&dither4))[(x + 1) & 3];
    uint8 b0 = clamp255(src_argb[0] + dither0) >> 3;
    uint8 g0 = clamp255(src_argb[1] + dither0) >> 2;
    uint8 r0 = clamp255(src_argb[2] + dither0) >> 3;
    uint8 b1 = clamp255(src_argb[4] + dither1) >> 3;
    uint8 g1 = clamp255(src_argb[5] + dither1) >> 2;
    uint8 r1 = clamp255(src_argb[6] + dither1) >> 3;
    WRITEWORD(dst_rgb, b0 | (g0 << 5) | (r0 << 11) |
              (b1 << 16) | (g1 << 21) | (r1 << 27));
    dst_rgb += 4;
    src_argb += 8;
  }
  if (width & 1) {
    int dither0 = dither8x8[(width - 1) & 7] - 128;
    uint8 b0 = Clamp(src_argb[0] + dither0) >> 3;
    uint8 g0 = Clamp(src_argb[1] + dither0) >> 2;
    uint8 r0 = Clamp(src_argb[2] + dither0) >> 3;
    int dither0 = ((const unsigned char*)(&dither4))[(width - 1) & 3];
    uint8 b0 = clamp255(src_argb[0] + dither0) >> 3;
    uint8 g0 = clamp255(src_argb[1] + dither0) >> 2;
    uint8 r0 = clamp255(src_argb[2] + dither0) >> 3;
    *(uint16*)(dst_rgb) = b0 | (g0 << 5) | (r0 << 11);
  }
}
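The dither4 argument introduced above packs one row of a 4x4 ordered-dither matrix into a uint32, one byte per pixel column, indexed little-endian as ((const unsigned char*)(&dither4))[x & 3]. A minimal sketch of that packing and of the per-channel add/clamp/truncate step follows; PackDither4, DitherTo5Bits and the matrix row values are illustrative assumptions, not part of the libyuv API.

#include <stdint.h>

// Hypothetical helper: pack one dither-matrix row into a uint32 with the
// first pixel column in the lowest byte (matches little-endian indexing).
static uint32_t PackDither4(uint8_t d0, uint8_t d1, uint8_t d2, uint8_t d3) {
  return (uint32_t)d0 | ((uint32_t)d1 << 8) |
         ((uint32_t)d2 << 16) | ((uint32_t)d3 << 24);
}

// Per-channel step for RGB565: add the dither value, clamp to 255, then
// keep the top 5 bits (blue/red); green would keep 6 bits (>> 2).
static uint8_t DitherTo5Bits(uint8_t channel, uint8_t dither) {
  int v = channel + dither;
  if (v > 255) v = 255;  // same effect as clamp255()
  return (uint8_t)(v >> 3);
}

int main(void) {
  uint32_t dither4 = PackDither4(0, 4, 1, 5);            // assumed matrix row
  uint8_t d = ((const unsigned char*)&dither4)[2 & 3];   // column 2 -> 1
  return DitherTo5Bits(200, d);                          // (200 + 1) >> 3 == 25
}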
@ -974,7 +982,7 @@ void SobelXYRow_C(const uint8* src_sobelx, const uint8* src_sobely,
|
||||
}
|
||||
}
|
||||
|
||||
void I400ToARGBRow_C(const uint8* src_y, uint8* dst_argb, int width) {
|
||||
void J400ToARGBRow_C(const uint8* src_y, uint8* dst_argb, int width) {
|
||||
// Copy a Y to RGB.
|
||||
int x;
|
||||
for (x = 0; x < width; ++x) {
|
||||
@ -986,38 +994,42 @@ void I400ToARGBRow_C(const uint8* src_y, uint8* dst_argb, int width) {
  }
}

// YUV to RGB conversion constants.
// BT.601 YUV to RGB reference
// R = (Y - 16) * 1.164 - V * -1.596
// G = (Y - 16) * 1.164 - U * 0.391 - V * 0.813
// B = (Y - 16) * 1.164 - U * -2.018

// Y contribution to R,G,B. Scale and bias.
// TODO(fbarchard): Consider moving constants into a common header.
#define YG 18997 /* round(1.164 * 64 * 256 * 256 / 257) */
#define YGB 1160 /* 1.164 * 64 * 16 - adjusted for even error distribution */
#define YGB -1160 /* 1.164 * 64 * -16 + 64 / 2 */

// U and V contributions to R,G,B.
#define UB -128 /* -min(128, round(2.018 * 64)) */
#define UG 25 /* -round(-0.391 * 64) */
#define VG 52 /* -round(-0.813 * 64) */
#define VR -102 /* -round(1.596 * 64) */
#define UB -128 /* max(-128, round(-2.018 * 64)) */
#define UG 25 /* round(0.391 * 64) */
#define VG 52 /* round(0.813 * 64) */
#define VR -102 /* round(-1.596 * 64) */

// Bias values to subtract 16 from Y and 128 from U and V.
#define BB (UB * 128 - YGB)
#define BG (UG * 128 + VG * 128 - YGB)
#define BR (VR * 128 - YGB)
#define BB (UB * 128 + YGB)
#define BG (UG * 128 + VG * 128 + YGB)
#define BR (VR * 128 + YGB)

// C reference code that mimics the YUV assembly.
static __inline void YuvPixel(uint8 y, uint8 u, uint8 v,
                              uint8* b, uint8* g, uint8* r) {
  uint32 y1 = (uint32)(y * 0x0101 * YG) >> 16;
  *b = Clamp((int32)(BB - ( u * UB) + y1) >> 6);
  *g = Clamp((int32)(BG - (v * VG + u * UG) + y1) >> 6);
  *r = Clamp((int32)(BR - (v * VR ) + y1) >> 6);
  *b = Clamp((int32)(-(u * UB) + y1 + BB) >> 6);
  *g = Clamp((int32)(-(v * VG + u * UG) + y1 + BG) >> 6);
  *r = Clamp((int32)(-(v * VR) + y1 + BR) >> 6);
}

// C reference code that mimics the YUV assembly.
static __inline void YPixel(uint8 y, uint8* b, uint8* g, uint8* r) {
  uint32 y1 = (uint32)(y * 0x0101 * YG) >> 16;
  *b = Clamp((int32)(y1 - YGB) >> 6);
  *g = Clamp((int32)(y1 - YGB) >> 6);
  *r = Clamp((int32)(y1 - YGB) >> 6);
  *b = Clamp((int32)(y1 + YGB) >> 6);
  *g = Clamp((int32)(y1 + YGB) >> 6);
  *r = Clamp((int32)(y1 + YGB) >> 6);
}

#undef YG
@ -1030,6 +1042,46 @@ static __inline void YPixel(uint8 y, uint8* b, uint8* g, uint8* r) {
#undef BG
#undef BR

// JPEG YUV to RGB reference
// * R = Y - V * -1.40200
// * G = Y - U * 0.34414 - V * 0.71414
// * B = Y - U * -1.77200

// Y contribution to R,G,B. Scale and bias.
// TODO(fbarchard): Consider moving constants into a common header.
#define YGJ 16320 /* round(1.000 * 64 * 256 * 256 / 257) */
#define YGBJ 32 /* 64 / 2 */

// U and V contributions to R,G,B.
#define UBJ -113 /* round(-1.77200 * 64) */
#define UGJ 22 /* round(0.34414 * 64) */
#define VGJ 46 /* round(0.71414 * 64) */
#define VRJ -90 /* round(-1.40200 * 64) */

// Bias values to subtract 16 from Y and 128 from U and V.
#define BBJ (UBJ * 128 + YGBJ)
#define BGJ (UGJ * 128 + VGJ * 128 + YGBJ)
#define BRJ (VRJ * 128 + YGBJ)

// C reference code that mimics the YUV assembly.
static __inline void YuvJPixel(uint8 y, uint8 u, uint8 v,
                               uint8* b, uint8* g, uint8* r) {
  uint32 y1 = (uint32)(y * 0x0101 * YGJ) >> 16;
  *b = Clamp((int32)(-(u * UBJ) + y1 + BBJ) >> 6);
  *g = Clamp((int32)(-(v * VGJ + u * UGJ) + y1 + BGJ) >> 6);
  *r = Clamp((int32)(-(v * VRJ) + y1 + BRJ) >> 6);
}

#undef YGJ
#undef YGBJ
#undef UBJ
#undef UGJ
#undef VGJ
#undef VRJ
#undef BBJ
#undef BGJ
#undef BRJ

#if !defined(LIBYUV_DISABLE_NEON) && \
    (defined(__ARM_NEON__) || defined(__aarch64__) || defined(LIBYUV_NEON))
// C mimic assembly.
|
||||
}
|
||||
}
|
||||
|
||||
// C reference code that mimics the YUV assembly.
|
||||
// * R = Y + 1.40200 * Cr
|
||||
// * G = Y - 0.34414 * Cb - 0.71414 * Cr
|
||||
// * B = Y + 1.77200 * Cb
|
||||
|
||||
#define YGJ 64 /* (int8)round(1.000 * 64) */
|
||||
|
||||
#define UBJ 113 /* (int8)round(1.772 * 64) */
|
||||
#define UGJ -22 /* (int8)round(-0.34414 * 64) */
|
||||
#define URJ 0
|
||||
|
||||
#define VBJ 0
|
||||
#define VGJ -46 /* (int8)round(-0.71414 * 64) */
|
||||
#define VRJ 90 /* (int8)round(1.402 * 64) */
|
||||
|
||||
// Bias
|
||||
#define BBJ (UBJ * 128 + VBJ * 128)
|
||||
#define BGJ (UGJ * 128 + VGJ * 128)
|
||||
#define BRJ (URJ * 128 + VRJ * 128)
|
||||
|
||||
static __inline void YuvJPixel(uint8 y, uint8 u, uint8 v,
|
||||
uint8* b, uint8* g, uint8* r) {
|
||||
uint32 y1 = (uint32)(y * YGJ);
|
||||
*b = Clamp((int32)(u * UBJ + v * VBJ + y1 - BBJ) >> 6);
|
||||
*g = Clamp((int32)(u * UGJ + v * VGJ + y1 - BGJ) >> 6);
|
||||
*r = Clamp((int32)(u * URJ + v * VRJ + y1 - BRJ) >> 6);
|
||||
}
|
||||
|
||||
void J422ToARGBRow_C(const uint8* src_y,
|
||||
const uint8* src_u,
|
||||
const uint8* src_v,
|
||||
@ -1354,23 +1378,23 @@ void I411ToARGBRow_C(const uint8* src_y,
|
||||
}
|
||||
|
||||
void NV12ToARGBRow_C(const uint8* src_y,
|
||||
const uint8* usrc_v,
|
||||
const uint8* src_uv,
|
||||
uint8* rgb_buf,
|
||||
int width) {
|
||||
int x;
|
||||
for (x = 0; x < width - 1; x += 2) {
|
||||
YuvPixel(src_y[0], usrc_v[0], usrc_v[1],
|
||||
YuvPixel(src_y[0], src_uv[0], src_uv[1],
|
||||
rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
|
||||
rgb_buf[3] = 255;
|
||||
YuvPixel(src_y[1], usrc_v[0], usrc_v[1],
|
||||
YuvPixel(src_y[1], src_uv[0], src_uv[1],
|
||||
rgb_buf + 4, rgb_buf + 5, rgb_buf + 6);
|
||||
rgb_buf[7] = 255;
|
||||
src_y += 2;
|
||||
usrc_v += 2;
|
||||
src_uv += 2;
|
||||
rgb_buf += 8; // Advance 2 pixels.
|
||||
}
|
||||
if (width & 1) {
|
||||
YuvPixel(src_y[0], usrc_v[0], usrc_v[1],
|
||||
YuvPixel(src_y[0], src_uv[0], src_uv[1],
|
||||
rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
|
||||
rgb_buf[3] = 255;
|
||||
}
|
||||
@ -1402,7 +1426,7 @@ void NV21ToARGBRow_C(const uint8* src_y,
|
||||
}
|
||||
|
||||
void NV12ToRGB565Row_C(const uint8* src_y,
|
||||
const uint8* usrc_v,
|
||||
const uint8* src_uv,
|
||||
uint8* dst_rgb565,
|
||||
int width) {
|
||||
uint8 b0;
|
||||
@ -1413,8 +1437,8 @@ void NV12ToRGB565Row_C(const uint8* src_y,
|
||||
uint8 r1;
|
||||
int x;
|
||||
for (x = 0; x < width - 1; x += 2) {
|
||||
YuvPixel(src_y[0], usrc_v[0], usrc_v[1], &b0, &g0, &r0);
|
||||
YuvPixel(src_y[1], usrc_v[0], usrc_v[1], &b1, &g1, &r1);
|
||||
YuvPixel(src_y[0], src_uv[0], src_uv[1], &b0, &g0, &r0);
|
||||
YuvPixel(src_y[1], src_uv[0], src_uv[1], &b1, &g1, &r1);
|
||||
b0 = b0 >> 3;
|
||||
g0 = g0 >> 2;
|
||||
r0 = r0 >> 3;
|
||||
@ -1424,11 +1448,11 @@ void NV12ToRGB565Row_C(const uint8* src_y,
|
||||
*(uint32*)(dst_rgb565) = b0 | (g0 << 5) | (r0 << 11) |
|
||||
(b1 << 16) | (g1 << 21) | (r1 << 27);
|
||||
src_y += 2;
|
||||
usrc_v += 2;
|
||||
src_uv += 2;
|
||||
dst_rgb565 += 4; // Advance 2 pixels.
|
||||
}
|
||||
if (width & 1) {
|
||||
YuvPixel(src_y[0], usrc_v[0], usrc_v[1], &b0, &g0, &r0);
|
||||
YuvPixel(src_y[0], src_uv[0], src_uv[1], &b0, &g0, &r0);
|
||||
b0 = b0 >> 3;
|
||||
g0 = g0 >> 2;
|
||||
r0 = r0 >> 3;
|
||||
@ -1588,7 +1612,7 @@ void I422ToRGBARow_C(const uint8* src_y,
|
||||
}
|
||||
}
|
||||
|
||||
void YToARGBRow_C(const uint8* src_y, uint8* rgb_buf, int width) {
|
||||
void I400ToARGBRow_C(const uint8* src_y, uint8* rgb_buf, int width) {
|
||||
int x;
|
||||
for (x = 0; x < width - 1; x += 2) {
|
||||
YPixel(src_y[0], rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
|
||||
@ -2062,22 +2086,6 @@ void InterpolateRow_16_C(uint16* dst_ptr, const uint16* src_ptr,
|
||||
}
|
||||
}
|
||||
|
||||
// Select G channel from ARGB. e.g. GGGGGGGG
|
||||
void ARGBToBayerGGRow_C(const uint8* src_argb,
|
||||
uint8* dst_bayer, uint32 selector, int pix) {
|
||||
// Copy a row of G.
|
||||
int x;
|
||||
for (x = 0; x < pix - 1; x += 2) {
|
||||
dst_bayer[0] = src_argb[1];
|
||||
dst_bayer[1] = src_argb[5];
|
||||
src_argb += 8;
|
||||
dst_bayer += 2;
|
||||
}
|
||||
if (pix & 1) {
|
||||
dst_bayer[0] = src_argb[1];
|
||||
}
|
||||
}
|
||||
|
||||
// Use first 4 shuffler values to reorder ARGB channels.
|
||||
void ARGBShuffleRow_C(const uint8* src_argb, uint8* dst_argb,
|
||||
const uint8* shuffler, int pix) {
|
||||
@ -2120,7 +2128,7 @@ void I422ToYUY2Row_C(const uint8* src_y,
|
||||
if (width & 1) {
|
||||
dst_frame[0] = src_y[0];
|
||||
dst_frame[1] = src_u[0];
|
||||
dst_frame[2] = src_y[0]; // duplicate last y
|
||||
dst_frame[2] = 0;
|
||||
dst_frame[3] = src_v[0];
|
||||
}
|
||||
}
|
||||
@ -2144,14 +2152,15 @@ void I422ToUYVYRow_C(const uint8* src_y,
|
||||
dst_frame[0] = src_u[0];
|
||||
dst_frame[1] = src_y[0];
|
||||
dst_frame[2] = src_v[0];
|
||||
dst_frame[3] = src_y[0]; // duplicate last y
|
||||
dst_frame[3] = 0;
|
||||
}
|
||||
}
|
||||
|
||||
// Maximum temporary width for wrappers to process at a time, in pixels.
#define MAXTWIDTH 2048

#if !defined(_MSC_VER) && defined(HAS_I422TORGB565ROW_SSSE3)
#if !(defined(_MSC_VER) && !defined(__clang__)) && \
    defined(HAS_I422TORGB565ROW_SSSE3)
// row_win.cc has asm version, but GCC uses 2 step wrapper.
void I422ToRGB565Row_SSSE3(const uint8* src_y,
                           const uint8* src_u,
@ -2346,6 +2355,50 @@ void I422ToARGB4444Row_AVX2(const uint8* src_y,
|
||||
}
|
||||
#endif
|
||||
|
||||
#if defined(HAS_I422TORGB24ROW_AVX2)
|
||||
void I422ToRGB24Row_AVX2(const uint8* src_y,
|
||||
const uint8* src_u,
|
||||
const uint8* src_v,
|
||||
uint8* dst_rgb24,
|
||||
int width) {
|
||||
// Row buffer for intermediate ARGB pixels.
|
||||
SIMD_ALIGNED32(uint8 row[MAXTWIDTH * 4]);
|
||||
while (width > 0) {
|
||||
int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
|
||||
I422ToARGBRow_AVX2(src_y, src_u, src_v, row, twidth);
|
||||
// TODO(fbarchard): ARGBToRGB24Row_AVX2
|
||||
ARGBToRGB24Row_SSSE3(row, dst_rgb24, twidth);
|
||||
src_y += twidth;
|
||||
src_u += twidth / 2;
|
||||
src_v += twidth / 2;
|
||||
dst_rgb24 += twidth * 3;
|
||||
width -= twidth;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
#if defined(HAS_I422TORAWROW_AVX2)
|
||||
void I422ToRAWRow_AVX2(const uint8* src_y,
|
||||
const uint8* src_u,
|
||||
const uint8* src_v,
|
||||
uint8* dst_raw,
|
||||
int width) {
|
||||
// Row buffer for intermediate ARGB pixels.
|
||||
SIMD_ALIGNED32(uint8 row[MAXTWIDTH * 4]);
|
||||
while (width > 0) {
|
||||
int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
|
||||
I422ToARGBRow_AVX2(src_y, src_u, src_v, row, twidth);
|
||||
// TODO(fbarchard): ARGBToRAWRow_AVX2
|
||||
ARGBToRAWRow_SSSE3(row, dst_raw, twidth);
|
||||
src_y += twidth;
|
||||
src_u += twidth / 2;
|
||||
src_v += twidth / 2;
|
||||
dst_raw += twidth * 3;
|
||||
width -= twidth;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
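The two new AVX2 wrappers above are instances of the pattern the MAXTWIDTH comment describes: convert up to 2048 pixels at a time into an intermediate ARGB row with the fast I422ToARGBRow kernel, then repack that buffer into the narrower 3-byte format. A generic sketch of that chunking loop is given below; the TwoStepRow name and the function-pointer types are placeholders standing in for the real row kernels.

#include <stdint.h>

#define MAXTWIDTH 2048

typedef void (*ToARGBRowFn)(const uint8_t* y, const uint8_t* u,
                            const uint8_t* v, uint8_t* argb, int width);
typedef void (*PackRowFn)(const uint8_t* argb, uint8_t* out, int width);

// Generic two-step row wrapper: convert to ARGB in chunks of MAXTWIDTH,
// then pack each chunk into a 3-byte-per-pixel format (4:2:2 chroma).
static void TwoStepRow(ToARGBRowFn to_argb, PackRowFn pack,
                       const uint8_t* y, const uint8_t* u, const uint8_t* v,
                       uint8_t* out, int width) {
  uint8_t row[MAXTWIDTH * 4];  // intermediate ARGB pixels
  while (width > 0) {
    int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
    to_argb(y, u, v, row, twidth);
    pack(row, out, twidth);
    y += twidth;
    u += twidth / 2;  // one U/V sample per two Y samples
    v += twidth / 2;
    out += twidth * 3;
    width -= twidth;
  }
}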
#if defined(HAS_NV12TORGB565ROW_AVX2)
|
||||
void NV12ToRGB565Row_AVX2(const uint8* src_y, const uint8* src_uv,
|
||||
uint8* dst_rgb565, int width) {
|
||||
|
@ -236,8 +236,8 @@ void TestRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {
|
||||
}
|
||||
#endif // TESTING
|
||||
|
||||
#ifdef HAS_I400TOARGBROW_SSE2
|
||||
void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {
|
||||
#ifdef HAS_J400TOARGBROW_SSE2
|
||||
void J400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {
|
||||
asm volatile (
|
||||
"pcmpeqb %%xmm5,%%xmm5 \n"
|
||||
"pslld $0x18,%%xmm5 \n"
|
||||
@ -262,7 +262,7 @@ void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {
|
||||
:: "memory", "cc", "xmm0", "xmm1", "xmm5"
|
||||
);
|
||||
}
|
||||
#endif // HAS_I400TOARGBROW_SSE2
|
||||
#endif // HAS_J400TOARGBROW_SSE2
|
||||
|
||||
#ifdef HAS_RGB24TOARGBROW_SSSE3
|
||||
void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix) {
|
||||
@ -953,7 +953,6 @@ void ARGBToUVRow_AVX2(const uint8* src_argb0, int src_stride_argb,
|
||||
#endif // HAS_ARGBTOUVROW_AVX2
|
||||
|
||||
#ifdef HAS_ARGBTOUVJROW_SSSE3
|
||||
// TODO(fbarchard): Share code with ARGBToUVRow_SSSE3.
|
||||
void ARGBToUVJRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
|
||||
uint8* dst_u, uint8* dst_v, int width) {
|
||||
asm volatile (
|
||||
@ -1414,22 +1413,6 @@ void RGBAToUVRow_SSSE3(const uint8* src_rgba0, int src_stride_rgba,
|
||||
|
||||
#if defined(HAS_I422TOARGBROW_SSSE3) || defined(HAS_I422TOARGBROW_AVX2)
|
||||
|
||||
// YUV to RGB conversion constants.
|
||||
// Y contribution to R,G,B. Scale and bias.
|
||||
#define YG 18997 /* round(1.164 * 64 * 256 * 256 / 257) */
|
||||
#define YGB 1160 /* 1.164 * 64 * 16 - adjusted for even error distribution */
|
||||
|
||||
// U and V contributions to R,G,B.
|
||||
#define UB -128 /* -min(128, round(2.018 * 64)) */
|
||||
#define UG 25 /* -round(-0.391 * 64) */
|
||||
#define VG 52 /* -round(-0.813 * 64) */
|
||||
#define VR -102 /* -round(1.596 * 64) */
|
||||
|
||||
// Bias values to subtract 16 from Y and 128 from U and V.
|
||||
#define BB (UB * 128 - YGB)
|
||||
#define BG (UG * 128 + VG * 128 - YGB)
|
||||
#define BR (VR * 128 - YGB)
|
||||
|
||||
struct YuvConstants {
|
||||
lvec8 kUVToB; // 0
|
||||
lvec8 kUVToG; // 32
|
||||
@ -1440,6 +1423,27 @@ struct YuvConstants {
|
||||
lvec16 kYToRgb; // 192
|
||||
};
|
||||
|
||||
// BT.601 YUV to RGB reference
|
||||
// R = (Y - 16) * 1.164 - V * -1.596
|
||||
// G = (Y - 16) * 1.164 - U * 0.391 - V * 0.813
|
||||
// B = (Y - 16) * 1.164 - U * -2.018
|
||||
|
||||
// Y contribution to R,G,B. Scale and bias.
|
||||
// TODO(fbarchard): Consider moving constants into a common header.
|
||||
#define YG 18997 /* round(1.164 * 64 * 256 * 256 / 257) */
|
||||
#define YGB -1160 /* 1.164 * 64 * -16 + 64 / 2 */
|
||||
|
||||
// U and V contributions to R,G,B.
|
||||
#define UB -128 /* max(-128, round(-2.018 * 64)) */
|
||||
#define UG 25 /* round(0.391 * 64) */
|
||||
#define VG 52 /* round(0.813 * 64) */
|
||||
#define VR -102 /* round(-1.596 * 64) */
|
||||
|
||||
// Bias values to subtract 16 from Y and 128 from U and V.
|
||||
#define BB (UB * 128 + YGB)
|
||||
#define BG (UG * 128 + VG * 128 + YGB)
|
||||
#define BR (VR * 128 + YGB)
|
||||
|
||||
// BT601 constants for YUV to RGB.
|
||||
static YuvConstants SIMD_ALIGNED(kYuvConstants) = {
|
||||
{ UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0,
|
||||
@ -1468,6 +1472,67 @@ static YuvConstants SIMD_ALIGNED(kYvuConstants) = {
|
||||
{ YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG }
|
||||
};
|
||||
|
||||
#undef YG
|
||||
#undef YGB
|
||||
#undef UB
|
||||
#undef UG
|
||||
#undef VG
|
||||
#undef VR
|
||||
#undef BB
|
||||
#undef BG
|
||||
#undef BR
|
||||
|
||||
// JPEG YUV to RGB reference
|
||||
// * R = Y - V * -1.40200
|
||||
// * G = Y - U * 0.34414 - V * 0.71414
|
||||
// * B = Y - U * -1.77200
|
||||
|
||||
// Y contribution to R,G,B. Scale and bias.
|
||||
// TODO(fbarchard): Consider moving constants into a common header.
|
||||
#define YGJ 16320 /* round(1.000 * 64 * 256 * 256 / 257) */
|
||||
#define YGBJ 32 /* 64 / 2 */
|
||||
|
||||
// U and V contributions to R,G,B.
|
||||
#define UBJ -113 /* round(-1.77200 * 64) */
|
||||
#define UGJ 22 /* round(0.34414 * 64) */
|
||||
#define VGJ 46 /* round(0.71414 * 64) */
|
||||
#define VRJ -90 /* round(-1.40200 * 64) */
|
||||
|
||||
// Bias values to subtract 16 from Y and 128 from U and V.
|
||||
#define BBJ (UBJ * 128 + YGBJ)
|
||||
#define BGJ (UGJ * 128 + VGJ * 128 + YGBJ)
|
||||
#define BRJ (VRJ * 128 + YGBJ)
|
||||
|
||||
// JPEG constants for YUV to RGB.
|
||||
YuvConstants SIMD_ALIGNED(kYuvJConstants) = {
|
||||
{ UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0,
|
||||
UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0 },
|
||||
{ UGJ, VGJ, UGJ, VGJ, UGJ, VGJ, UGJ, VGJ,
|
||||
UGJ, VGJ, UGJ, VGJ, UGJ, VGJ, UGJ, VGJ,
|
||||
UGJ, VGJ, UGJ, VGJ, UGJ, VGJ, UGJ, VGJ,
|
||||
UGJ, VGJ, UGJ, VGJ, UGJ, VGJ, UGJ, VGJ },
|
||||
{ 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ,
|
||||
0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ },
|
||||
{ BBJ, BBJ, BBJ, BBJ, BBJ, BBJ, BBJ, BBJ,
|
||||
BBJ, BBJ, BBJ, BBJ, BBJ, BBJ, BBJ, BBJ },
|
||||
{ BGJ, BGJ, BGJ, BGJ, BGJ, BGJ, BGJ, BGJ,
|
||||
BGJ, BGJ, BGJ, BGJ, BGJ, BGJ, BGJ, BGJ },
|
||||
{ BRJ, BRJ, BRJ, BRJ, BRJ, BRJ, BRJ, BRJ,
|
||||
BRJ, BRJ, BRJ, BRJ, BRJ, BRJ, BRJ, BRJ },
|
||||
{ YGJ, YGJ, YGJ, YGJ, YGJ, YGJ, YGJ, YGJ,
|
||||
YGJ, YGJ, YGJ, YGJ, YGJ, YGJ, YGJ, YGJ }
|
||||
};
|
||||
|
||||
#undef YGJ
|
||||
#undef YGBJ
|
||||
#undef UBJ
|
||||
#undef UGJ
|
||||
#undef VGJ
|
||||
#undef VRJ
|
||||
#undef BBJ
|
||||
#undef BGJ
|
||||
#undef BRJ
|
||||
|
||||
// Read 8 UV from 411
|
||||
#define READYUV444 \
|
||||
"movq " MEMACCESS([u_buf]) ",%%xmm0 \n" \
|
||||
@ -1534,8 +1599,8 @@ static YuvConstants SIMD_ALIGNED(kYvuConstants) = {
|
||||
"punpcklwd %%xmm2,%%xmm0 \n" \
|
||||
"punpckhwd %%xmm2,%%xmm1 \n" \
|
||||
"movdqu %%xmm0," MEMACCESS([dst_argb]) " \n" \
|
||||
"movdqu %%xmm1," MEMACCESS2(0x10,[dst_argb]) " \n" \
|
||||
"lea " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n"
|
||||
"movdqu %%xmm1," MEMACCESS2(0x10, [dst_argb]) " \n" \
|
||||
"lea " MEMLEA(0x20, [dst_argb]) ", %[dst_argb] \n"
|
||||
|
||||
// Store 8 BGRA values. Assumes XMM5 is zero.
|
||||
#define STOREBGRA \
|
||||
@ -1546,8 +1611,8 @@ static YuvConstants SIMD_ALIGNED(kYvuConstants) = {
|
||||
"punpcklwd %%xmm1,%%xmm5 \n" \
|
||||
"punpckhwd %%xmm1,%%xmm0 \n" \
|
||||
"movdqu %%xmm5," MEMACCESS([dst_bgra]) " \n" \
|
||||
"movdqu %%xmm0," MEMACCESS2(0x10,[dst_bgra]) " \n" \
|
||||
"lea " MEMLEA(0x20,[dst_bgra]) ",%[dst_bgra] \n"
|
||||
"movdqu %%xmm0," MEMACCESS2(0x10, [dst_bgra]) " \n" \
|
||||
"lea " MEMLEA(0x20, [dst_bgra]) ", %[dst_bgra] \n"
|
||||
|
||||
// Store 8 ABGR values. Assumes XMM5 is zero.
|
||||
#define STOREABGR \
|
||||
@ -1557,8 +1622,8 @@ static YuvConstants SIMD_ALIGNED(kYvuConstants) = {
|
||||
"punpcklwd %%xmm0,%%xmm2 \n" \
|
||||
"punpckhwd %%xmm0,%%xmm1 \n" \
|
||||
"movdqu %%xmm2," MEMACCESS([dst_abgr]) " \n" \
|
||||
"movdqu %%xmm1," MEMACCESS2(0x10,[dst_abgr]) " \n" \
|
||||
"lea " MEMLEA(0x20,[dst_abgr]) ",%[dst_abgr] \n"
|
||||
"movdqu %%xmm1," MEMACCESS2(0x10, [dst_abgr]) " \n" \
|
||||
"lea " MEMLEA(0x20, [dst_abgr]) ", %[dst_abgr] \n"
|
||||
|
||||
// Store 8 RGBA values. Assumes XMM5 is zero.
|
||||
#define STORERGBA \
|
||||
@ -1569,8 +1634,8 @@ static YuvConstants SIMD_ALIGNED(kYvuConstants) = {
|
||||
"punpcklwd %%xmm1,%%xmm5 \n" \
|
||||
"punpckhwd %%xmm1,%%xmm0 \n" \
|
||||
"movdqu %%xmm5," MEMACCESS([dst_rgba]) " \n" \
|
||||
"movdqu %%xmm0," MEMACCESS2(0x10,[dst_rgba]) " \n" \
|
||||
"lea " MEMLEA(0x20,[dst_rgba]) ",%[dst_rgba] \n"
|
||||
"movdqu %%xmm0," MEMACCESS2(0x10, [dst_rgba]) " \n" \
|
||||
"lea " MEMLEA(0x20, [dst_rgba]) ",%[dst_rgba] \n"
|
||||
|
||||
void OMITFP I444ToARGBRow_SSSE3(const uint8* y_buf,
|
||||
const uint8* u_buf,
|
||||
@ -1713,6 +1778,32 @@ void OMITFP I422ToARGBRow_SSSE3(const uint8* y_buf,
|
||||
);
|
||||
}
|
||||
|
||||
void OMITFP J422ToARGBRow_SSSE3(const uint8* y_buf,
|
||||
const uint8* u_buf,
|
||||
const uint8* v_buf,
|
||||
uint8* dst_argb,
|
||||
int width) {
|
||||
asm volatile (
|
||||
"sub %[u_buf],%[v_buf] \n"
|
||||
"pcmpeqb %%xmm5,%%xmm5 \n"
|
||||
LABELALIGN
|
||||
"1: \n"
|
||||
READYUV422
|
||||
YUVTORGB(kYuvConstants)
|
||||
STOREARGB
|
||||
"sub $0x8,%[width] \n"
|
||||
"jg 1b \n"
|
||||
: [y_buf]"+r"(y_buf), // %[y_buf]
|
||||
[u_buf]"+r"(u_buf), // %[u_buf]
|
||||
[v_buf]"+r"(v_buf), // %[v_buf]
|
||||
[dst_argb]"+r"(dst_argb), // %[dst_argb]
|
||||
[width]"+rm"(width) // %[width]
|
||||
: [kYuvConstants]"r"(&kYuvJConstants.kUVToB) // %[kYuvConstants]
|
||||
: "memory", "cc", NACL_R14
|
||||
"xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
|
||||
);
|
||||
}
|
||||
|
||||
void OMITFP I411ToARGBRow_SSSE3(const uint8* y_buf,
|
||||
const uint8* u_buf,
|
||||
const uint8* v_buf,
|
||||
@ -1881,10 +1972,10 @@ void OMITFP I422ToRGBARow_SSSE3(const uint8* y_buf,
|
||||
"vpmaddubsw " MEMACCESS([YuvConstants]) ",%%ymm0,%%ymm0 \n" \
|
||||
"vmovdqu " MEMACCESS2(160, [YuvConstants]) ",%%ymm3 \n" \
|
||||
"vpsubw %%ymm2,%%ymm3,%%ymm2 \n" \
|
||||
"vmovdqu " MEMACCESS2(128, [YuvConstants]) ",%%ymm2 \n" \
|
||||
"vpsubw %%ymm1,%%ymm2,%%ymm1 \n" \
|
||||
"vmovdqu " MEMACCESS2(96, [YuvConstants]) ",%%ymm1 \n" \
|
||||
"vpsubw %%ymm0,%%ymm1,%%ymm0 \n" \
|
||||
"vmovdqu " MEMACCESS2(128, [YuvConstants]) ",%%ymm3 \n" \
|
||||
"vpsubw %%ymm1,%%ymm3,%%ymm1 \n" \
|
||||
"vmovdqu " MEMACCESS2(96, [YuvConstants]) ",%%ymm3 \n" \
|
||||
"vpsubw %%ymm0,%%ymm3,%%ymm0 \n" \
|
||||
"vmovdqu " MEMACCESS([y_buf]) ",%%xmm3 \n" \
|
||||
"lea " MEMLEA(0x10, [y_buf]) ",%[y_buf] \n" \
|
||||
"vpermq $0xd8,%%ymm3,%%ymm3 \n" \
|
||||
@ -1984,6 +2075,48 @@ void OMITFP I422ToARGBRow_AVX2(const uint8* y_buf,
|
||||
}
|
||||
#endif // HAS_I422TOARGBROW_AVX2
|
||||
|
||||
#if defined(HAS_J422TOARGBROW_AVX2)
|
||||
// 16 pixels
|
||||
// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
|
||||
void OMITFP J422ToARGBRow_AVX2(const uint8* y_buf,
|
||||
const uint8* u_buf,
|
||||
const uint8* v_buf,
|
||||
uint8* dst_argb,
|
||||
int width) {
|
||||
asm volatile (
|
||||
"sub %[u_buf],%[v_buf] \n"
|
||||
"vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
|
||||
LABELALIGN
|
||||
"1: \n"
|
||||
READYUV422_AVX2
|
||||
YUVTORGB_AVX2(kYuvConstants)
|
||||
|
||||
// Step 3: Weave into ARGB
|
||||
"vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" // BG
|
||||
"vpermq $0xd8,%%ymm0,%%ymm0 \n"
|
||||
"vpunpcklbw %%ymm5,%%ymm2,%%ymm2 \n" // RA
|
||||
"vpermq $0xd8,%%ymm2,%%ymm2 \n"
|
||||
"vpunpcklwd %%ymm2,%%ymm0,%%ymm1 \n" // BGRA first 8 pixels
|
||||
"vpunpckhwd %%ymm2,%%ymm0,%%ymm0 \n" // BGRA next 8 pixels
|
||||
|
||||
"vmovdqu %%ymm1," MEMACCESS([dst_argb]) "\n"
|
||||
"vmovdqu %%ymm0," MEMACCESS2(0x20,[dst_argb]) "\n"
|
||||
"lea " MEMLEA(0x40,[dst_argb]) ",%[dst_argb] \n"
|
||||
"sub $0x10,%[width] \n"
|
||||
"jg 1b \n"
|
||||
"vzeroupper \n"
|
||||
: [y_buf]"+r"(y_buf), // %[y_buf]
|
||||
[u_buf]"+r"(u_buf), // %[u_buf]
|
||||
[v_buf]"+r"(v_buf), // %[v_buf]
|
||||
[dst_argb]"+r"(dst_argb), // %[dst_argb]
|
||||
[width]"+rm"(width) // %[width]
|
||||
: [kYuvConstants]"r"(&kYuvJConstants.kUVToB) // %[kYuvConstants]
|
||||
: "memory", "cc", NACL_R14
|
||||
"xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
|
||||
);
|
||||
}
|
||||
#endif // HAS_J422TOARGBROW_AVX2
|
||||
|
||||
#if defined(HAS_I422TOABGRROW_AVX2)
|
||||
// 16 pixels
|
||||
// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ABGR (64 bytes).
|
||||
@ -2066,8 +2199,8 @@ void OMITFP I422ToRGBARow_AVX2(const uint8* y_buf,
|
||||
}
|
||||
#endif // HAS_I422TORGBAROW_AVX2
|
||||
|
||||
#ifdef HAS_YTOARGBROW_SSE2
|
||||
void YToARGBRow_SSE2(const uint8* y_buf, uint8* dst_argb, int width) {
|
||||
#ifdef HAS_I400TOARGBROW_SSE2
|
||||
void I400ToARGBRow_SSE2(const uint8* y_buf, uint8* dst_argb, int width) {
|
||||
asm volatile (
|
||||
"mov $0x4a354a35,%%eax \n" // 4a35 = 18997 = 1.164
|
||||
"movd %%eax,%%xmm2 \n"
|
||||
@ -2109,12 +2242,12 @@ void YToARGBRow_SSE2(const uint8* y_buf, uint8* dst_argb, int width) {
|
||||
, "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
|
||||
);
|
||||
}
|
||||
#endif // HAS_YTOARGBROW_SSE2
|
||||
#endif // HAS_I400TOARGBROW_SSE2
|
||||
|
||||
#ifdef HAS_YTOARGBROW_AVX2
|
||||
#ifdef HAS_I400TOARGBROW_AVX2
|
||||
// 16 pixels of Y converted to 16 pixels of ARGB (64 bytes).
|
||||
// note: vpunpcklbw mutates and vpackuswb unmutates.
|
||||
void YToARGBRow_AVX2(const uint8* y_buf, uint8* dst_argb, int width) {
|
||||
void I400ToARGBRow_AVX2(const uint8* y_buf, uint8* dst_argb, int width) {
|
||||
asm volatile (
|
||||
"mov $0x4a354a35,%%eax \n" // 0488 = 1160 = 1.164 * 16
|
||||
"vmovd %%eax,%%xmm2 \n"
|
||||
@ -2156,7 +2289,7 @@ void YToARGBRow_AVX2(const uint8* y_buf, uint8* dst_argb, int width) {
|
||||
, "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
|
||||
);
|
||||
}
|
||||
#endif // HAS_YTOARGBROW_AVX2
|
||||
#endif // HAS_I400TOARGBROW_AVX2
|
||||
|
||||
#ifdef HAS_MIRRORROW_SSSE3
|
||||
// Shuffle table for reversing the bytes.
|
||||
@ -3096,41 +3229,7 @@ void ARGBBlendRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
|
||||
"psllw $0x8,%%xmm5 \n"
|
||||
"pcmpeqb %%xmm4,%%xmm4 \n"
|
||||
"pslld $0x18,%%xmm4 \n"
|
||||
"sub $0x1,%3 \n"
|
||||
"je 91f \n"
|
||||
"jl 99f \n"
|
||||
|
||||
// 1 pixel loop until destination pointer is aligned.
|
||||
"10: \n"
|
||||
"test $0xf,%2 \n"
|
||||
"je 19f \n"
|
||||
"movd " MEMACCESS(0) ",%%xmm3 \n"
|
||||
"lea " MEMLEA(0x4,0) ",%0 \n"
|
||||
"movdqa %%xmm3,%%xmm0 \n"
|
||||
"pxor %%xmm4,%%xmm3 \n"
|
||||
"movd " MEMACCESS(1) ",%%xmm2 \n"
|
||||
"psrlw $0x8,%%xmm3 \n"
|
||||
"pshufhw $0xf5,%%xmm3,%%xmm3 \n"
|
||||
"pshuflw $0xf5,%%xmm3,%%xmm3 \n"
|
||||
"pand %%xmm6,%%xmm2 \n"
|
||||
"paddw %%xmm7,%%xmm3 \n"
|
||||
"pmullw %%xmm3,%%xmm2 \n"
|
||||
"movd " MEMACCESS(1) ",%%xmm1 \n"
|
||||
"lea " MEMLEA(0x4,1) ",%1 \n"
|
||||
"psrlw $0x8,%%xmm1 \n"
|
||||
"por %%xmm4,%%xmm0 \n"
|
||||
"pmullw %%xmm3,%%xmm1 \n"
|
||||
"psrlw $0x8,%%xmm2 \n"
|
||||
"paddusb %%xmm2,%%xmm0 \n"
|
||||
"pand %%xmm5,%%xmm1 \n"
|
||||
"paddusb %%xmm1,%%xmm0 \n"
|
||||
"movd %%xmm0," MEMACCESS(2) " \n"
|
||||
"lea " MEMLEA(0x4,2) ",%2 \n"
|
||||
"sub $0x1,%3 \n"
|
||||
"jge 10b \n"
|
||||
|
||||
"19: \n"
|
||||
"add $1-4,%3 \n"
|
||||
"sub $0x4,%3 \n"
|
||||
"jl 49f \n"
|
||||
|
||||
// 4 pixel loop.
|
||||
@ -3231,39 +3330,7 @@ void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
|
||||
"psllw $0x8,%%xmm5 \n"
|
||||
"pcmpeqb %%xmm4,%%xmm4 \n"
|
||||
"pslld $0x18,%%xmm4 \n"
|
||||
"sub $0x1,%3 \n"
|
||||
"je 91f \n"
|
||||
"jl 99f \n"
|
||||
|
||||
// 1 pixel loop until destination pointer is aligned.
|
||||
"10: \n"
|
||||
"test $0xf,%2 \n"
|
||||
"je 19f \n"
|
||||
"movd " MEMACCESS(0) ",%%xmm3 \n"
|
||||
"lea " MEMLEA(0x4,0) ",%0 \n"
|
||||
"movdqa %%xmm3,%%xmm0 \n"
|
||||
"pxor %%xmm4,%%xmm3 \n"
|
||||
"movd " MEMACCESS(1) ",%%xmm2 \n"
|
||||
"pshufb %4,%%xmm3 \n"
|
||||
"pand %%xmm6,%%xmm2 \n"
|
||||
"paddw %%xmm7,%%xmm3 \n"
|
||||
"pmullw %%xmm3,%%xmm2 \n"
|
||||
"movd " MEMACCESS(1) ",%%xmm1 \n"
|
||||
"lea " MEMLEA(0x4,1) ",%1 \n"
|
||||
"psrlw $0x8,%%xmm1 \n"
|
||||
"por %%xmm4,%%xmm0 \n"
|
||||
"pmullw %%xmm3,%%xmm1 \n"
|
||||
"psrlw $0x8,%%xmm2 \n"
|
||||
"paddusb %%xmm2,%%xmm0 \n"
|
||||
"pand %%xmm5,%%xmm1 \n"
|
||||
"paddusb %%xmm1,%%xmm0 \n"
|
||||
"movd %%xmm0," MEMACCESS(2) " \n"
|
||||
"lea " MEMLEA(0x4,2) ",%2 \n"
|
||||
"sub $0x1,%3 \n"
|
||||
"jge 10b \n"
|
||||
|
||||
"19: \n"
|
||||
"add $1-4,%3 \n"
|
||||
"sub $0x4,%3 \n"
|
||||
"jl 49f \n"
|
||||
|
||||
// 4 pixel loop.
|
||||
@ -4897,37 +4964,6 @@ void InterpolateRow_SSE2(uint8* dst_ptr, const uint8* src_ptr,
|
||||
}
|
||||
#endif // HAS_INTERPOLATEROW_SSE2
|
||||
|
||||
#ifdef HAS_ARGBTOBAYERGGROW_SSE2
|
||||
void ARGBToBayerGGRow_SSE2(const uint8* src_argb, uint8* dst_bayer,
|
||||
uint32 selector, int pix) {
|
||||
asm volatile (
|
||||
"pcmpeqb %%xmm5,%%xmm5 \n"
|
||||
"psrld $0x18,%%xmm5 \n"
|
||||
LABELALIGN
|
||||
"1: \n"
|
||||
"movdqu " MEMACCESS(0) ",%%xmm0 \n"
|
||||
"movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
|
||||
"lea " MEMLEA(0x20,0) ",%0 \n"
|
||||
"psrld $0x8,%%xmm0 \n"
|
||||
"psrld $0x8,%%xmm1 \n"
|
||||
"pand %%xmm5,%%xmm0 \n"
|
||||
"pand %%xmm5,%%xmm1 \n"
|
||||
"packssdw %%xmm1,%%xmm0 \n"
|
||||
"packuswb %%xmm1,%%xmm0 \n"
|
||||
"movq %%xmm0," MEMACCESS(1) " \n"
|
||||
"lea " MEMLEA(0x8,1) ",%1 \n"
|
||||
"sub $0x8,%2 \n"
|
||||
"jg 1b \n"
|
||||
: "+r"(src_argb), // %0
|
||||
"+r"(dst_bayer), // %1
|
||||
"+r"(pix) // %2
|
||||
:
|
||||
: "memory", "cc"
|
||||
, "xmm0", "xmm1", "xmm5"
|
||||
);
|
||||
}
|
||||
#endif // HAS_ARGBTOBAYERGGROW_SSE2
|
||||
|
||||
#ifdef HAS_ARGBSHUFFLEROW_SSSE3
|
||||
// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
|
||||
void ARGBShuffleRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
|
95
third_party/libyuv/source/row_neon.cc
vendored
@ -94,11 +94,17 @@ extern "C" {
|
||||
"vtrn.u32 d2, d3 \n"
|
||||
|
||||
#define YUV422TORGB_SETUP_REG \
|
||||
MEMACCESS([kUVToRB]) \
|
||||
"vld1.8 {d24}, [%[kUVToRB]] \n" \
|
||||
MEMACCESS([kUVToG]) \
|
||||
"vld1.8 {d25}, [%[kUVToG]] \n" \
|
||||
MEMACCESS([kUVBiasBGR]) \
|
||||
"vld1.16 {d26[], d27[]}, [%[kUVBiasBGR]]! \n" \
|
||||
MEMACCESS([kUVBiasBGR]) \
|
||||
"vld1.16 {d8[], d9[]}, [%[kUVBiasBGR]]! \n" \
|
||||
MEMACCESS([kUVBiasBGR]) \
|
||||
"vld1.16 {d28[], d29[]}, [%[kUVBiasBGR]] \n" \
|
||||
MEMACCESS([kYToRgb]) \
|
||||
"vld1.32 {d30[], d31[]}, [%[kYToRgb]] \n"
|
||||
|
||||
#define YUV422TORGB \
|
||||
@ -186,7 +192,7 @@ void I444ToARGBRow_NEON(const uint8* src_y,
|
||||
[kUVToG]"r"(&kUVToG), // %6
|
||||
[kUVBiasBGR]"r"(&kUVBiasBGR),
|
||||
[kYToRgb]"r"(&kYToRgb)
|
||||
: "cc", "memory", "q0", "q1", "q2", "q3",
|
||||
: "cc", "memory", "q0", "q1", "q2", "q3", "q4",
|
||||
"q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
|
||||
);
|
||||
}
|
||||
@ -216,7 +222,7 @@ void I422ToARGBRow_NEON(const uint8* src_y,
|
||||
[kUVToG]"r"(&kUVToG), // %6
|
||||
[kUVBiasBGR]"r"(&kUVBiasBGR),
|
||||
[kYToRgb]"r"(&kYToRgb)
|
||||
: "cc", "memory", "q0", "q1", "q2", "q3",
|
||||
: "cc", "memory", "q0", "q1", "q2", "q3", "q4",
|
||||
"q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
|
||||
);
|
||||
}
|
||||
@ -246,7 +252,7 @@ void I411ToARGBRow_NEON(const uint8* src_y,
|
||||
[kUVToG]"r"(&kUVToG), // %6
|
||||
[kUVBiasBGR]"r"(&kUVBiasBGR),
|
||||
[kYToRgb]"r"(&kYToRgb)
|
||||
: "cc", "memory", "q0", "q1", "q2", "q3",
|
||||
: "cc", "memory", "q0", "q1", "q2", "q3", "q4",
|
||||
"q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
|
||||
);
|
||||
}
|
||||
@ -277,7 +283,7 @@ void I422ToBGRARow_NEON(const uint8* src_y,
|
||||
[kUVToG]"r"(&kUVToG), // %6
|
||||
[kUVBiasBGR]"r"(&kUVBiasBGR),
|
||||
[kYToRgb]"r"(&kYToRgb)
|
||||
: "cc", "memory", "q0", "q1", "q2", "q3",
|
||||
: "cc", "memory", "q0", "q1", "q2", "q3", "q4",
|
||||
"q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
|
||||
);
|
||||
}
|
||||
@ -308,7 +314,7 @@ void I422ToABGRRow_NEON(const uint8* src_y,
|
||||
[kUVToG]"r"(&kUVToG), // %6
|
||||
[kUVBiasBGR]"r"(&kUVBiasBGR),
|
||||
[kYToRgb]"r"(&kYToRgb)
|
||||
: "cc", "memory", "q0", "q1", "q2", "q3",
|
||||
: "cc", "memory", "q0", "q1", "q2", "q3", "q4",
|
||||
"q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
|
||||
);
|
||||
}
|
||||
@ -338,7 +344,7 @@ void I422ToRGBARow_NEON(const uint8* src_y,
|
||||
[kUVToG]"r"(&kUVToG), // %6
|
||||
[kUVBiasBGR]"r"(&kUVBiasBGR),
|
||||
[kYToRgb]"r"(&kYToRgb)
|
||||
: "cc", "memory", "q0", "q1", "q2", "q3",
|
||||
: "cc", "memory", "q0", "q1", "q2", "q3", "q4",
|
||||
"q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
|
||||
);
|
||||
}
|
||||
@ -367,7 +373,7 @@ void I422ToRGB24Row_NEON(const uint8* src_y,
|
||||
[kUVToG]"r"(&kUVToG), // %6
|
||||
[kUVBiasBGR]"r"(&kUVBiasBGR),
|
||||
[kYToRgb]"r"(&kYToRgb)
|
||||
: "cc", "memory", "q0", "q1", "q2", "q3",
|
||||
: "cc", "memory", "q0", "q1", "q2", "q3", "q4",
|
||||
"q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
|
||||
);
|
||||
}
|
||||
@ -397,7 +403,7 @@ void I422ToRAWRow_NEON(const uint8* src_y,
|
||||
[kUVToG]"r"(&kUVToG), // %6
|
||||
[kUVBiasBGR]"r"(&kUVBiasBGR),
|
||||
[kYToRgb]"r"(&kYToRgb)
|
||||
: "cc", "memory", "q0", "q1", "q2", "q3",
|
||||
: "cc", "memory", "q0", "q1", "q2", "q3", "q4",
|
||||
"q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
|
||||
);
|
||||
}
|
||||
@ -439,7 +445,7 @@ void I422ToRGB565Row_NEON(const uint8* src_y,
|
||||
[kUVToG]"r"(&kUVToG), // %6
|
||||
[kUVBiasBGR]"r"(&kUVBiasBGR),
|
||||
[kYToRgb]"r"(&kYToRgb)
|
||||
: "cc", "memory", "q0", "q1", "q2", "q3",
|
||||
: "cc", "memory", "q0", "q1", "q2", "q3", "q4",
|
||||
"q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
|
||||
);
|
||||
}
|
||||
@ -485,7 +491,7 @@ void I422ToARGB1555Row_NEON(const uint8* src_y,
|
||||
[kUVToG]"r"(&kUVToG), // %6
|
||||
[kUVBiasBGR]"r"(&kUVBiasBGR),
|
||||
[kYToRgb]"r"(&kYToRgb)
|
||||
: "cc", "memory", "q0", "q1", "q2", "q3",
|
||||
: "cc", "memory", "q0", "q1", "q2", "q3", "q4",
|
||||
"q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
|
||||
);
|
||||
}
|
||||
@ -526,14 +532,14 @@ void I422ToARGB4444Row_NEON(const uint8* src_y,
|
||||
[kUVToG]"r"(&kUVToG), // %6
|
||||
[kUVBiasBGR]"r"(&kUVBiasBGR),
|
||||
[kYToRgb]"r"(&kYToRgb)
|
||||
: "cc", "memory", "q0", "q1", "q2", "q3",
|
||||
: "cc", "memory", "q0", "q1", "q2", "q3", "q4",
|
||||
"q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
|
||||
);
|
||||
}
|
||||
|
||||
void YToARGBRow_NEON(const uint8* src_y,
|
||||
uint8* dst_argb,
|
||||
int width) {
|
||||
void I400ToARGBRow_NEON(const uint8* src_y,
|
||||
uint8* dst_argb,
|
||||
int width) {
|
||||
asm volatile (
|
||||
YUV422TORGB_SETUP_REG
|
||||
".p2align 2 \n"
|
||||
@ -552,17 +558,17 @@ void YToARGBRow_NEON(const uint8* src_y,
|
||||
[kUVToG]"r"(&kUVToG), // %4
|
||||
[kUVBiasBGR]"r"(&kUVBiasBGR),
|
||||
[kYToRgb]"r"(&kYToRgb)
|
||||
: "cc", "memory", "q0", "q1", "q2", "q3",
|
||||
: "cc", "memory", "q0", "q1", "q2", "q3", "q4",
|
||||
"q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
|
||||
);
|
||||
}
|
||||
|
||||
void I400ToARGBRow_NEON(const uint8* src_y,
|
||||
void J400ToARGBRow_NEON(const uint8* src_y,
|
||||
uint8* dst_argb,
|
||||
int width) {
|
||||
asm volatile (
|
||||
".p2align 2 \n"
|
||||
"vmov.u8 d23, #255 \n"
|
||||
".p2align 2 \n"
|
||||
"1: \n"
|
||||
MEMACCESS(0)
|
||||
"vld1.8 {d20}, [%0]! \n"
|
||||
@ -603,7 +609,7 @@ void NV12ToARGBRow_NEON(const uint8* src_y,
|
||||
[kUVToG]"r"(&kUVToG), // %5
|
||||
[kUVBiasBGR]"r"(&kUVBiasBGR),
|
||||
[kYToRgb]"r"(&kYToRgb)
|
||||
: "cc", "memory", "q0", "q1", "q2", "q3",
|
||||
: "cc", "memory", "q0", "q1", "q2", "q3", "q4",
|
||||
"q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
|
||||
);
|
||||
}
|
||||
@ -631,7 +637,7 @@ void NV21ToARGBRow_NEON(const uint8* src_y,
|
||||
[kUVToG]"r"(&kUVToG), // %5
|
||||
[kUVBiasBGR]"r"(&kUVBiasBGR),
|
||||
[kYToRgb]"r"(&kYToRgb)
|
||||
: "cc", "memory", "q0", "q1", "q2", "q3",
|
||||
: "cc", "memory", "q0", "q1", "q2", "q3", "q4",
|
||||
"q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
|
||||
);
|
||||
}
|
||||
@ -659,7 +665,7 @@ void NV12ToRGB565Row_NEON(const uint8* src_y,
|
||||
[kUVToG]"r"(&kUVToG), // %5
|
||||
[kUVBiasBGR]"r"(&kUVBiasBGR),
|
||||
[kYToRgb]"r"(&kYToRgb)
|
||||
: "cc", "memory", "q0", "q1", "q2", "q3",
|
||||
: "cc", "memory", "q0", "q1", "q2", "q3", "q4",
|
||||
"q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
|
||||
);
|
||||
}
|
||||
@ -687,7 +693,7 @@ void NV21ToRGB565Row_NEON(const uint8* src_y,
|
||||
[kUVToG]"r"(&kUVToG), // %5
|
||||
[kUVBiasBGR]"r"(&kUVBiasBGR),
|
||||
[kYToRgb]"r"(&kYToRgb)
|
||||
: "cc", "memory", "q0", "q1", "q2", "q3",
|
||||
: "cc", "memory", "q0", "q1", "q2", "q3", "q4",
|
||||
"q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
|
||||
);
|
||||
}
|
||||
@ -713,7 +719,7 @@ void YUY2ToARGBRow_NEON(const uint8* src_yuy2,
|
||||
[kUVToG]"r"(&kUVToG), // %4
|
||||
[kUVBiasBGR]"r"(&kUVBiasBGR),
|
||||
[kYToRgb]"r"(&kYToRgb)
|
||||
: "cc", "memory", "q0", "q1", "q2", "q3",
|
||||
: "cc", "memory", "q0", "q1", "q2", "q3", "q4",
|
||||
"q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
|
||||
);
|
||||
}
|
||||
@ -739,7 +745,7 @@ void UYVYToARGBRow_NEON(const uint8* src_uyvy,
|
||||
[kUVToG]"r"(&kUVToG), // %4
|
||||
[kUVBiasBGR]"r"(&kUVBiasBGR),
|
||||
[kYToRgb]"r"(&kYToRgb)
|
||||
: "cc", "memory", "q0", "q1", "q2", "q3",
|
||||
: "cc", "memory", "q0", "q1", "q2", "q3", "q4",
|
||||
"q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
|
||||
);
|
||||
}
|
||||
@ -1245,25 +1251,6 @@ void UYVYToUVRow_NEON(const uint8* src_uyvy, int stride_uyvy,
|
||||
);
|
||||
}
|
||||
|
||||
// Select G channels from ARGB. e.g. GGGGGGGG
|
||||
void ARGBToBayerGGRow_NEON(const uint8* src_argb, uint8* dst_bayer,
|
||||
uint32 /*selector*/, int pix) {
|
||||
asm volatile (
|
||||
"1: \n"
|
||||
MEMACCESS(0)
|
||||
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load row 8 pixels.
|
||||
"subs %2, %2, #8 \n" // 8 processed per loop
|
||||
MEMACCESS(1)
|
||||
"vst1.8 {d1}, [%1]! \n" // store 8 G's.
|
||||
"bgt 1b \n"
|
||||
: "+r"(src_argb), // %0
|
||||
"+r"(dst_bayer), // %1
|
||||
"+r"(pix) // %2
|
||||
:
|
||||
: "cc", "memory", "q0", "q1" // Clobber List
|
||||
);
|
||||
}
|
||||
|
||||
// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
|
||||
void ARGBShuffleRow_NEON(const uint8* src_argb, uint8* dst_argb,
|
||||
const uint8* shuffler, int pix) {
|
||||
@ -1360,6 +1347,30 @@ void ARGBToRGB565Row_NEON(const uint8* src_argb, uint8* dst_rgb565, int pix) {
|
||||
);
|
||||
}
|
||||
|
||||
void ARGBToRGB565DitherRow_NEON(const uint8* src_argb, uint8* dst_rgb,
|
||||
const uint32 dither4, int width) {
|
||||
asm volatile (
|
||||
".p2align 2 \n"
|
||||
"vdup.32 d2, %2 \n" // dither4
|
||||
"1: \n"
|
||||
MEMACCESS(1)
|
||||
"vld4.8 {d20, d21, d22, d23}, [%1]! \n" // load 8 pixels of ARGB.
|
||||
"subs %3, %3, #8 \n" // 8 processed per loop.
|
||||
"vqadd.u8 d20, d20, d2 \n"
|
||||
"vqadd.u8 d21, d21, d2 \n"
|
||||
"vqadd.u8 d22, d22, d2 \n"
|
||||
ARGBTORGB565
|
||||
MEMACCESS(0)
|
||||
"vst1.8 {q0}, [%0]! \n" // store 8 pixels RGB565.
|
||||
"bgt 1b \n"
|
||||
: "+r"(dst_rgb) // %0
|
||||
: "r"(src_argb), // %1
|
||||
"r"(dither4), // %2
|
||||
"r"(width) // %3
|
||||
: "cc", "memory", "q0", "q1", "q8", "q9", "q10", "q11"
|
||||
);
|
||||
}
|
||||
|
||||
void ARGBToARGB1555Row_NEON(const uint8* src_argb, uint8* dst_argb1555,
|
||||
int pix) {
|
||||
asm volatile (
|
||||
|
276
third_party/libyuv/source/row_neon64.cc
vendored
@ -178,7 +178,7 @@ void I444ToARGBRow_NEON(const uint8* src_y,
|
||||
"1: \n"
|
||||
READYUV444
|
||||
YUV422TORGB(v22, v21, v20)
|
||||
"subs %4, %4, #8 \n"
|
||||
"subs %w4, %w4, #8 \n"
|
||||
"movi v23.8b, #255 \n" /* A */
|
||||
MEMACCESS(3)
|
||||
"st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n"
|
||||
@ -207,7 +207,7 @@ void I422ToARGBRow_NEON(const uint8* src_y,
|
||||
"1: \n"
|
||||
READYUV422
|
||||
YUV422TORGB(v22, v21, v20)
|
||||
"subs %4, %4, #8 \n"
|
||||
"subs %w4, %w4, #8 \n"
|
||||
"movi v23.8b, #255 \n" /* A */
|
||||
MEMACCESS(3)
|
||||
"st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n"
|
||||
@ -236,7 +236,7 @@ void I411ToARGBRow_NEON(const uint8* src_y,
|
||||
"1: \n"
|
||||
READYUV411
|
||||
YUV422TORGB(v22, v21, v20)
|
||||
"subs %4, %4, #8 \n"
|
||||
"subs %w4, %w4, #8 \n"
|
||||
"movi v23.8b, #255 \n" /* A */
|
||||
MEMACCESS(3)
|
||||
"st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n"
|
||||
@ -265,7 +265,7 @@ void I422ToBGRARow_NEON(const uint8* src_y,
|
||||
"1: \n"
|
||||
READYUV422
|
||||
YUV422TORGB(v21, v22, v23)
|
||||
"subs %4, %4, #8 \n"
|
||||
"subs %w4, %w4, #8 \n"
|
||||
"movi v20.8b, #255 \n" /* A */
|
||||
MEMACCESS(3)
|
||||
"st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n"
|
||||
@ -294,7 +294,7 @@ void I422ToABGRRow_NEON(const uint8* src_y,
|
||||
"1: \n"
|
||||
READYUV422
|
||||
YUV422TORGB(v20, v21, v22)
|
||||
"subs %4, %4, #8 \n"
|
||||
"subs %w4, %w4, #8 \n"
|
||||
"movi v23.8b, #255 \n" /* A */
|
||||
MEMACCESS(3)
|
||||
"st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n"
|
||||
@ -323,7 +323,7 @@ void I422ToRGBARow_NEON(const uint8* src_y,
|
||||
"1: \n"
|
||||
READYUV422
|
||||
YUV422TORGB(v23, v22, v21)
|
||||
"subs %4, %4, #8 \n"
|
||||
"subs %w4, %w4, #8 \n"
|
||||
"movi v20.8b, #255 \n" /* A */
|
||||
MEMACCESS(3)
|
||||
"st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n"
|
||||
@ -352,7 +352,7 @@ void I422ToRGB24Row_NEON(const uint8* src_y,
|
||||
"1: \n"
|
||||
READYUV422
|
||||
YUV422TORGB(v22, v21, v20)
|
||||
"subs %4, %4, #8 \n"
|
||||
"subs %w4, %w4, #8 \n"
|
||||
MEMACCESS(3)
|
||||
"st3 {v20.8b,v21.8b,v22.8b}, [%3], #24 \n"
|
||||
"b.gt 1b \n"
|
||||
@ -380,7 +380,7 @@ void I422ToRAWRow_NEON(const uint8* src_y,
|
||||
"1: \n"
|
||||
READYUV422
|
||||
YUV422TORGB(v20, v21, v22)
|
||||
"subs %4, %4, #8 \n"
|
||||
"subs %w4, %w4, #8 \n"
|
||||
MEMACCESS(3)
|
||||
"st3 {v20.8b,v21.8b,v22.8b}, [%3], #24 \n"
|
||||
"b.gt 1b \n"
|
||||
@ -415,7 +415,7 @@ void I422ToRGB565Row_NEON(const uint8* src_y,
|
||||
"1: \n"
|
||||
READYUV422
|
||||
YUV422TORGB(v22, v21, v20)
|
||||
"subs %4, %4, #8 \n"
|
||||
"subs %w4, %w4, #8 \n"
|
||||
ARGBTORGB565
|
||||
MEMACCESS(3)
|
||||
"st1 {v0.8h}, [%3], #16 \n" // store 8 pixels RGB565.
|
||||
@ -453,7 +453,7 @@ void I422ToARGB1555Row_NEON(const uint8* src_y,
|
||||
"1: \n"
|
||||
READYUV422
|
||||
YUV422TORGB(v22, v21, v20)
|
||||
"subs %4, %4, #8 \n"
|
||||
"subs %w4, %w4, #8 \n"
|
||||
"movi v23.8b, #255 \n"
|
||||
ARGBTOARGB1555
|
||||
MEMACCESS(3)
|
||||
@ -494,7 +494,7 @@ void I422ToARGB4444Row_NEON(const uint8* src_y,
|
||||
"1: \n"
|
||||
READYUV422
|
||||
YUV422TORGB(v22, v21, v20)
|
||||
"subs %4, %4, #8 \n"
|
||||
"subs %w4, %w4, #8 \n"
|
||||
"movi v23.8b, #255 \n"
|
||||
ARGBTOARGB4444
|
||||
MEMACCESS(3)
|
||||
@ -513,33 +513,34 @@ void I422ToARGB4444Row_NEON(const uint8* src_y,
|
||||
}
|
||||
#endif // HAS_I422TOARGB4444ROW_NEON
|
||||
|
||||
#ifdef HAS_YTOARGBROW_NEON
|
||||
void YToARGBRow_NEON(const uint8* src_y,
|
||||
uint8* dst_argb,
|
||||
int width) {
|
||||
#ifdef HAS_I400TOARGBROW_NEON
|
||||
void I400ToARGBRow_NEON(const uint8* src_y,
|
||||
uint8* dst_argb,
|
||||
int width) {
|
||||
int64 width64 = (int64)(width);
|
||||
asm volatile (
|
||||
YUV422TORGB_SETUP_REG
|
||||
"1: \n"
|
||||
READYUV400
|
||||
YUV422TORGB(v22, v21, v20)
|
||||
"subs %2, %2, #8 \n"
|
||||
"subs %w2, %w2, #8 \n"
|
||||
"movi v23.8b, #255 \n"
|
||||
MEMACCESS(1)
|
||||
"st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n"
|
||||
"b.gt 1b \n"
|
||||
: "+r"(src_y), // %0
|
||||
"+r"(dst_argb), // %1
|
||||
"+r"(width) // %2
|
||||
"+r"(width64) // %2
|
||||
: [kUVBiasBGR]"r"(&kUVBiasBGR),
|
||||
[kYToRgb]"r"(&kYToRgb)
|
||||
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
|
||||
"v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
|
||||
);
|
||||
}
|
||||
#endif // HAS_YTOARGBROW_NEON
|
||||
#endif // HAS_I400TOARGBROW_NEON
|
||||
|
||||
#ifdef HAS_I400TOARGBROW_NEON
|
||||
void I400ToARGBRow_NEON(const uint8* src_y,
|
||||
#ifdef HAS_J400TOARGBROW_NEON
|
||||
void J400ToARGBRow_NEON(const uint8* src_y,
|
||||
uint8* dst_argb,
|
||||
int width) {
|
||||
asm volatile (
|
||||
@ -549,7 +550,7 @@ void I400ToARGBRow_NEON(const uint8* src_y,
|
||||
"ld1 {v20.8b}, [%0], #8 \n"
|
||||
"orr v21.8b, v20.8b, v20.8b \n"
|
||||
"orr v22.8b, v20.8b, v20.8b \n"
|
||||
"subs %2, %2, #8 \n"
|
||||
"subs %w2, %w2, #8 \n"
|
||||
MEMACCESS(1)
|
||||
"st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n"
|
||||
"b.gt 1b \n"
|
||||
@ -560,7 +561,7 @@ void I400ToARGBRow_NEON(const uint8* src_y,
|
||||
: "cc", "memory", "v20", "v21", "v22", "v23"
|
||||
);
|
||||
}
|
||||
#endif // HAS_I400TOARGBROW_NEON
|
||||
#endif // HAS_J400TOARGBROW_NEON
|
||||
|
||||
#ifdef HAS_NV12TOARGBROW_NEON
|
||||
void NV12ToARGBRow_NEON(const uint8* src_y,
|
||||
@ -572,7 +573,7 @@ void NV12ToARGBRow_NEON(const uint8* src_y,
|
||||
"1: \n"
|
||||
READNV12
|
||||
YUV422TORGB(v22, v21, v20)
|
||||
"subs %3, %3, #8 \n"
|
||||
"subs %w3, %w3, #8 \n"
|
||||
"movi v23.8b, #255 \n"
|
||||
MEMACCESS(2)
|
||||
"st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%2], #32 \n"
|
||||
@ -599,7 +600,7 @@ void NV21ToARGBRow_NEON(const uint8* src_y,
|
||||
"1: \n"
|
||||
READNV21
|
||||
YUV422TORGB(v22, v21, v20)
|
||||
"subs %3, %3, #8 \n"
|
||||
"subs %w3, %w3, #8 \n"
|
||||
"movi v23.8b, #255 \n"
|
||||
MEMACCESS(2)
|
||||
"st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%2], #32 \n"
|
||||
@ -626,7 +627,7 @@ void NV12ToRGB565Row_NEON(const uint8* src_y,
|
||||
"1: \n"
|
||||
READNV12
|
||||
YUV422TORGB(v22, v21, v20)
|
||||
"subs %3, %3, #8 \n"
|
||||
"subs %w3, %w3, #8 \n"
|
||||
ARGBTORGB565
|
||||
MEMACCESS(2)
|
||||
"st1 {v0.8h}, [%2], 16 \n" // store 8 pixels RGB565.
|
||||
@ -653,7 +654,7 @@ void NV21ToRGB565Row_NEON(const uint8* src_y,
|
||||
"1: \n"
|
||||
READNV21
|
||||
YUV422TORGB(v22, v21, v20)
|
||||
"subs %3, %3, #8 \n"
|
||||
"subs %w3, %w3, #8 \n"
|
||||
ARGBTORGB565
|
||||
MEMACCESS(2)
|
||||
"st1 {v0.8h}, [%2], 16 \n" // store 8 pixels RGB565.
|
||||
@ -674,19 +675,20 @@ void NV21ToRGB565Row_NEON(const uint8* src_y,
|
||||
void YUY2ToARGBRow_NEON(const uint8* src_yuy2,
|
||||
uint8* dst_argb,
|
||||
int width) {
|
||||
int64 width64 = (int64)(width);
|
||||
asm volatile (
|
||||
YUV422TORGB_SETUP_REG
|
||||
"1: \n"
|
||||
READYUY2
|
||||
YUV422TORGB(v22, v21, v20)
|
||||
"subs %2, %2, #8 \n"
|
||||
"subs %w2, %w2, #8 \n"
|
||||
"movi v23.8b, #255 \n"
|
||||
MEMACCESS(1)
|
||||
"st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n"
|
||||
"b.gt 1b \n"
|
||||
: "+r"(src_yuy2), // %0
|
||||
"+r"(dst_argb), // %1
|
||||
"+r"(width) // %2
|
||||
"+r"(width64) // %2
|
||||
: [kUVBiasBGR]"r"(&kUVBiasBGR),
|
||||
[kYToRgb]"r"(&kYToRgb)
|
||||
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
|
||||
@ -699,19 +701,20 @@ void YUY2ToARGBRow_NEON(const uint8* src_yuy2,
|
||||
void UYVYToARGBRow_NEON(const uint8* src_uyvy,
|
||||
uint8* dst_argb,
|
||||
int width) {
|
||||
int64 width64 = (int64)(width);
|
||||
asm volatile (
|
||||
YUV422TORGB_SETUP_REG
|
||||
"1: \n"
|
||||
READUYVY
|
||||
YUV422TORGB(v22, v21, v20)
|
||||
"subs %2, %2, #8 \n"
|
||||
"subs %w2, %w2, #8 \n"
|
||||
"movi v23.8b, #255 \n"
|
||||
MEMACCESS(1)
|
||||
"st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], 32 \n"
|
||||
"b.gt 1b \n"
|
||||
: "+r"(src_uyvy), // %0
|
||||
"+r"(dst_argb), // %1
|
||||
"+r"(width) // %2
|
||||
"+r"(width64) // %2
|
||||
: [kUVBiasBGR]"r"(&kUVBiasBGR),
|
||||
[kYToRgb]"r"(&kYToRgb)
|
||||
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
|
||||
@ -728,7 +731,7 @@ void SplitUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
|
||||
"1: \n"
|
||||
MEMACCESS(0)
|
||||
"ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pairs of UV
|
||||
"subs %3, %3, #16 \n" // 16 processed per loop
|
||||
"subs %w3, %w3, #16 \n" // 16 processed per loop
|
||||
MEMACCESS(1)
|
||||
"st1 {v0.16b}, [%1], #16 \n" // store U
|
||||
MEMACCESS(2)
|
||||
@ -754,7 +757,7 @@ void MergeUVRow_NEON(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
|
||||
"ld1 {v0.16b}, [%0], #16 \n" // load U
|
||||
MEMACCESS(1)
|
||||
"ld1 {v1.16b}, [%1], #16 \n" // load V
|
||||
"subs %3, %3, #16 \n" // 16 processed per loop
|
||||
"subs %w3, %w3, #16 \n" // 16 processed per loop
|
||||
MEMACCESS(2)
|
||||
"st2 {v0.16b,v1.16b}, [%2], #32 \n" // store 16 pairs of UV
|
||||
"b.gt 1b \n"
|
||||
@ -776,7 +779,7 @@ void CopyRow_NEON(const uint8* src, uint8* dst, int count) {
|
||||
"1: \n"
|
||||
MEMACCESS(0)
|
||||
"ld1 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 32
|
||||
"subs %2, %2, #32 \n" // 32 processed per loop
|
||||
"subs %w2, %w2, #32 \n" // 32 processed per loop
|
||||
MEMACCESS(1)
|
||||
"st1 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 32
|
||||
"b.gt 1b \n"
|
||||
@ -794,7 +797,7 @@ void SetRow_NEON(uint8* dst, uint8 v8, int count) {
|
||||
asm volatile (
|
||||
"dup v0.16b, %w2 \n" // duplicate 16 bytes
|
||||
"1: \n"
|
||||
"subs %1, %1, #16 \n" // 16 bytes per loop
|
||||
"subs %w1, %w1, #16 \n" // 16 bytes per loop
|
||||
MEMACCESS(0)
|
||||
"st1 {v0.16b}, [%0], #16 \n" // store
|
||||
"b.gt 1b \n"
|
||||
@ -809,7 +812,7 @@ void ARGBSetRow_NEON(uint8* dst, uint32 v32, int count) {
|
||||
asm volatile (
|
||||
"dup v0.4s, %w2 \n" // duplicate 4 ints
|
||||
"1: \n"
|
||||
"subs %1, %1, #4 \n" // 4 ints per loop
|
||||
"subs %w1, %w1, #4 \n" // 4 ints per loop
|
||||
MEMACCESS(0)
|
||||
"st1 {v0.16b}, [%0], #16 \n" // store
|
||||
"b.gt 1b \n"
|
||||
@ -822,6 +825,7 @@ void ARGBSetRow_NEON(uint8* dst, uint32 v32, int count) {
|
||||
|
||||
#ifdef HAS_MIRRORROW_NEON
|
||||
void MirrorRow_NEON(const uint8* src, uint8* dst, int width) {
|
||||
int64 width64 = (int64) width;
|
||||
asm volatile (
|
||||
// Start at end of source row.
|
||||
"add %0, %0, %2 \n"
|
||||
@ -830,7 +834,7 @@ void MirrorRow_NEON(const uint8* src, uint8* dst, int width) {
|
||||
"1: \n"
|
||||
MEMACCESS(0)
|
||||
"ld1 {v0.16b}, [%0], %3 \n" // src -= 16
|
||||
"subs %2, %2, #16 \n" // 16 pixels per loop.
|
||||
"subs %2, %2, #16 \n" // 16 pixels per loop.
|
||||
"rev64 v0.16b, v0.16b \n"
|
||||
MEMACCESS(1)
|
||||
"st1 {v0.D}[1], [%1], #8 \n" // dst += 16
|
||||
@ -839,7 +843,7 @@ void MirrorRow_NEON(const uint8* src, uint8* dst, int width) {
|
||||
"b.gt 1b \n"
|
||||
: "+r"(src), // %0
|
||||
"+r"(dst), // %1
|
||||
"+r"(width) // %2
|
||||
"+r"(width64) // %2
|
||||
: "r"((ptrdiff_t)-16) // %3
|
||||
: "cc", "memory", "v0"
|
||||
);
|
||||
@ -849,6 +853,7 @@ void MirrorRow_NEON(const uint8* src, uint8* dst, int width) {
|
||||
#ifdef HAS_MIRRORUVROW_NEON
|
||||
void MirrorUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
|
||||
int width) {
|
||||
int64 width64 = (int64) width;
|
||||
asm volatile (
|
||||
// Start at end of source row.
|
||||
"add %0, %0, %3, lsl #1 \n"
|
||||
@ -868,7 +873,7 @@ void MirrorUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
|
||||
: "+r"(src_uv), // %0
|
||||
"+r"(dst_u), // %1
|
||||
"+r"(dst_v), // %2
|
||||
"+r"(width) // %3
|
||||
"+r"(width64) // %3
|
||||
: "r"((ptrdiff_t)-16) // %4
|
||||
: "cc", "memory", "v0", "v1"
|
||||
);
|
||||
@ -877,6 +882,7 @@ void MirrorUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
|
||||
|
||||
#ifdef HAS_ARGBMIRRORROW_NEON
|
||||
void ARGBMirrorRow_NEON(const uint8* src, uint8* dst, int width) {
|
||||
int64 width64 = (int64) width;
|
||||
asm volatile (
|
||||
// Start at end of source row.
|
||||
"add %0, %0, %2, lsl #2 \n"
|
||||
@ -894,7 +900,7 @@ void ARGBMirrorRow_NEON(const uint8* src, uint8* dst, int width) {
|
||||
"b.gt 1b \n"
|
||||
: "+r"(src), // %0
|
||||
"+r"(dst), // %1
|
||||
"+r"(width) // %2
|
||||
"+r"(width64) // %2
|
||||
: "r"((ptrdiff_t)-16) // %3
|
||||
: "cc", "memory", "v0"
|
||||
);
|
||||
@ -908,7 +914,7 @@ void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int pix) {
|
||||
"1: \n"
|
||||
MEMACCESS(0)
|
||||
"ld3 {v1.8b,v2.8b,v3.8b}, [%0], #24 \n" // load 8 pixels of RGB24.
|
||||
"subs %2, %2, #8 \n" // 8 processed per loop.
|
||||
"subs %w2, %w2, #8 \n" // 8 processed per loop.
|
||||
MEMACCESS(1)
|
||||
"st4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%1], #32 \n" // store 8 ARGB pixels
|
||||
"b.gt 1b \n"
|
||||
@ -928,7 +934,7 @@ void RAWToARGBRow_NEON(const uint8* src_raw, uint8* dst_argb, int pix) {
|
||||
"1: \n"
|
||||
MEMACCESS(0)
|
||||
"ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // read r g b
|
||||
"subs %2, %2, #8 \n" // 8 processed per loop.
|
||||
"subs %w2, %w2, #8 \n" // 8 processed per loop.
|
||||
"orr v3.8b, v1.8b, v1.8b \n" // move g
|
||||
"orr v4.8b, v0.8b, v0.8b \n" // move r
|
||||
MEMACCESS(1)
|
||||
@ -963,7 +969,7 @@ void RGB565ToARGBRow_NEON(const uint8* src_rgb565, uint8* dst_argb, int pix) {
|
||||
"1: \n"
|
||||
MEMACCESS(0)
|
||||
"ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels.
|
||||
"subs %2, %2, #8 \n" // 8 processed per loop.
|
||||
"subs %w2, %w2, #8 \n" // 8 processed per loop.
|
||||
RGB565TOARGB
|
||||
MEMACCESS(1)
|
||||
"st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB pixels
|
||||
@ -1022,7 +1028,7 @@ void ARGB1555ToARGBRow_NEON(const uint8* src_argb1555, uint8* dst_argb,
|
||||
"1: \n"
|
||||
MEMACCESS(0)
|
||||
"ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels.
|
||||
"subs %2, %2, #8 \n" // 8 processed per loop.
|
||||
"subs %w2, %w2, #8 \n" // 8 processed per loop.
|
||||
ARGB1555TOARGB
|
||||
MEMACCESS(1)
|
||||
"st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB pixels
|
||||
@ -1055,7 +1061,7 @@ void ARGB4444ToARGBRow_NEON(const uint8* src_argb4444, uint8* dst_argb,
|
||||
"1: \n"
|
||||
MEMACCESS(0)
|
||||
"ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels.
|
||||
"subs %2, %2, #8 \n" // 8 processed per loop.
|
||||
"subs %w2, %w2, #8 \n" // 8 processed per loop.
|
||||
ARGB4444TOARGB
|
||||
MEMACCESS(1)
|
||||
"st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB pixels
|
||||
@ -1075,7 +1081,7 @@ void ARGBToRGB24Row_NEON(const uint8* src_argb, uint8* dst_rgb24, int pix) {
|
||||
"1: \n"
|
||||
MEMACCESS(0)
|
||||
"ld4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%0], #32 \n" // load 8 ARGB pixels
|
||||
"subs %2, %2, #8 \n" // 8 processed per loop.
|
||||
"subs %w2, %w2, #8 \n" // 8 processed per loop.
|
||||
MEMACCESS(1)
|
||||
"st3 {v1.8b,v2.8b,v3.8b}, [%1], #24 \n" // store 8 pixels of RGB24.
|
||||
"b.gt 1b \n"
|
||||
@ -1094,7 +1100,7 @@ void ARGBToRAWRow_NEON(const uint8* src_argb, uint8* dst_raw, int pix) {
|
||||
"1: \n"
|
||||
MEMACCESS(0)
|
||||
"ld4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%0], #32 \n" // load b g r a
|
||||
"subs %2, %2, #8 \n" // 8 processed per loop.
|
||||
"subs %w2, %w2, #8 \n" // 8 processed per loop.
|
||||
"orr v4.8b, v2.8b, v2.8b \n" // mov g
|
||||
"orr v5.8b, v1.8b, v1.8b \n" // mov b
|
||||
MEMACCESS(1)
|
||||
@ -1115,7 +1121,7 @@ void YUY2ToYRow_NEON(const uint8* src_yuy2, uint8* dst_y, int pix) {
|
||||
"1: \n"
|
||||
MEMACCESS(0)
|
||||
"ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pixels of YUY2.
|
||||
"subs %2, %2, #16 \n" // 16 processed per loop.
|
||||
"subs %w2, %w2, #16 \n" // 16 processed per loop.
|
||||
MEMACCESS(1)
|
||||
"st1 {v0.16b}, [%1], #16 \n" // store 16 pixels of Y.
|
||||
"b.gt 1b \n"
|
||||
@ -1134,7 +1140,7 @@ void UYVYToYRow_NEON(const uint8* src_uyvy, uint8* dst_y, int pix) {
|
||||
"1: \n"
|
||||
MEMACCESS(0)
|
||||
"ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pixels of UYVY.
|
||||
"subs %2, %2, #16 \n" // 16 processed per loop.
|
||||
"subs %w2, %w2, #16 \n" // 16 processed per loop.
|
||||
MEMACCESS(1)
|
||||
"st1 {v1.16b}, [%1], #16 \n" // store 16 pixels of Y.
|
||||
"b.gt 1b \n"
|
||||
@ -1154,7 +1160,7 @@ void YUY2ToUV422Row_NEON(const uint8* src_yuy2, uint8* dst_u, uint8* dst_v,
|
||||
"1: \n"
|
||||
MEMACCESS(0)
|
||||
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 YUY2 pixels
|
||||
"subs %3, %3, #16 \n" // 16 pixels = 8 UVs.
|
||||
"subs %w3, %w3, #16 \n" // 16 pixels = 8 UVs.
|
||||
MEMACCESS(1)
|
||||
"st1 {v1.8b}, [%1], #8 \n" // store 8 U.
|
||||
MEMACCESS(2)
|
||||
@ -1177,7 +1183,7 @@ void UYVYToUV422Row_NEON(const uint8* src_uyvy, uint8* dst_u, uint8* dst_v,
|
||||
"1: \n"
|
||||
MEMACCESS(0)
|
||||
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 UYVY pixels
|
||||
"subs %3, %3, #16 \n" // 16 pixels = 8 UVs.
|
||||
"subs %w3, %w3, #16 \n" // 16 pixels = 8 UVs.
|
||||
MEMACCESS(1)
|
||||
"st1 {v0.8b}, [%1], #8 \n" // store 8 U.
|
||||
MEMACCESS(2)
|
||||
@ -1201,7 +1207,7 @@ void YUY2ToUVRow_NEON(const uint8* src_yuy2, int stride_yuy2,
|
||||
"1: \n"
|
||||
MEMACCESS(0)
|
||||
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 pixels
|
||||
"subs %4, %4, #16 \n" // 16 pixels = 8 UVs.
|
||||
"subs %w4, %w4, #16 \n" // 16 pixels = 8 UVs.
|
||||
MEMACCESS(1)
|
||||
"ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load next row
|
||||
"urhadd v1.8b, v1.8b, v5.8b \n" // average rows of U
|
||||
@ -1231,7 +1237,7 @@ void UYVYToUVRow_NEON(const uint8* src_uyvy, int stride_uyvy,
|
||||
"1: \n"
|
||||
MEMACCESS(0)
|
||||
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 pixels
|
||||
"subs %4, %4, #16 \n" // 16 pixels = 8 UVs.
|
||||
"subs %w4, %w4, #16 \n" // 16 pixels = 8 UVs.
|
||||
MEMACCESS(1)
|
||||
"ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load next row
|
||||
"urhadd v0.8b, v0.8b, v4.8b \n" // average rows of U
|
||||
@ -1253,27 +1259,6 @@ void UYVYToUVRow_NEON(const uint8* src_uyvy, int stride_uyvy,
|
||||
}
|
||||
#endif // HAS_UYVYTOUVROW_NEON
|
||||
|
||||
// Select G channels from ARGB. e.g. GGGGGGGG
|
||||
#ifdef HAS_ARGBTOBAYERGGROW_NEON
|
||||
void ARGBToBayerGGRow_NEON(const uint8* src_argb, uint8* dst_bayer,
|
||||
uint32 /*selector*/, int pix) {
|
||||
asm volatile (
|
||||
"1: \n"
|
||||
MEMACCESS(0)
|
||||
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load row 8 pixels
|
||||
"subs %2, %2, #8 \n" // 8 processed per loop
|
||||
MEMACCESS(1)
|
||||
"st1 {v1.8b}, [%1], #8 \n" // store 8 G's.
|
||||
"b.gt 1b \n"
|
||||
: "+r"(src_argb), // %0
|
||||
"+r"(dst_bayer), // %1
|
||||
"+r"(pix) // %2
|
||||
:
|
||||
: "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
|
||||
);
|
||||
}
|
||||
#endif // HAS_ARGBTOBAYERGGROW_NEON
|
||||
|
||||
// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
|
||||
#ifdef HAS_ARGBSHUFFLEROW_NEON
|
||||
void ARGBShuffleRow_NEON(const uint8* src_argb, uint8* dst_argb,
|
||||
@ -1284,7 +1269,7 @@ void ARGBShuffleRow_NEON(const uint8* src_argb, uint8* dst_argb,
|
||||
"1: \n"
|
||||
MEMACCESS(0)
|
||||
"ld1 {v0.16b}, [%0], #16 \n" // load 4 pixels.
|
||||
"subs %2, %2, #4 \n" // 4 processed per loop
|
||||
"subs %w2, %w2, #4 \n" // 4 processed per loop
|
||||
"tbl v1.16b, {v0.16b}, v2.16b \n" // look up 4 pixels
|
||||
MEMACCESS(1)
|
||||
"st1 {v1.16b}, [%1], #16 \n" // store 4.
|
||||
@ -1312,7 +1297,7 @@ void I422ToYUY2Row_NEON(const uint8* src_y,
|
||||
"ld1 {v1.8b}, [%1], #8 \n" // load 8 Us
|
||||
MEMACCESS(2)
|
||||
"ld1 {v3.8b}, [%2], #8 \n" // load 8 Vs
|
||||
"subs %4, %4, #16 \n" // 16 pixels
|
||||
"subs %w4, %w4, #16 \n" // 16 pixels
|
||||
MEMACCESS(3)
|
||||
"st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%3], #32 \n" // Store 16 pixels.
|
||||
"b.gt 1b \n"
|
||||
@ -1341,7 +1326,7 @@ void I422ToUYVYRow_NEON(const uint8* src_y,
|
||||
"ld1 {v0.8b}, [%1], #8 \n" // load 8 Us
|
||||
MEMACCESS(2)
|
||||
"ld1 {v2.8b}, [%2], #8 \n" // load 8 Vs
|
||||
"subs %4, %4, #16 \n" // 16 pixels
|
||||
"subs %w4, %w4, #16 \n" // 16 pixels
|
||||
MEMACCESS(3)
|
||||
"st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%3], #32 \n" // Store 16 pixels.
|
||||
"b.gt 1b \n"
|
||||
@ -1362,7 +1347,7 @@ void ARGBToRGB565Row_NEON(const uint8* src_argb, uint8* dst_rgb565, int pix) {
|
||||
"1: \n"
|
||||
MEMACCESS(0)
|
||||
"ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8 pixels
|
||||
"subs %2, %2, #8 \n" // 8 processed per loop.
|
||||
"subs %w2, %w2, #8 \n" // 8 processed per loop.
|
||||
ARGBTORGB565
|
||||
MEMACCESS(1)
|
||||
"st1 {v0.16b}, [%1], #16 \n" // store 8 pixels RGB565.
|
||||
@ -1376,6 +1361,31 @@ void ARGBToRGB565Row_NEON(const uint8* src_argb, uint8* dst_rgb565, int pix) {
|
||||
}
|
||||
#endif // HAS_ARGBTORGB565ROW_NEON
|
||||
|
||||
#ifdef HAS_ARGBTORGB565DITHERROW_NEON
void ARGBToRGB565DitherRow_NEON(const uint8* src_argb, uint8* dst_rgb,
                                const uint32 dither4, int width) {
  asm volatile (
    "dup        v1.4s, %w2                     \n"  // dither4
  "1:                                          \n"
    MEMACCESS(1)
    "ld4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n"  // load 8 pixels
    "subs       %w3, %w3, #8                   \n"  // 8 processed per loop.
    "uqadd      v20.8b, v20.8b, v1.8b          \n"
    "uqadd      v21.8b, v21.8b, v1.8b          \n"
    "uqadd      v22.8b, v22.8b, v1.8b          \n"
    ARGBTORGB565
    MEMACCESS(0)
    "st1        {v0.16b}, [%0], #16            \n"  // store 8 pixels RGB565.
    "b.gt       1b                             \n"
  : "+r"(dst_rgb)    // %0
  : "r"(src_argb),   // %1
    "r"(dither4),    // %2
    "r"(width)       // %3
  : "cc", "memory", "v0", "v1", "v20", "v21", "v22", "v23"
  );
}
#endif  // HAS_ARGBTORGB565ROW_NEON
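The new ARGBToRGB565DitherRow_NEON above saturating-adds the four dither bytes packed in dither4 to B, G and R before truncating each channel to 5:6:5. Per pixel this is equivalent to the following scalar sketch; the helper name is hypothetical and the dither argument is the byte of dither4 selected for the pixel's column (x & 3):

// Illustrative per-pixel equivalent of the NEON rows above.
static uint16 ARGBPixelToRGB565Dither(uint8 b, uint8 g, uint8 r, uint8 dither) {
  int bd = b + dither; if (bd > 255) bd = 255;  // uqadd: saturating byte add
  int gd = g + dither; if (gd > 255) gd = 255;
  int rd = r + dither; if (rd > 255) rd = 255;
  return (uint16)(((rd >> 3) << 11) | ((gd >> 2) << 5) | (bd >> 3));
}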
#ifdef HAS_ARGBTOARGB1555ROW_NEON
|
||||
void ARGBToARGB1555Row_NEON(const uint8* src_argb, uint8* dst_argb1555,
|
||||
int pix) {
|
||||
@ -1383,7 +1393,7 @@ void ARGBToARGB1555Row_NEON(const uint8* src_argb, uint8* dst_argb1555,
|
||||
"1: \n"
|
||||
MEMACCESS(0)
|
||||
"ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8 pixels
|
||||
"subs %2, %2, #8 \n" // 8 processed per loop.
|
||||
"subs %w2, %w2, #8 \n" // 8 processed per loop.
|
||||
ARGBTOARGB1555
|
||||
MEMACCESS(1)
|
||||
"st1 {v0.16b}, [%1], #16 \n" // store 8 pixels ARGB1555.
|
||||
@ -1405,7 +1415,7 @@ void ARGBToARGB4444Row_NEON(const uint8* src_argb, uint8* dst_argb4444,
|
||||
"1: \n"
|
||||
MEMACCESS(0)
|
||||
"ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8 pixels
|
||||
"subs %2, %2, #8 \n" // 8 processed per loop.
|
||||
"subs %w2, %w2, #8 \n" // 8 processed per loop.
|
||||
ARGBTOARGB4444
|
||||
MEMACCESS(1)
|
||||
"st1 {v0.16b}, [%1], #16 \n" // store 8 pixels ARGB4444.
|
||||
@ -1429,7 +1439,7 @@ void ARGBToYRow_NEON(const uint8* src_argb, uint8* dst_y, int pix) {
|
||||
"1: \n"
|
||||
MEMACCESS(0)
|
||||
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels.
|
||||
"subs %2, %2, #8 \n" // 8 processed per loop.
|
||||
"subs %w2, %w2, #8 \n" // 8 processed per loop.
|
||||
"umull v3.8h, v0.8b, v4.8b \n" // B
|
||||
"umlal v3.8h, v1.8b, v5.8b \n" // G
|
||||
"umlal v3.8h, v2.8b, v6.8b \n" // R
|
||||
@ -1456,7 +1466,7 @@ void ARGBToYJRow_NEON(const uint8* src_argb, uint8* dst_y, int pix) {
|
||||
"1: \n"
|
||||
MEMACCESS(0)
|
||||
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels.
|
||||
"subs %2, %2, #8 \n" // 8 processed per loop.
|
||||
"subs %w2, %w2, #8 \n" // 8 processed per loop.
|
||||
"umull v3.8h, v0.8b, v4.8b \n" // B
|
||||
"umlal v3.8h, v1.8b, v5.8b \n" // G
|
||||
"umlal v3.8h, v2.8b, v6.8b \n" // R
|
||||
@ -1487,7 +1497,7 @@ void ARGBToUV444Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
|
||||
"1: \n"
|
||||
MEMACCESS(0)
|
||||
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels.
|
||||
"subs %3, %3, #8 \n" // 8 processed per loop.
|
||||
"subs %w3, %w3, #8 \n" // 8 processed per loop.
|
||||
"umull v4.8h, v0.8b, v24.8b \n" // B
|
||||
"umlsl v4.8h, v1.8b, v25.8b \n" // G
|
||||
"umlsl v4.8h, v2.8b, v26.8b \n" // R
|
||||
@ -1531,7 +1541,7 @@ void ARGBToUV422Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
|
||||
"uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
|
||||
"uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts.
|
||||
|
||||
"subs %3, %3, #16 \n" // 16 processed per loop.
|
||||
"subs %w3, %w3, #16 \n" // 16 processed per loop.
|
||||
"mul v3.8h, v0.8h, v20.8h \n" // B
|
||||
"mls v3.8h, v1.8h, v21.8h \n" // G
|
||||
"mls v3.8h, v2.8h, v22.8h \n" // R
|
||||
@ -1587,7 +1597,7 @@ void ARGBToUV411Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
|
||||
"urshr v1.8h, v1.8h, #1 \n"
|
||||
"urshr v2.8h, v2.8h, #1 \n"
|
||||
|
||||
"subs %3, %3, #32 \n" // 32 processed per loop.
|
||||
"subs %w3, %w3, #32 \n" // 32 processed per loop.
|
||||
"mul v3.8h, v0.8h, v20.8h \n" // B
|
||||
"mls v3.8h, v1.8h, v21.8h \n" // G
|
||||
"mls v3.8h, v2.8h, v22.8h \n" // R
|
||||
@ -1653,7 +1663,7 @@ void ARGBToUVRow_NEON(const uint8* src_argb, int src_stride_argb,
|
||||
"urshr v1.8h, v1.8h, #1 \n"
|
||||
"urshr v2.8h, v2.8h, #1 \n"
|
||||
|
||||
"subs %4, %4, #16 \n" // 32 processed per loop.
|
||||
"subs %w4, %w4, #16 \n" // 32 processed per loop.
|
||||
RGBTOUV(v0.8h, v1.8h, v2.8h)
|
||||
MEMACCESS(2)
|
||||
"st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
|
||||
@ -1700,7 +1710,7 @@ void ARGBToUVJRow_NEON(const uint8* src_argb, int src_stride_argb,
|
||||
"urshr v1.8h, v1.8h, #1 \n"
|
||||
"urshr v2.8h, v2.8h, #1 \n"
|
||||
|
||||
"subs %4, %4, #16 \n" // 32 processed per loop.
|
||||
"subs %w4, %w4, #16 \n" // 32 processed per loop.
|
||||
RGBTOUV(v0.8h, v1.8h, v2.8h)
|
||||
MEMACCESS(2)
|
||||
"st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
|
||||
@ -1741,7 +1751,7 @@ void BGRAToUVRow_NEON(const uint8* src_bgra, int src_stride_bgra,
|
||||
"urshr v1.8h, v3.8h, #1 \n"
|
||||
"urshr v2.8h, v2.8h, #1 \n"
|
||||
|
||||
"subs %4, %4, #16 \n" // 32 processed per loop.
|
||||
"subs %w4, %w4, #16 \n" // 32 processed per loop.
|
||||
RGBTOUV(v0.8h, v1.8h, v2.8h)
|
||||
MEMACCESS(2)
|
||||
"st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
|
||||
@ -1782,7 +1792,7 @@ void ABGRToUVRow_NEON(const uint8* src_abgr, int src_stride_abgr,
|
||||
"urshr v2.8h, v2.8h, #1 \n"
|
||||
"urshr v1.8h, v1.8h, #1 \n"
|
||||
|
||||
"subs %4, %4, #16 \n" // 32 processed per loop.
|
||||
"subs %w4, %w4, #16 \n" // 32 processed per loop.
|
||||
RGBTOUV(v0.8h, v2.8h, v1.8h)
|
||||
MEMACCESS(2)
|
||||
"st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
|
||||
@ -1823,7 +1833,7 @@ void RGBAToUVRow_NEON(const uint8* src_rgba, int src_stride_rgba,
|
||||
"urshr v1.8h, v1.8h, #1 \n"
|
||||
"urshr v2.8h, v2.8h, #1 \n"
|
||||
|
||||
"subs %4, %4, #16 \n" // 32 processed per loop.
|
||||
"subs %w4, %w4, #16 \n" // 32 processed per loop.
|
||||
RGBTOUV(v0.8h, v1.8h, v2.8h)
|
||||
MEMACCESS(2)
|
||||
"st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
|
||||
@ -1864,7 +1874,7 @@ void RGB24ToUVRow_NEON(const uint8* src_rgb24, int src_stride_rgb24,
|
||||
"urshr v1.8h, v1.8h, #1 \n"
|
||||
"urshr v2.8h, v2.8h, #1 \n"
|
||||
|
||||
"subs %4, %4, #16 \n" // 32 processed per loop.
|
||||
"subs %w4, %w4, #16 \n" // 32 processed per loop.
|
||||
RGBTOUV(v0.8h, v1.8h, v2.8h)
|
||||
MEMACCESS(2)
|
||||
"st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
|
||||
@ -1905,7 +1915,7 @@ void RAWToUVRow_NEON(const uint8* src_raw, int src_stride_raw,
|
||||
"urshr v1.8h, v1.8h, #1 \n"
|
||||
"urshr v0.8h, v0.8h, #1 \n"
|
||||
|
||||
"subs %4, %4, #16 \n" // 32 processed per loop.
|
||||
"subs %w4, %w4, #16 \n" // 32 processed per loop.
|
||||
RGBTOUV(v2.8h, v1.8h, v0.8h)
|
||||
MEMACCESS(2)
|
||||
"st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
|
||||
@ -1971,7 +1981,7 @@ void RGB565ToUVRow_NEON(const uint8* src_rgb565, int src_stride_rgb565,
|
||||
"urshr v5.8h, v18.8h, #1 \n"
|
||||
"urshr v6.8h, v20.8h, #1 \n"
|
||||
|
||||
"subs %4, %4, #16 \n" // 16 processed per loop.
|
||||
"subs %w4, %w4, #16 \n" // 16 processed per loop.
|
||||
"mul v16.8h, v4.8h, v22.8h \n" // B
|
||||
"mls v16.8h, v5.8h, v23.8h \n" // G
|
||||
"mls v16.8h, v6.8h, v24.8h \n" // R
|
||||
@ -2042,7 +2052,7 @@ void ARGB1555ToUVRow_NEON(const uint8* src_argb1555, int src_stride_argb1555,
|
||||
"urshr v5.8h, v17.8h, #1 \n"
|
||||
"urshr v6.8h, v18.8h, #1 \n"
|
||||
|
||||
"subs %4, %4, #16 \n" // 16 processed per loop.
|
||||
"subs %w4, %w4, #16 \n" // 16 processed per loop.
|
||||
"mul v2.8h, v4.8h, v20.8h \n" // B
|
||||
"mls v2.8h, v5.8h, v21.8h \n" // G
|
||||
"mls v2.8h, v6.8h, v22.8h \n" // R
|
||||
@ -2113,7 +2123,7 @@ void ARGB4444ToUVRow_NEON(const uint8* src_argb4444, int src_stride_argb4444,
|
||||
"urshr v5.8h, v17.8h, #1 \n"
|
||||
"urshr v6.8h, v18.8h, #1 \n"
|
||||
|
||||
"subs %4, %4, #16 \n" // 16 processed per loop.
|
||||
"subs %w4, %w4, #16 \n" // 16 processed per loop.
|
||||
"mul v2.8h, v4.8h, v20.8h \n" // B
|
||||
"mls v2.8h, v5.8h, v21.8h \n" // G
|
||||
"mls v2.8h, v6.8h, v22.8h \n" // R
|
||||
@ -2153,7 +2163,7 @@ void RGB565ToYRow_NEON(const uint8* src_rgb565, uint8* dst_y, int pix) {
|
||||
"1: \n"
|
||||
MEMACCESS(0)
|
||||
"ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels.
|
||||
"subs %2, %2, #8 \n" // 8 processed per loop.
|
||||
"subs %w2, %w2, #8 \n" // 8 processed per loop.
|
||||
RGB565TOARGB
|
||||
"umull v3.8h, v0.8b, v24.8b \n" // B
|
||||
"umlal v3.8h, v1.8b, v25.8b \n" // G
|
||||
@ -2183,7 +2193,7 @@ void ARGB1555ToYRow_NEON(const uint8* src_argb1555, uint8* dst_y, int pix) {
|
||||
"1: \n"
|
||||
MEMACCESS(0)
|
||||
"ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels.
|
||||
"subs %2, %2, #8 \n" // 8 processed per loop.
|
||||
"subs %w2, %w2, #8 \n" // 8 processed per loop.
|
||||
ARGB1555TOARGB
|
||||
"umull v3.8h, v0.8b, v4.8b \n" // B
|
||||
"umlal v3.8h, v1.8b, v5.8b \n" // G
|
||||
@ -2212,7 +2222,7 @@ void ARGB4444ToYRow_NEON(const uint8* src_argb4444, uint8* dst_y, int pix) {
|
||||
"1: \n"
|
||||
MEMACCESS(0)
|
||||
"ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels.
|
||||
"subs %2, %2, #8 \n" // 8 processed per loop.
|
||||
"subs %w2, %w2, #8 \n" // 8 processed per loop.
|
||||
ARGB4444TOARGB
|
||||
"umull v3.8h, v0.8b, v24.8b \n" // B
|
||||
"umlal v3.8h, v1.8b, v25.8b \n" // G
|
||||
@ -2241,7 +2251,7 @@ void BGRAToYRow_NEON(const uint8* src_bgra, uint8* dst_y, int pix) {
|
||||
"1: \n"
|
||||
MEMACCESS(0)
|
||||
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels.
|
||||
"subs %2, %2, #8 \n" // 8 processed per loop.
|
||||
"subs %w2, %w2, #8 \n" // 8 processed per loop.
|
||||
"umull v16.8h, v1.8b, v4.8b \n" // R
|
||||
"umlal v16.8h, v2.8b, v5.8b \n" // G
|
||||
"umlal v16.8h, v3.8b, v6.8b \n" // B
|
||||
@ -2269,7 +2279,7 @@ void ABGRToYRow_NEON(const uint8* src_abgr, uint8* dst_y, int pix) {
|
||||
"1: \n"
|
||||
MEMACCESS(0)
|
||||
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels.
|
||||
"subs %2, %2, #8 \n" // 8 processed per loop.
|
||||
"subs %w2, %w2, #8 \n" // 8 processed per loop.
|
||||
"umull v16.8h, v0.8b, v4.8b \n" // R
|
||||
"umlal v16.8h, v1.8b, v5.8b \n" // G
|
||||
"umlal v16.8h, v2.8b, v6.8b \n" // B
|
||||
@ -2297,7 +2307,7 @@ void RGBAToYRow_NEON(const uint8* src_rgba, uint8* dst_y, int pix) {
|
||||
"1: \n"
|
||||
MEMACCESS(0)
|
||||
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels.
|
||||
"subs %2, %2, #8 \n" // 8 processed per loop.
|
||||
"subs %w2, %w2, #8 \n" // 8 processed per loop.
|
||||
"umull v16.8h, v1.8b, v4.8b \n" // B
|
||||
"umlal v16.8h, v2.8b, v5.8b \n" // G
|
||||
"umlal v16.8h, v3.8b, v6.8b \n" // R
|
||||
@ -2325,7 +2335,7 @@ void RGB24ToYRow_NEON(const uint8* src_rgb24, uint8* dst_y, int pix) {
|
||||
"1: \n"
|
||||
MEMACCESS(0)
|
||||
"ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // load 8 pixels.
|
||||
"subs %2, %2, #8 \n" // 8 processed per loop.
|
||||
"subs %w2, %w2, #8 \n" // 8 processed per loop.
|
||||
"umull v16.8h, v0.8b, v4.8b \n" // B
|
||||
"umlal v16.8h, v1.8b, v5.8b \n" // G
|
||||
"umlal v16.8h, v2.8b, v6.8b \n" // R
|
||||
@ -2353,7 +2363,7 @@ void RAWToYRow_NEON(const uint8* src_raw, uint8* dst_y, int pix) {
|
||||
"1: \n"
|
||||
MEMACCESS(0)
|
||||
"ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // load 8 pixels.
|
||||
"subs %2, %2, #8 \n" // 8 processed per loop.
|
||||
"subs %w2, %w2, #8 \n" // 8 processed per loop.
|
||||
"umull v16.8h, v0.8b, v4.8b \n" // B
|
||||
"umlal v16.8h, v1.8b, v5.8b \n" // G
|
||||
"umlal v16.8h, v2.8b, v6.8b \n" // R
|
||||
@ -2380,13 +2390,13 @@ void InterpolateRow_NEON(uint8* dst_ptr,
|
||||
int y0_fraction = 256 - y1_fraction;
|
||||
const uint8* src_ptr1 = src_ptr + src_stride;
|
||||
asm volatile (
|
||||
"cmp %4, #0 \n"
|
||||
"cmp %w4, #0 \n"
|
||||
"b.eq 100f \n"
|
||||
"cmp %4, #64 \n"
|
||||
"cmp %w4, #64 \n"
|
||||
"b.eq 75f \n"
|
||||
"cmp %4, #128 \n"
|
||||
"cmp %w4, #128 \n"
|
||||
"b.eq 50f \n"
|
||||
"cmp %4, #192 \n"
|
||||
"cmp %w4, #192 \n"
|
||||
"b.eq 25f \n"
|
||||
|
||||
"dup v5.16b, %w4 \n"
|
||||
@ -2397,7 +2407,7 @@ void InterpolateRow_NEON(uint8* dst_ptr,
|
||||
"ld1 {v0.16b}, [%1], #16 \n"
|
||||
MEMACCESS(2)
|
||||
"ld1 {v1.16b}, [%2], #16 \n"
|
||||
"subs %3, %3, #16 \n"
|
||||
"subs %w3, %w3, #16 \n"
|
||||
"umull v2.8h, v0.8b, v4.8b \n"
|
||||
"umull2 v3.8h, v0.16b, v4.16b \n"
|
||||
"umlal v2.8h, v1.8b, v5.8b \n"
|
||||
@ -2415,7 +2425,7 @@ void InterpolateRow_NEON(uint8* dst_ptr,
|
||||
"ld1 {v0.16b}, [%1], #16 \n"
|
||||
MEMACCESS(2)
|
||||
"ld1 {v1.16b}, [%2], #16 \n"
|
||||
"subs %3, %3, #16 \n"
|
||||
"subs %w3, %w3, #16 \n"
|
||||
"urhadd v0.16b, v0.16b, v1.16b \n"
|
||||
"urhadd v0.16b, v0.16b, v1.16b \n"
|
||||
MEMACCESS(0)
|
||||
@ -2429,7 +2439,7 @@ void InterpolateRow_NEON(uint8* dst_ptr,
|
||||
"ld1 {v0.16b}, [%1], #16 \n"
|
||||
MEMACCESS(2)
|
||||
"ld1 {v1.16b}, [%2], #16 \n"
|
||||
"subs %3, %3, #16 \n"
|
||||
"subs %w3, %w3, #16 \n"
|
||||
"urhadd v0.16b, v0.16b, v1.16b \n"
|
||||
MEMACCESS(0)
|
||||
"st1 {v0.16b}, [%0], #16 \n"
|
||||
@ -2442,7 +2452,7 @@ void InterpolateRow_NEON(uint8* dst_ptr,
|
||||
"ld1 {v1.16b}, [%1], #16 \n"
|
||||
MEMACCESS(2)
|
||||
"ld1 {v0.16b}, [%2], #16 \n"
|
||||
"subs %3, %3, #16 \n"
|
||||
"subs %w3, %w3, #16 \n"
|
||||
"urhadd v0.16b, v0.16b, v1.16b \n"
|
||||
"urhadd v0.16b, v0.16b, v1.16b \n"
|
||||
MEMACCESS(0)
|
||||
@ -2454,7 +2464,7 @@ void InterpolateRow_NEON(uint8* dst_ptr,
|
||||
"100: \n"
|
||||
MEMACCESS(1)
|
||||
"ld1 {v0.16b}, [%1], #16 \n"
|
||||
"subs %3, %3, #16 \n"
|
||||
"subs %w3, %w3, #16 \n"
|
||||
MEMACCESS(0)
|
||||
"st1 {v0.16b}, [%0], #16 \n"
|
||||
"b.gt 100b \n"
|
||||
@ -2477,7 +2487,7 @@ void InterpolateRow_NEON(uint8* dst_ptr,
|
||||
void ARGBBlendRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
|
||||
uint8* dst_argb, int width) {
|
||||
asm volatile (
|
||||
"subs %3, %3, #8 \n"
|
||||
"subs %w3, %w3, #8 \n"
|
||||
"b.lt 89f \n"
|
||||
// Blend 8 pixels.
|
||||
"8: \n"
|
||||
@ -2485,7 +2495,7 @@ void ARGBBlendRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
|
||||
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB0 pixels
|
||||
MEMACCESS(1)
|
||||
"ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 ARGB1 pixels
|
||||
"subs %3, %3, #8 \n" // 8 processed per loop.
|
||||
"subs %w3, %w3, #8 \n" // 8 processed per loop.
|
||||
"umull v16.8h, v4.8b, v3.8b \n" // db * a
|
||||
"umull v17.8h, v5.8b, v3.8b \n" // dg * a
|
||||
"umull v18.8h, v6.8b, v3.8b \n" // dr * a
|
||||
@ -2504,7 +2514,7 @@ void ARGBBlendRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
|
||||
"b.ge 8b \n"
|
||||
|
||||
"89: \n"
|
||||
"adds %3, %3, #8-1 \n"
|
||||
"adds %w3, %w3, #8-1 \n"
|
||||
"b.lt 99f \n"
|
||||
|
||||
// Blend 1 pixels.
|
||||
@ -2513,7 +2523,7 @@ void ARGBBlendRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
|
||||
"ld4 {v0.b,v1.b,v2.b,v3.b}[0], [%0], #4 \n" // load 1 pixel ARGB0.
|
||||
MEMACCESS(1)
|
||||
"ld4 {v4.b,v5.b,v6.b,v7.b}[0], [%1], #4 \n" // load 1 pixel ARGB1.
|
||||
"subs %3, %3, #1 \n" // 1 processed per loop.
|
||||
"subs %w3, %w3, #1 \n" // 1 processed per loop.
|
||||
"umull v16.8h, v4.8b, v3.8b \n" // db * a
|
||||
"umull v17.8h, v5.8b, v3.8b \n" // dg * a
|
||||
"umull v18.8h, v6.8b, v3.8b \n" // dr * a
|
||||
@ -2552,7 +2562,7 @@ void ARGBAttenuateRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) {
|
||||
"1: \n"
|
||||
MEMACCESS(0)
|
||||
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels
|
||||
"subs %2, %2, #8 \n" // 8 processed per loop.
|
||||
"subs %w2, %w2, #8 \n" // 8 processed per loop.
|
||||
"umull v4.8h, v0.8b, v3.8b \n" // b * a
|
||||
"umull v5.8h, v1.8b, v3.8b \n" // g * a
|
||||
"umull v6.8h, v2.8b, v3.8b \n" // r * a
|
||||
@ -2586,7 +2596,7 @@ void ARGBQuantizeRow_NEON(uint8* dst_argb, int scale, int interval_size,
|
||||
"1: \n"
|
||||
MEMACCESS(0)
|
||||
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0] \n" // load 8 pixels of ARGB.
|
||||
"subs %1, %1, #8 \n" // 8 processed per loop.
|
||||
"subs %w1, %w1, #8 \n" // 8 processed per loop.
|
||||
"uxtl v0.8h, v0.8b \n" // b (0 .. 255)
|
||||
"uxtl v1.8h, v1.8b \n"
|
||||
"uxtl v2.8h, v2.8b \n"
|
||||
@ -2630,7 +2640,7 @@ void ARGBShadeRow_NEON(const uint8* src_argb, uint8* dst_argb, int width,
|
||||
"1: \n"
|
||||
MEMACCESS(0)
|
||||
"ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%0], #32 \n" // load 8 ARGB pixels.
|
||||
"subs %2, %2, #8 \n" // 8 processed per loop.
|
||||
"subs %w2, %w2, #8 \n" // 8 processed per loop.
|
||||
"uxtl v4.8h, v4.8b \n" // b (0 .. 255)
|
||||
"uxtl v5.8h, v5.8b \n"
|
||||
"uxtl v6.8h, v6.8b \n"
|
||||
@ -2667,7 +2677,7 @@ void ARGBGrayRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) {
|
||||
"1: \n"
|
||||
MEMACCESS(0)
|
||||
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels.
|
||||
"subs %2, %2, #8 \n" // 8 processed per loop.
|
||||
"subs %w2, %w2, #8 \n" // 8 processed per loop.
|
||||
"umull v4.8h, v0.8b, v24.8b \n" // B
|
||||
"umlal v4.8h, v1.8b, v25.8b \n" // G
|
||||
"umlal v4.8h, v2.8b, v26.8b \n" // R
|
||||
@ -2706,7 +2716,7 @@ void ARGBSepiaRow_NEON(uint8* dst_argb, int width) {
|
||||
"1: \n"
|
||||
MEMACCESS(0)
|
||||
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0] \n" // load 8 ARGB pixels.
|
||||
"subs %1, %1, #8 \n" // 8 processed per loop.
|
||||
"subs %w1, %w1, #8 \n" // 8 processed per loop.
|
||||
"umull v4.8h, v0.8b, v20.8b \n" // B to Sepia B
|
||||
"umlal v4.8h, v1.8b, v21.8b \n" // G
|
||||
"umlal v4.8h, v2.8b, v22.8b \n" // R
|
||||
@ -2746,7 +2756,7 @@ void ARGBColorMatrixRow_NEON(const uint8* src_argb, uint8* dst_argb,
|
||||
"1: \n"
|
||||
MEMACCESS(0)
|
||||
"ld4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%0], #32 \n" // load 8 pixels.
|
||||
"subs %2, %2, #8 \n" // 8 processed per loop.
|
||||
"subs %w2, %w2, #8 \n" // 8 processed per loop.
|
||||
"uxtl v16.8h, v16.8b \n" // b (0 .. 255) 16 bit
|
||||
"uxtl v17.8h, v17.8b \n" // g
|
||||
"uxtl v18.8h, v18.8b \n" // r
|
||||
@ -2808,7 +2818,7 @@ void ARGBMultiplyRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
|
||||
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels.
|
||||
MEMACCESS(1)
|
||||
"ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more pixels.
|
||||
"subs %3, %3, #8 \n" // 8 processed per loop.
|
||||
"subs %w3, %w3, #8 \n" // 8 processed per loop.
|
||||
"umull v0.8h, v0.8b, v4.8b \n" // multiply B
|
||||
"umull v1.8h, v1.8b, v5.8b \n" // multiply G
|
||||
"umull v2.8h, v2.8b, v6.8b \n" // multiply R
|
||||
@ -2842,7 +2852,7 @@ void ARGBAddRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
|
||||
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels.
|
||||
MEMACCESS(1)
|
||||
"ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more pixels.
|
||||
"subs %3, %3, #8 \n" // 8 processed per loop.
|
||||
"subs %w3, %w3, #8 \n" // 8 processed per loop.
|
||||
"uqadd v0.8b, v0.8b, v4.8b \n"
|
||||
"uqadd v1.8b, v1.8b, v5.8b \n"
|
||||
"uqadd v2.8b, v2.8b, v6.8b \n"
|
||||
@ -2872,7 +2882,7 @@ void ARGBSubtractRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
|
||||
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels.
|
||||
MEMACCESS(1)
|
||||
"ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more pixels.
|
||||
"subs %3, %3, #8 \n" // 8 processed per loop.
|
||||
"subs %w3, %w3, #8 \n" // 8 processed per loop.
|
||||
"uqsub v0.8b, v0.8b, v4.8b \n"
|
||||
"uqsub v1.8b, v1.8b, v5.8b \n"
|
||||
"uqsub v2.8b, v2.8b, v6.8b \n"
|
||||
@ -2907,7 +2917,7 @@ void SobelRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
|
||||
"ld1 {v0.8b}, [%0], #8 \n" // load 8 sobelx.
|
||||
MEMACCESS(1)
|
||||
"ld1 {v1.8b}, [%1], #8 \n" // load 8 sobely.
|
||||
"subs %3, %3, #8 \n" // 8 processed per loop.
|
||||
"subs %w3, %w3, #8 \n" // 8 processed per loop.
|
||||
"uqadd v0.8b, v0.8b, v1.8b \n" // add
|
||||
"orr v1.8b, v0.8b, v0.8b \n"
|
||||
"orr v2.8b, v0.8b, v0.8b \n"
|
||||
@ -2935,7 +2945,7 @@ void SobelToPlaneRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
|
||||
"ld1 {v0.16b}, [%0], #16 \n" // load 16 sobelx.
|
||||
MEMACCESS(1)
|
||||
"ld1 {v1.16b}, [%1], #16 \n" // load 16 sobely.
|
||||
"subs %3, %3, #16 \n" // 16 processed per loop.
|
||||
"subs %w3, %w3, #16 \n" // 16 processed per loop.
|
||||
"uqadd v0.16b, v0.16b, v1.16b \n" // add
|
||||
MEMACCESS(2)
|
||||
"st1 {v0.16b}, [%2], #16 \n" // store 16 pixels.
|
||||
@ -2966,7 +2976,7 @@ void SobelXYRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
|
||||
"ld1 {v2.8b}, [%0], #8 \n" // load 8 sobelx.
|
||||
MEMACCESS(1)
|
||||
"ld1 {v0.8b}, [%1], #8 \n" // load 8 sobely.
|
||||
"subs %3, %3, #8 \n" // 8 processed per loop.
|
||||
"subs %w3, %w3, #8 \n" // 8 processed per loop.
|
||||
"uqadd v1.8b, v0.8b, v2.8b \n" // add
|
||||
MEMACCESS(2)
|
||||
"st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB pixels
|
||||
@ -3006,7 +3016,7 @@ void SobelXRow_NEON(const uint8* src_y0, const uint8* src_y1,
|
||||
"ld1 {v2.8b}, [%2],%5 \n" // bottom
|
||||
MEMACCESS(2)
|
||||
"ld1 {v3.8b}, [%2],%6 \n"
|
||||
"subs %4, %4, #8 \n" // 8 pixels
|
||||
"subs %w4, %w4, #8 \n" // 8 pixels
|
||||
"usubl v1.8h, v2.8b, v3.8b \n"
|
||||
"add v0.8h, v0.8h, v1.8h \n"
|
||||
"abs v0.8h, v0.8h \n"
|
||||
@ -3019,8 +3029,8 @@ void SobelXRow_NEON(const uint8* src_y0, const uint8* src_y1,
|
||||
"+r"(src_y2), // %2
|
||||
"+r"(dst_sobelx), // %3
|
||||
"+r"(width) // %4
|
||||
: "r"(2), // %5
|
||||
"r"(6) // %6
|
||||
: "r"(2LL), // %5
|
||||
"r"(6LL) // %6
|
||||
: "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
|
||||
);
|
||||
}
|
||||
@ -3051,7 +3061,7 @@ void SobelYRow_NEON(const uint8* src_y0, const uint8* src_y1,
|
||||
"ld1 {v2.8b}, [%0],%5 \n" // right
|
||||
MEMACCESS(1)
|
||||
"ld1 {v3.8b}, [%1],%5 \n"
|
||||
"subs %3, %3, #8 \n" // 8 pixels
|
||||
"subs %w3, %w3, #8 \n" // 8 pixels
|
||||
"usubl v1.8h, v2.8b, v3.8b \n"
|
||||
"add v0.8h, v0.8h, v1.8h \n"
|
||||
"abs v0.8h, v0.8h \n"
|
||||
@ -3063,8 +3073,8 @@ void SobelYRow_NEON(const uint8* src_y0, const uint8* src_y1,
|
||||
"+r"(src_y1), // %1
|
||||
"+r"(dst_sobely), // %2
|
||||
"+r"(width) // %3
|
||||
: "r"(1), // %4
|
||||
"r"(6) // %5
|
||||
: "r"(1LL), // %4
|
||||
"r"(6LL) // %5
|
||||
: "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
|
||||
);
|
||||
}
|
||||
|
955  third_party/libyuv/source/row_win.cc (vendored): file diff suppressed because it is too large
430  third_party/libyuv/source/scale.cc (vendored)
@ -23,9 +23,6 @@ namespace libyuv {
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
// Remove this macro if OVERREAD is safe.
|
||||
#define AVOID_OVERREAD 1
|
||||
|
||||
static __inline int Abs(int v) {
|
||||
return v >= 0 ? v : -v;
|
||||
}
|
||||
@ -44,9 +41,8 @@ static void ScalePlaneDown2(int src_width, int src_height,
|
||||
int y;
|
||||
void (*ScaleRowDown2)(const uint8* src_ptr, ptrdiff_t src_stride,
|
||||
uint8* dst_ptr, int dst_width) =
|
||||
filtering == kFilterNone ? ScaleRowDown2_C :
|
||||
(filtering == kFilterLinear ? ScaleRowDown2Linear_C :
|
||||
ScaleRowDown2Box_C);
|
||||
filtering == kFilterNone ? ScaleRowDown2_C :
|
||||
(filtering == kFilterLinear ? ScaleRowDown2Linear_C : ScaleRowDown2Box_C);
|
||||
int row_stride = src_stride << 1;
|
||||
if (!filtering) {
|
||||
src_ptr += src_stride; // Point to odd rows.
|
||||
@ -54,15 +50,39 @@ static void ScalePlaneDown2(int src_width, int src_height,
|
||||
}
|
||||
|
||||
#if defined(HAS_SCALEROWDOWN2_NEON)
  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(dst_width, 16)) {
    ScaleRowDown2 = filtering ? ScaleRowDown2Box_NEON : ScaleRowDown2_NEON;
  if (TestCpuFlag(kCpuHasNEON)) {
    ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_Any_NEON :
        (filtering == kFilterLinear ? ScaleRowDown2Linear_Any_NEON :
        ScaleRowDown2Box_Any_NEON);
    if (IS_ALIGNED(dst_width, 16)) {
      ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_NEON :
          (filtering == kFilterLinear ? ScaleRowDown2Linear_NEON :
          ScaleRowDown2Box_NEON);
    }
  }
#endif
#if defined(HAS_SCALEROWDOWN2_SSE2)
|
||||
if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 16)) {
|
||||
ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_SSE2 :
|
||||
(filtering == kFilterLinear ? ScaleRowDown2Linear_SSE2 :
|
||||
ScaleRowDown2Box_SSE2);
|
||||
if (TestCpuFlag(kCpuHasSSE2)) {
|
||||
ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_Any_SSE2 :
|
||||
(filtering == kFilterLinear ? ScaleRowDown2Linear_Any_SSE2 :
|
||||
ScaleRowDown2Box_Any_SSE2);
|
||||
if (IS_ALIGNED(dst_width, 16)) {
|
||||
ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_SSE2 :
|
||||
(filtering == kFilterLinear ? ScaleRowDown2Linear_SSE2 :
|
||||
ScaleRowDown2Box_SSE2);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_SCALEROWDOWN2_AVX2)
|
||||
if (TestCpuFlag(kCpuHasAVX2)) {
|
||||
ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_Any_AVX2 :
|
||||
(filtering == kFilterLinear ? ScaleRowDown2Linear_Any_AVX2 :
|
||||
ScaleRowDown2Box_Any_AVX2);
|
||||
if (IS_ALIGNED(dst_width, 32)) {
|
||||
ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_AVX2 :
|
||||
(filtering == kFilterLinear ? ScaleRowDown2Linear_AVX2 :
|
||||
ScaleRowDown2Box_AVX2);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_SCALEROWDOWN2_MIPS_DSPR2)
|
||||
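The dispatch rewritten in the hunk above (and repeated below for SSE2, AVX2 and the other scalers) follows one pattern: when the CPU feature is present the _Any_ variant is chosen, and the strict SIMD row function replaces it only when dst_width is a multiple of the vector width. An _Any_ wrapper runs the SIMD kernel on the aligned bulk of the row and lets the C row function finish the remainder; a hedged sketch of that idea (the wrapper body below is illustrative, not libyuv's actual macro-generated code):

// Sketch only: roughly what ScaleRowDown2_Any_NEON amounts to for a kernel
// that requires dst_width to be a multiple of 16.
static void ScaleRowDown2_Any_NEON_sketch(const uint8* src_ptr,
                                          ptrdiff_t src_stride,
                                          uint8* dst_ptr, int dst_width) {
  int n = dst_width & ~15;  // largest multiple of 16
  if (n > 0) {
    ScaleRowDown2_NEON(src_ptr, src_stride, dst_ptr, n);
  }
  // Each output pixel consumes 2 source pixels; C handles the short tail.
  ScaleRowDown2_C(src_ptr + 2 * n, src_stride, dst_ptr + n, dst_width & 15);
}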
@ -154,13 +174,30 @@ static void ScalePlaneDown4(int src_width, int src_height,
|
||||
src_stride = 0;
|
||||
}
|
||||
#if defined(HAS_SCALEROWDOWN4_NEON)
|
||||
if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(dst_width, 8)) {
|
||||
ScaleRowDown4 = filtering ? ScaleRowDown4Box_NEON : ScaleRowDown4_NEON;
|
||||
if (TestCpuFlag(kCpuHasNEON)) {
|
||||
ScaleRowDown4 = filtering ?
|
||||
ScaleRowDown4Box_Any_NEON : ScaleRowDown4_Any_NEON;
|
||||
if (IS_ALIGNED(dst_width, 8)) {
|
||||
ScaleRowDown4 = filtering ? ScaleRowDown4Box_NEON : ScaleRowDown4_NEON;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_SCALEROWDOWN4_SSE2)
|
||||
if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) {
|
||||
ScaleRowDown4 = filtering ? ScaleRowDown4Box_SSE2 : ScaleRowDown4_SSE2;
|
||||
if (TestCpuFlag(kCpuHasSSE2)) {
|
||||
ScaleRowDown4 = filtering ?
|
||||
ScaleRowDown4Box_Any_SSE2 : ScaleRowDown4_Any_SSE2;
|
||||
if (IS_ALIGNED(dst_width, 8)) {
|
||||
ScaleRowDown4 = filtering ? ScaleRowDown4Box_SSE2 : ScaleRowDown4_SSE2;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_SCALEROWDOWN4_AVX2)
|
||||
if (TestCpuFlag(kCpuHasAVX2)) {
|
||||
ScaleRowDown4 = filtering ?
|
||||
ScaleRowDown4Box_Any_AVX2 : ScaleRowDown4_Any_AVX2;
|
||||
if (IS_ALIGNED(dst_width, 16)) {
|
||||
ScaleRowDown4 = filtering ? ScaleRowDown4Box_AVX2 : ScaleRowDown4_AVX2;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_SCALEROWDOWN4_MIPS_DSPR2)
|
||||
@ -249,24 +286,42 @@ static void ScalePlaneDown34(int src_width, int src_height,
|
||||
ScaleRowDown34_1 = ScaleRowDown34_1_Box_C;
|
||||
}
|
||||
#if defined(HAS_SCALEROWDOWN34_NEON)
|
||||
if (TestCpuFlag(kCpuHasNEON) && (dst_width % 24 == 0)) {
|
||||
if (TestCpuFlag(kCpuHasNEON)) {
|
||||
if (!filtering) {
|
||||
ScaleRowDown34_0 = ScaleRowDown34_NEON;
|
||||
ScaleRowDown34_1 = ScaleRowDown34_NEON;
|
||||
ScaleRowDown34_0 = ScaleRowDown34_Any_NEON;
|
||||
ScaleRowDown34_1 = ScaleRowDown34_Any_NEON;
|
||||
} else {
|
||||
ScaleRowDown34_0 = ScaleRowDown34_0_Box_NEON;
|
||||
ScaleRowDown34_1 = ScaleRowDown34_1_Box_NEON;
|
||||
ScaleRowDown34_0 = ScaleRowDown34_0_Box_Any_NEON;
|
||||
ScaleRowDown34_1 = ScaleRowDown34_1_Box_Any_NEON;
|
||||
}
|
||||
if (dst_width % 24 == 0) {
|
||||
if (!filtering) {
|
||||
ScaleRowDown34_0 = ScaleRowDown34_NEON;
|
||||
ScaleRowDown34_1 = ScaleRowDown34_NEON;
|
||||
} else {
|
||||
ScaleRowDown34_0 = ScaleRowDown34_0_Box_NEON;
|
||||
ScaleRowDown34_1 = ScaleRowDown34_1_Box_NEON;
|
||||
}
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_SCALEROWDOWN34_SSSE3)
|
||||
if (TestCpuFlag(kCpuHasSSSE3) && (dst_width % 24 == 0)) {
|
||||
if (TestCpuFlag(kCpuHasSSSE3)) {
|
||||
if (!filtering) {
|
||||
ScaleRowDown34_0 = ScaleRowDown34_SSSE3;
|
||||
ScaleRowDown34_1 = ScaleRowDown34_SSSE3;
|
||||
ScaleRowDown34_0 = ScaleRowDown34_Any_SSSE3;
|
||||
ScaleRowDown34_1 = ScaleRowDown34_Any_SSSE3;
|
||||
} else {
|
||||
ScaleRowDown34_0 = ScaleRowDown34_0_Box_SSSE3;
|
||||
ScaleRowDown34_1 = ScaleRowDown34_1_Box_SSSE3;
|
||||
ScaleRowDown34_0 = ScaleRowDown34_0_Box_Any_SSSE3;
|
||||
ScaleRowDown34_1 = ScaleRowDown34_1_Box_Any_SSSE3;
|
||||
}
|
||||
if (dst_width % 24 == 0) {
|
||||
if (!filtering) {
|
||||
ScaleRowDown34_0 = ScaleRowDown34_SSSE3;
|
||||
ScaleRowDown34_1 = ScaleRowDown34_SSSE3;
|
||||
} else {
|
||||
ScaleRowDown34_0 = ScaleRowDown34_0_Box_SSSE3;
|
||||
ScaleRowDown34_1 = ScaleRowDown34_1_Box_SSSE3;
|
||||
}
|
||||
}
|
||||
}
|
||||
#endif
|
||||
@ -422,23 +477,41 @@ static void ScalePlaneDown38(int src_width, int src_height,
|
||||
ScaleRowDown38_3 = ScaleRowDown38_3_Box_C;
|
||||
ScaleRowDown38_2 = ScaleRowDown38_2_Box_C;
|
||||
}
|
||||
|
||||
#if defined(HAS_SCALEROWDOWN38_NEON)
|
||||
if (TestCpuFlag(kCpuHasNEON) && (dst_width % 12 == 0)) {
|
||||
if (TestCpuFlag(kCpuHasNEON)) {
|
||||
if (!filtering) {
|
||||
ScaleRowDown38_3 = ScaleRowDown38_NEON;
|
||||
ScaleRowDown38_2 = ScaleRowDown38_NEON;
|
||||
ScaleRowDown38_3 = ScaleRowDown38_Any_NEON;
|
||||
ScaleRowDown38_2 = ScaleRowDown38_Any_NEON;
|
||||
} else {
|
||||
ScaleRowDown38_3 = ScaleRowDown38_3_Box_NEON;
|
||||
ScaleRowDown38_2 = ScaleRowDown38_2_Box_NEON;
|
||||
ScaleRowDown38_3 = ScaleRowDown38_3_Box_Any_NEON;
|
||||
ScaleRowDown38_2 = ScaleRowDown38_2_Box_Any_NEON;
|
||||
}
|
||||
if (dst_width % 12 == 0) {
|
||||
if (!filtering) {
|
||||
ScaleRowDown38_3 = ScaleRowDown38_NEON;
|
||||
ScaleRowDown38_2 = ScaleRowDown38_NEON;
|
||||
} else {
|
||||
ScaleRowDown38_3 = ScaleRowDown38_3_Box_NEON;
|
||||
ScaleRowDown38_2 = ScaleRowDown38_2_Box_NEON;
|
||||
}
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_SCALEROWDOWN38_SSSE3)
|
||||
if (TestCpuFlag(kCpuHasSSSE3) && (dst_width % 24 == 0)) {
|
||||
if (TestCpuFlag(kCpuHasSSSE3)) {
|
||||
if (!filtering) {
|
||||
ScaleRowDown38_3 = ScaleRowDown38_Any_SSSE3;
|
||||
ScaleRowDown38_2 = ScaleRowDown38_Any_SSSE3;
|
||||
} else {
|
||||
ScaleRowDown38_3 = ScaleRowDown38_3_Box_Any_SSSE3;
|
||||
ScaleRowDown38_2 = ScaleRowDown38_2_Box_Any_SSSE3;
|
||||
}
|
||||
if (dst_width % 12 == 0 && !filtering) {
|
||||
ScaleRowDown38_3 = ScaleRowDown38_SSSE3;
|
||||
ScaleRowDown38_2 = ScaleRowDown38_SSSE3;
|
||||
} else {
|
||||
}
|
||||
if (dst_width % 6 == 0 && filtering) {
|
||||
ScaleRowDown38_3 = ScaleRowDown38_3_Box_SSSE3;
|
||||
ScaleRowDown38_2 = ScaleRowDown38_2_Box_SSSE3;
|
||||
}
|
||||
@ -559,65 +632,7 @@ static void ScalePlaneDown38_16(int src_width, int src_height,
|
||||
}
|
||||
}
|
||||
|
||||
static __inline uint32 SumBox(int iboxwidth, int iboxheight,
|
||||
ptrdiff_t src_stride, const uint8* src_ptr) {
|
||||
uint32 sum = 0u;
|
||||
int y;
|
||||
assert(iboxwidth > 0);
|
||||
assert(iboxheight > 0);
|
||||
for (y = 0; y < iboxheight; ++y) {
|
||||
int x;
|
||||
for (x = 0; x < iboxwidth; ++x) {
|
||||
sum += src_ptr[x];
|
||||
}
|
||||
src_ptr += src_stride;
|
||||
}
|
||||
return sum;
|
||||
}
|
||||
|
||||
static __inline uint32 SumBox_16(int iboxwidth, int iboxheight,
|
||||
ptrdiff_t src_stride, const uint16* src_ptr) {
|
||||
uint32 sum = 0u;
|
||||
int y;
|
||||
assert(iboxwidth > 0);
|
||||
assert(iboxheight > 0);
|
||||
for (y = 0; y < iboxheight; ++y) {
|
||||
int x;
|
||||
for (x = 0; x < iboxwidth; ++x) {
|
||||
sum += src_ptr[x];
|
||||
}
|
||||
src_ptr += src_stride;
|
||||
}
|
||||
return sum;
|
||||
}
|
||||
|
||||
static void ScalePlaneBoxRow_C(int dst_width, int boxheight,
|
||||
int x, int dx, ptrdiff_t src_stride,
|
||||
const uint8* src_ptr, uint8* dst_ptr) {
|
||||
int i;
|
||||
int boxwidth;
|
||||
for (i = 0; i < dst_width; ++i) {
|
||||
int ix = x >> 16;
|
||||
x += dx;
|
||||
boxwidth = (x >> 16) - ix;
|
||||
*dst_ptr++ = SumBox(boxwidth, boxheight, src_stride, src_ptr + ix) /
|
||||
(boxwidth * boxheight);
|
||||
}
|
||||
}
|
||||
|
||||
static void ScalePlaneBoxRow_16_C(int dst_width, int boxheight,
|
||||
int x, int dx, ptrdiff_t src_stride,
|
||||
const uint16* src_ptr, uint16* dst_ptr) {
|
||||
int i;
|
||||
int boxwidth;
|
||||
for (i = 0; i < dst_width; ++i) {
|
||||
int ix = x >> 16;
|
||||
x += dx;
|
||||
boxwidth = (x >> 16) - ix;
|
||||
*dst_ptr++ = SumBox_16(boxwidth, boxheight, src_stride, src_ptr + ix) /
|
||||
(boxwidth * boxheight);
|
||||
}
|
||||
}
|
||||
#define MIN1(x) ((x) < 1 ? 1 : (x))
|
||||
|
||||
static __inline uint32 SumPixels(int iboxwidth, const uint16* src_ptr) {
|
||||
uint32 sum = 0u;
|
||||
@ -643,15 +658,15 @@ static void ScaleAddCols2_C(int dst_width, int boxheight, int x, int dx,
|
||||
const uint16* src_ptr, uint8* dst_ptr) {
|
||||
int i;
|
||||
int scaletbl[2];
|
||||
int minboxwidth = (dx >> 16);
|
||||
int minboxwidth = dx >> 16;
|
||||
int* scaleptr = scaletbl - minboxwidth;
|
||||
int boxwidth;
|
||||
scaletbl[0] = 65536 / (minboxwidth * boxheight);
|
||||
scaletbl[1] = 65536 / ((minboxwidth + 1) * boxheight);
|
||||
scaletbl[0] = 65536 / (MIN1(minboxwidth) * boxheight);
|
||||
scaletbl[1] = 65536 / (MIN1(minboxwidth + 1) * boxheight);
|
||||
for (i = 0; i < dst_width; ++i) {
|
||||
int ix = x >> 16;
|
||||
x += dx;
|
||||
boxwidth = (x >> 16) - ix;
|
||||
boxwidth = MIN1((x >> 16) - ix);
|
||||
*dst_ptr++ = SumPixels(boxwidth, src_ptr + ix) * scaleptr[boxwidth] >> 16;
|
||||
}
|
||||
}
|
||||
@ -660,25 +675,36 @@ static void ScaleAddCols2_16_C(int dst_width, int boxheight, int x, int dx,
|
||||
const uint32* src_ptr, uint16* dst_ptr) {
|
||||
int i;
|
||||
int scaletbl[2];
|
||||
int minboxwidth = (dx >> 16);
|
||||
int minboxwidth = dx >> 16;
|
||||
int* scaleptr = scaletbl - minboxwidth;
|
||||
int boxwidth;
|
||||
scaletbl[0] = 65536 / (minboxwidth * boxheight);
|
||||
scaletbl[1] = 65536 / ((minboxwidth + 1) * boxheight);
|
||||
scaletbl[0] = 65536 / (MIN1(minboxwidth) * boxheight);
|
||||
scaletbl[1] = 65536 / (MIN1(minboxwidth + 1) * boxheight);
|
||||
for (i = 0; i < dst_width; ++i) {
|
||||
int ix = x >> 16;
|
||||
x += dx;
|
||||
boxwidth = (x >> 16) - ix;
|
||||
*dst_ptr++ = SumPixels_16(boxwidth, src_ptr + ix) *
|
||||
scaleptr[boxwidth] >> 16;
|
||||
boxwidth = MIN1((x >> 16) - ix);
|
||||
*dst_ptr++ =
|
||||
SumPixels_16(boxwidth, src_ptr + ix) * scaleptr[boxwidth] >> 16;
|
||||
}
|
||||
}
|
||||
|
||||
static void ScaleAddCols0_C(int dst_width, int boxheight, int x, int,
|
||||
const uint16* src_ptr, uint8* dst_ptr) {
|
||||
int scaleval = 65536 / boxheight;
|
||||
int i;
|
||||
src_ptr += (x >> 16);
|
||||
for (i = 0; i < dst_width; ++i) {
|
||||
*dst_ptr++ = src_ptr[i] * scaleval >> 16;
|
||||
}
|
||||
}
|
||||
|
||||
static void ScaleAddCols1_C(int dst_width, int boxheight, int x, int dx,
|
||||
const uint16* src_ptr, uint8* dst_ptr) {
|
||||
int boxwidth = (dx >> 16);
|
||||
int boxwidth = MIN1(dx >> 16);
|
||||
int scaleval = 65536 / (boxwidth * boxheight);
|
||||
int i;
|
||||
x >>= 16;
|
||||
for (i = 0; i < dst_width; ++i) {
|
||||
*dst_ptr++ = SumPixels(boxwidth, src_ptr + x) * scaleval >> 16;
|
||||
x += boxwidth;
|
||||
@ -687,7 +713,7 @@ static void ScaleAddCols1_C(int dst_width, int boxheight, int x, int dx,
|
||||
|
||||
static void ScaleAddCols1_16_C(int dst_width, int boxheight, int x, int dx,
|
||||
const uint32* src_ptr, uint16* dst_ptr) {
|
||||
int boxwidth = (dx >> 16);
|
||||
int boxwidth = MIN1(dx >> 16);
|
||||
int scaleval = 65536 / (boxwidth * boxheight);
|
||||
int i;
|
||||
for (i = 0; i < dst_width; ++i) {
|
||||
@ -707,7 +733,7 @@ static void ScalePlaneBox(int src_width, int src_height,
|
||||
int dst_width, int dst_height,
|
||||
int src_stride, int dst_stride,
|
||||
const uint8* src_ptr, uint8* dst_ptr) {
|
||||
int j;
|
||||
int j, k;
|
||||
// Initial source x/y coordinate and step values as 16.16 fixed point.
|
||||
int x = 0;
|
||||
int y = 0;
|
||||
@ -717,10 +743,40 @@ static void ScalePlaneBox(int src_width, int src_height,
|
||||
ScaleSlope(src_width, src_height, dst_width, dst_height, kFilterBox,
|
||||
&x, &y, &dx, &dy);
|
||||
src_width = Abs(src_width);
|
||||
// TODO(fbarchard): Remove this and make AddRows handle boxheight 1.
|
||||
if (!IS_ALIGNED(src_width, 16) || dst_height * 2 > src_height) {
|
||||
uint8* dst = dst_ptr;
|
||||
int j;
|
||||
{
|
||||
// Allocate a row buffer of uint16.
|
||||
align_buffer_64(row16, src_width * 2);
|
||||
void (*ScaleAddCols)(int dst_width, int boxheight, int x, int dx,
|
||||
const uint16* src_ptr, uint8* dst_ptr) =
|
||||
(dx & 0xffff) ? ScaleAddCols2_C:
|
||||
((dx != 0x10000) ? ScaleAddCols1_C : ScaleAddCols0_C);
|
||||
void (*ScaleAddRow)(const uint8* src_ptr, uint16* dst_ptr, int src_width) =
|
||||
ScaleAddRow_C;
|
||||
#if defined(HAS_SCALEADDROW_SSE2)
|
||||
if (TestCpuFlag(kCpuHasSSE2)) {
|
||||
ScaleAddRow = ScaleAddRow_Any_SSE2;
|
||||
if (IS_ALIGNED(src_width, 16)) {
|
||||
ScaleAddRow = ScaleAddRow_SSE2;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_SCALEADDROW_AVX2)
|
||||
if (TestCpuFlag(kCpuHasAVX2)) {
|
||||
ScaleAddRow = ScaleAddRow_Any_AVX2;
|
||||
if (IS_ALIGNED(src_width, 32)) {
|
||||
ScaleAddRow = ScaleAddRow_AVX2;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_SCALEADDROW_NEON)
|
||||
if (TestCpuFlag(kCpuHasNEON)) {
|
||||
ScaleAddRow = ScaleAddRow_Any_NEON;
|
||||
if (IS_ALIGNED(src_width, 16)) {
|
||||
ScaleAddRow = ScaleAddRow_NEON;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
for (j = 0; j < dst_height; ++j) {
|
||||
int boxheight;
|
||||
int iy = y >> 16;
|
||||
@ -729,46 +785,13 @@ static void ScalePlaneBox(int src_width, int src_height,
|
||||
if (y > max_y) {
|
||||
y = max_y;
|
||||
}
|
||||
boxheight = (y >> 16) - iy;
|
||||
ScalePlaneBoxRow_C(dst_width, boxheight,
|
||||
x, dx, src_stride,
|
||||
src, dst);
|
||||
dst += dst_stride;
|
||||
}
|
||||
return;
|
||||
}
|
||||
{
|
||||
// Allocate a row buffer of uint16.
|
||||
align_buffer_64(row16, src_width * 2);
|
||||
void (*ScaleAddCols)(int dst_width, int boxheight, int x, int dx,
|
||||
const uint16* src_ptr, uint8* dst_ptr) =
|
||||
(dx & 0xffff) ? ScaleAddCols2_C: ScaleAddCols1_C;
|
||||
void (*ScaleAddRows)(const uint8* src_ptr, ptrdiff_t src_stride,
|
||||
uint16* dst_ptr, int src_width, int src_height) = ScaleAddRows_C;
|
||||
|
||||
#if defined(HAS_SCALEADDROWS_SSE2)
|
||||
if (TestCpuFlag(kCpuHasSSE2)
|
||||
#ifdef AVOID_OVERREAD
|
||||
&& IS_ALIGNED(src_width, 16)
|
||||
#endif
|
||||
) {
|
||||
ScaleAddRows = ScaleAddRows_SSE2;
|
||||
}
|
||||
#endif
|
||||
|
||||
for (j = 0; j < dst_height; ++j) {
|
||||
int boxheight;
|
||||
int iy = y >> 16;
|
||||
const uint8* src = src_ptr + iy * src_stride;
|
||||
y += dy;
|
||||
if (y > (src_height << 16)) {
|
||||
y = (src_height << 16);
|
||||
      boxheight = MIN1((y >> 16) - iy);
      memset(row16, 0, src_width * 2);
      for (k = 0; k < boxheight; ++k) {
        ScaleAddRow(src, (uint16 *)(row16), src_width);
        src += src_stride;
      }
      boxheight = (y >> 16) - iy;
      ScaleAddRows(src, src_stride, (uint16*)(row16),
                   src_width, boxheight);
      ScaleAddCols(dst_width, boxheight, x, dx, (uint16*)(row16),
                   dst_ptr);
      ScaleAddCols(dst_width, boxheight, x, dx, (uint16*)(row16), dst_ptr);
      dst_ptr += dst_stride;
    }
    free_aligned_buffer_64(row16);
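The loop body above replaces the old ScaleAddRows call with per-row accumulation: the uint16 row buffer is cleared, boxheight source rows are summed into it with ScaleAddRow, and ScaleAddCols then divides each horizontal box by its area using the 65536 / (boxwidth * boxheight) reciprocals prepared earlier. A scalar sketch of that final step, using a hypothetical helper name rather than a libyuv function:

// Illustrative only. row_sums already holds 'boxheight' source rows summed
// together, as produced by the ScaleAddRow loop above.
static uint8 BoxAverage(const uint16* row_sums, int ix, int boxwidth,
                        int boxheight) {
  uint32 sum = 0;
  for (int x = 0; x < boxwidth; ++x) {
    sum += row_sums[ix + x];
  }
  // 16.16 fixed-point reciprocal of the box area, same idea as scaletbl[].
  int scale = 65536 / (boxwidth * boxheight);
  return (uint8)((sum * scale) >> 16);
}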
@ -779,7 +802,7 @@ static void ScalePlaneBox_16(int src_width, int src_height,
|
||||
int dst_width, int dst_height,
|
||||
int src_stride, int dst_stride,
|
||||
const uint16* src_ptr, uint16* dst_ptr) {
|
||||
int j;
|
||||
int j, k;
|
||||
// Initial source x/y coordinate and step values as 16.16 fixed point.
|
||||
int x = 0;
|
||||
int y = 0;
|
||||
@ -789,10 +812,21 @@ static void ScalePlaneBox_16(int src_width, int src_height,
|
||||
ScaleSlope(src_width, src_height, dst_width, dst_height, kFilterBox,
|
||||
&x, &y, &dx, &dy);
|
||||
src_width = Abs(src_width);
|
||||
// TODO(fbarchard): Remove this and make AddRows handle boxheight 1.
|
||||
if (!IS_ALIGNED(src_width, 16) || dst_height * 2 > src_height) {
|
||||
uint16* dst = dst_ptr;
|
||||
int j;
|
||||
{
|
||||
// Allocate a row buffer of uint32.
|
||||
align_buffer_64(row32, src_width * 4);
|
||||
void (*ScaleAddCols)(int dst_width, int boxheight, int x, int dx,
|
||||
const uint32* src_ptr, uint16* dst_ptr) =
|
||||
(dx & 0xffff) ? ScaleAddCols2_16_C: ScaleAddCols1_16_C;
|
||||
void (*ScaleAddRow)(const uint16* src_ptr, uint32* dst_ptr, int src_width) =
|
||||
ScaleAddRow_16_C;
|
||||
|
||||
#if defined(HAS_SCALEADDROW_16_SSE2)
|
||||
if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(src_width, 16)) {
|
||||
ScaleAddRow = ScaleAddRow_16_SSE2;
|
||||
}
|
||||
#endif
|
||||
|
||||
for (j = 0; j < dst_height; ++j) {
|
||||
int boxheight;
|
||||
int iy = y >> 16;
|
||||
@ -801,46 +835,13 @@ static void ScalePlaneBox_16(int src_width, int src_height,
|
||||
if (y > max_y) {
|
||||
y = max_y;
|
||||
}
|
||||
boxheight = (y >> 16) - iy;
|
||||
ScalePlaneBoxRow_16_C(dst_width, boxheight,
|
||||
x, dx, src_stride,
|
||||
src, dst);
|
||||
dst += dst_stride;
|
||||
}
|
||||
return;
|
||||
}
|
||||
{
|
||||
// Allocate a row buffer of uint32.
|
||||
align_buffer_64(row32, src_width * 4);
|
||||
void (*ScaleAddCols)(int dst_width, int boxheight, int x, int dx,
|
||||
const uint32* src_ptr, uint16* dst_ptr) =
|
||||
(dx & 0xffff) ? ScaleAddCols2_16_C: ScaleAddCols1_16_C;
|
||||
void (*ScaleAddRows)(const uint16* src_ptr, ptrdiff_t src_stride,
|
||||
uint32* dst_ptr, int src_width, int src_height) = ScaleAddRows_16_C;
|
||||
|
||||
#if defined(HAS_SCALEADDROWS_16_SSE2)
|
||||
if (TestCpuFlag(kCpuHasSSE2)
|
||||
#ifdef AVOID_OVERREAD
|
||||
&& IS_ALIGNED(src_width, 16)
|
||||
#endif
|
||||
) {
|
||||
ScaleAddRows = ScaleAddRows_16_SSE2;
|
||||
}
|
||||
#endif
|
||||
|
||||
for (j = 0; j < dst_height; ++j) {
|
||||
int boxheight;
|
||||
int iy = y >> 16;
|
||||
const uint16* src = src_ptr + iy * src_stride;
|
||||
y += dy;
|
||||
if (y > (src_height << 16)) {
|
||||
y = (src_height << 16);
|
||||
boxheight = MIN1((y >> 16) - iy);
|
||||
memset(row32, 0, src_width * 4);
|
||||
for (k = 0; k < boxheight; ++k) {
|
||||
ScaleAddRow(src, (uint32 *)(row32), src_width);
|
||||
src += src_stride;
|
||||
}
|
||||
boxheight = (y >> 16) - iy;
|
||||
ScaleAddRows(src, src_stride, (uint32*)(row32),
|
||||
src_width, boxheight);
|
||||
ScaleAddCols(dst_width, boxheight, x, dx, (uint32*)(row32),
|
||||
dst_ptr);
|
||||
ScaleAddCols(dst_width, boxheight, x, dx, (uint32*)(row32), dst_ptr);
|
||||
dst_ptr += dst_stride;
|
||||
}
|
||||
free_aligned_buffer_64(row32);
|
||||
@ -920,6 +921,14 @@ void ScalePlaneBilinearDown(int src_width, int src_height,
|
||||
if (TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) {
|
||||
ScaleFilterCols = ScaleFilterCols_SSSE3;
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_SCALEFILTERCOLS_NEON)
|
||||
if (TestCpuFlag(kCpuHasNEON) && src_width < 32768) {
|
||||
ScaleFilterCols = ScaleFilterCols_Any_NEON;
|
||||
if (IS_ALIGNED(dst_width, 8)) {
|
||||
ScaleFilterCols = ScaleFilterCols_NEON;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
if (y > max_y) {
|
||||
y = max_y;
|
||||
@ -1057,8 +1066,8 @@ void ScalePlaneBilinearUp(int src_width, int src_height,
|
||||
ptrdiff_t src_stride, int dst_width, int source_y_fraction) =
|
||||
InterpolateRow_C;
|
||||
void (*ScaleFilterCols)(uint8* dst_ptr, const uint8* src_ptr,
|
||||
int dst_width, int x, int dx) =
|
||||
filtering ? ScaleFilterCols_C : ScaleCols_C;
|
||||
int dst_width, int x, int dx) =
|
||||
filtering ? ScaleFilterCols_C : ScaleCols_C;
|
||||
ScaleSlope(src_width, src_height, dst_width, dst_height, filtering,
|
||||
&x, &y, &dx, &dy);
|
||||
src_width = Abs(src_width);
|
||||
@ -1111,6 +1120,14 @@ void ScalePlaneBilinearUp(int src_width, int src_height,
|
||||
if (filtering && TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) {
|
||||
ScaleFilterCols = ScaleFilterCols_SSSE3;
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_SCALEFILTERCOLS_NEON)
|
||||
if (filtering && TestCpuFlag(kCpuHasNEON) && src_width < 32768) {
|
||||
ScaleFilterCols = ScaleFilterCols_Any_NEON;
|
||||
if (IS_ALIGNED(dst_width, 8)) {
|
||||
ScaleFilterCols = ScaleFilterCols_NEON;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
if (!filtering && src_width * 2 == dst_width && x < 0x8000) {
|
||||
ScaleFilterCols = ScaleColsUp2_C;
|
||||
@ -1129,7 +1146,7 @@ void ScalePlaneBilinearUp(int src_width, int src_height,
|
||||
const uint8* src = src_ptr + yi * src_stride;
|
||||
|
||||
// Allocate 2 row buffers.
|
||||
const int kRowSize = (dst_width + 15) & ~15;
|
||||
const int kRowSize = (dst_width + 31) & ~31;
|
||||
align_buffer_64(row, kRowSize * 2);
|
||||
|
||||
uint8* rowptr = row;
|
||||
@ -1188,8 +1205,8 @@ void ScalePlaneBilinearUp_16(int src_width, int src_height,
|
||||
ptrdiff_t src_stride, int dst_width, int source_y_fraction) =
|
||||
InterpolateRow_16_C;
|
||||
void (*ScaleFilterCols)(uint16* dst_ptr, const uint16* src_ptr,
|
||||
int dst_width, int x, int dx) =
|
||||
filtering ? ScaleFilterCols_16_C : ScaleCols_16_C;
|
||||
int dst_width, int x, int dx) =
|
||||
filtering ? ScaleFilterCols_16_C : ScaleCols_16_C;
|
||||
ScaleSlope(src_width, src_height, dst_width, dst_height, filtering,
|
||||
&x, &y, &dx, &dy);
|
||||
src_width = Abs(src_width);
|
||||
@ -1260,7 +1277,7 @@ void ScalePlaneBilinearUp_16(int src_width, int src_height,
|
||||
const uint16* src = src_ptr + yi * src_stride;
|
||||
|
||||
// Allocate 2 row buffers.
|
||||
const int kRowSize = (dst_width + 15) & ~15;
|
||||
const int kRowSize = (dst_width + 31) & ~31;
|
||||
align_buffer_64(row, kRowSize * 4);
|
||||
|
||||
uint16* rowptr = (uint16*)row;
|
||||
@ -1334,8 +1351,7 @@ static void ScalePlaneSimple(int src_width, int src_height,
|
||||
}
|
||||
|
||||
for (i = 0; i < dst_height; ++i) {
|
||||
ScaleCols(dst_ptr, src_ptr + (y >> 16) * src_stride,
|
||||
dst_width, x, dx);
|
||||
ScaleCols(dst_ptr, src_ptr + (y >> 16) * src_stride, dst_width, x, dx);
|
||||
dst_ptr += dst_stride;
|
||||
y += dy;
|
||||
}
|
||||
@ -1385,8 +1401,7 @@ void ScalePlane(const uint8* src, int src_stride,
|
||||
enum FilterMode filtering) {
|
||||
// Simplify filtering when possible.
|
||||
filtering = ScaleFilterReduce(src_width, src_height,
|
||||
dst_width, dst_height,
|
||||
filtering);
|
||||
dst_width, dst_height, filtering);
|
||||
|
||||
// Negative height means invert the image.
|
||||
if (src_height < 0) {
|
||||
@ -1402,9 +1417,9 @@ void ScalePlane(const uint8* src, int src_stride,
|
||||
CopyPlane(src, src_stride, dst, dst_stride, dst_width, dst_height);
|
||||
return;
|
||||
}
|
||||
if (dst_width == src_width) {
|
||||
if (dst_width == src_width && filtering != kFilterBox) {
|
||||
int dy = FixedDiv(src_height, dst_height);
|
||||
// Arbitrary scale vertically, but unscaled vertically.
|
||||
// Arbitrary scale vertically, but unscaled horizontally.
|
||||
ScalePlaneVertical(src_height,
|
||||
dst_width, dst_height,
|
||||
src_stride, dst_stride, src, dst,
|
||||
@ -1435,7 +1450,7 @@ void ScalePlane(const uint8* src, int src_stride,
|
||||
return;
|
||||
}
|
||||
if (4 * dst_width == src_width && 4 * dst_height == src_height &&
|
||||
filtering != kFilterBilinear) {
|
||||
(filtering == kFilterBox || filtering == kFilterNone)) {
|
||||
// optimized, 1/4
|
||||
ScalePlaneDown4(src_width, src_height, dst_width, dst_height,
|
||||
src_stride, dst_stride, src, dst, filtering);
|
||||
@ -1469,8 +1484,7 @@ void ScalePlane_16(const uint16* src, int src_stride,
|
||||
enum FilterMode filtering) {
|
||||
// Simplify filtering when possible.
|
||||
filtering = ScaleFilterReduce(src_width, src_height,
|
||||
dst_width, dst_height,
|
||||
filtering);
|
||||
dst_width, dst_height, filtering);
|
||||
|
||||
// Negative height means invert the image.
|
||||
if (src_height < 0) {
|
||||
@ -1563,6 +1577,7 @@ int I420Scale(const uint8* src_y, int src_stride_y,
|
||||
int dst_halfwidth = SUBSAMPLE(dst_width, 1, 1);
|
||||
int dst_halfheight = SUBSAMPLE(dst_height, 1, 1);
|
||||
if (!src_y || !src_u || !src_v || src_width == 0 || src_height == 0 ||
|
||||
src_width > 32768 || src_height > 32768 ||
|
||||
!dst_y || !dst_u || !dst_v || dst_width <= 0 || dst_height <= 0) {
|
||||
return -1;
|
||||
}
|
||||
@ -1594,6 +1609,7 @@ int I420Scale_16(const uint16* src_y, int src_stride_y,
|
||||
int dst_halfwidth = SUBSAMPLE(dst_width, 1, 1);
|
||||
int dst_halfheight = SUBSAMPLE(dst_height, 1, 1);
|
||||
if (!src_y || !src_u || !src_v || src_width == 0 || src_height == 0 ||
|
||||
src_width > 32768 || src_height > 32768 ||
|
||||
!dst_y || !dst_u || !dst_v || dst_width <= 0 || dst_height <= 0) {
|
||||
return -1;
|
||||
}
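For reference, a minimal call into this API (the dimensions and strides below are illustrative assumptions; with the checks above, sources wider or taller than 32768 are rejected with -1):

#include "libyuv/scale.h"

/* Scale a tightly packed 320x240 I420 frame down to 160x120 (assumed sizes). */
int ScaleDownByTwo(const uint8* src_y, const uint8* src_u, const uint8* src_v,
                   uint8* dst_y, uint8* dst_u, uint8* dst_v) {
  return I420Scale(src_y, 320, src_u, 160, src_v, 160, 320, 240,
                   dst_y, 160, dst_u, 80, dst_v, 80, 160, 120,
                   kFilterBox);  /* returns 0 on success, -1 on bad parameters */
}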
|
||||
|
200
third_party/libyuv/source/scale_any.cc
vendored
Normal file
@ -0,0 +1,200 @@
|
||||
/*
|
||||
* Copyright 2015 The LibYuv Project Authors. All rights reserved.
|
||||
*
|
||||
* Use of this source code is governed by a BSD-style license
|
||||
* that can be found in the LICENSE file in the root of the source
|
||||
* tree. An additional intellectual property rights grant can be found
|
||||
* in the file PATENTS. All contributing project authors may
|
||||
* be found in the AUTHORS file in the root of the source tree.
|
||||
*/
|
||||
|
||||
#include "libyuv/scale.h"
|
||||
#include "libyuv/scale_row.h"
|
||||
|
||||
#include "libyuv/basic_types.h"
|
||||
|
||||
#ifdef __cplusplus
|
||||
namespace libyuv {
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
// Definition for ScaleFilterCols, ScaleARGBCols and ScaleARGBFilterCols
|
||||
#define CANY(NAMEANY, TERP_SIMD, TERP_C, BPP, MASK) \
|
||||
void NAMEANY(uint8* dst_ptr, const uint8* src_ptr, \
|
||||
int dst_width, int x, int dx) { \
|
||||
int n = dst_width & ~MASK; \
|
||||
if (n > 0) { \
|
||||
TERP_SIMD(dst_ptr, src_ptr, n, x, dx); \
|
||||
} \
|
||||
TERP_C(dst_ptr + n * BPP, src_ptr, \
|
||||
dst_width & MASK, x + n * dx, dx); \
|
||||
}
|
||||
|
||||
#ifdef HAS_SCALEFILTERCOLS_NEON
|
||||
CANY(ScaleFilterCols_Any_NEON, ScaleFilterCols_NEON, ScaleFilterCols_C, 1, 7)
|
||||
#endif
|
||||
#ifdef HAS_SCALEARGBCOLS_NEON
|
||||
CANY(ScaleARGBCols_Any_NEON, ScaleARGBCols_NEON, ScaleARGBCols_C, 4, 7)
|
||||
#endif
|
||||
#ifdef HAS_SCALEARGBFILTERCOLS_NEON
|
||||
CANY(ScaleARGBFilterCols_Any_NEON, ScaleARGBFilterCols_NEON,
|
||||
ScaleARGBFilterCols_C, 4, 3)
|
||||
#endif
|
||||
#undef CANY
|
||||
|
||||
// Fixed scale down.
|
||||
#define SDANY(NAMEANY, SCALEROWDOWN_SIMD, SCALEROWDOWN_C, FACTOR, BPP, MASK) \
|
||||
void NAMEANY(const uint8* src_ptr, ptrdiff_t src_stride, \
|
||||
uint8* dst_ptr, int dst_width) { \
|
||||
int r = (int)((unsigned int)dst_width % (MASK + 1)); \
|
||||
int n = dst_width - r; \
|
||||
if (n > 0) { \
|
||||
SCALEROWDOWN_SIMD(src_ptr, src_stride, dst_ptr, n); \
|
||||
} \
|
||||
SCALEROWDOWN_C(src_ptr + (n * FACTOR) * BPP, src_stride, \
|
||||
dst_ptr + n * BPP, r); \
|
||||
}
|
||||
|
||||
#ifdef HAS_SCALEROWDOWN2_SSE2
|
||||
SDANY(ScaleRowDown2_Any_SSE2, ScaleRowDown2_SSE2, ScaleRowDown2_C, 2, 1, 15)
|
||||
SDANY(ScaleRowDown2Linear_Any_SSE2, ScaleRowDown2Linear_SSE2,
|
||||
ScaleRowDown2Linear_C, 2, 1, 15)
|
||||
SDANY(ScaleRowDown2Box_Any_SSE2, ScaleRowDown2Box_SSE2, ScaleRowDown2Box_C,
|
||||
2, 1, 15)
|
||||
#endif
|
||||
#ifdef HAS_SCALEROWDOWN2_AVX2
|
||||
SDANY(ScaleRowDown2_Any_AVX2, ScaleRowDown2_AVX2, ScaleRowDown2_C, 2, 1, 31)
|
||||
SDANY(ScaleRowDown2Linear_Any_AVX2, ScaleRowDown2Linear_AVX2,
|
||||
ScaleRowDown2Linear_C, 2, 1, 31)
|
||||
SDANY(ScaleRowDown2Box_Any_AVX2, ScaleRowDown2Box_AVX2, ScaleRowDown2Box_C,
|
||||
2, 1, 31)
|
||||
#endif
|
||||
#ifdef HAS_SCALEROWDOWN2_NEON
|
||||
SDANY(ScaleRowDown2_Any_NEON, ScaleRowDown2_NEON, ScaleRowDown2_C, 2, 1, 15)
|
||||
SDANY(ScaleRowDown2Linear_Any_NEON, ScaleRowDown2Linear_NEON,
|
||||
ScaleRowDown2Linear_C, 2, 1, 15)
|
||||
SDANY(ScaleRowDown2Box_Any_NEON, ScaleRowDown2Box_NEON,
|
||||
ScaleRowDown2Box_C, 2, 1, 15)
|
||||
#endif
|
||||
#ifdef HAS_SCALEROWDOWN4_SSE2
|
||||
SDANY(ScaleRowDown4_Any_SSE2, ScaleRowDown4_SSE2, ScaleRowDown4_C, 4, 1, 7)
|
||||
SDANY(ScaleRowDown4Box_Any_SSE2, ScaleRowDown4Box_SSE2, ScaleRowDown4Box_C,
|
||||
4, 1, 7)
|
||||
#endif
|
||||
#ifdef HAS_SCALEROWDOWN4_AVX2
|
||||
SDANY(ScaleRowDown4_Any_AVX2, ScaleRowDown4_AVX2, ScaleRowDown4_C, 4, 1, 15)
|
||||
SDANY(ScaleRowDown4Box_Any_AVX2, ScaleRowDown4Box_AVX2, ScaleRowDown4Box_C,
|
||||
4, 1, 15)
|
||||
#endif
|
||||
#ifdef HAS_SCALEROWDOWN4_NEON
|
||||
SDANY(ScaleRowDown4_Any_NEON, ScaleRowDown4_NEON, ScaleRowDown4_C, 4, 1, 7)
|
||||
SDANY(ScaleRowDown4Box_Any_NEON, ScaleRowDown4Box_NEON, ScaleRowDown4Box_C,
|
||||
4, 1, 7)
|
||||
#endif
|
||||
#ifdef HAS_SCALEROWDOWN34_SSSE3
|
||||
SDANY(ScaleRowDown34_Any_SSSE3, ScaleRowDown34_SSSE3,
|
||||
ScaleRowDown34_C, 4 / 3, 1, 23)
|
||||
SDANY(ScaleRowDown34_0_Box_Any_SSSE3, ScaleRowDown34_0_Box_SSSE3,
|
||||
ScaleRowDown34_0_Box_C, 4 / 3, 1, 23)
|
||||
SDANY(ScaleRowDown34_1_Box_Any_SSSE3, ScaleRowDown34_1_Box_SSSE3,
|
||||
ScaleRowDown34_1_Box_C, 4 / 3, 1, 23)
|
||||
#endif
|
||||
#ifdef HAS_SCALEROWDOWN34_NEON
|
||||
SDANY(ScaleRowDown34_Any_NEON, ScaleRowDown34_NEON,
|
||||
ScaleRowDown34_C, 4 / 3, 1, 23)
|
||||
SDANY(ScaleRowDown34_0_Box_Any_NEON, ScaleRowDown34_0_Box_NEON,
|
||||
ScaleRowDown34_0_Box_C, 4 / 3, 1, 23)
|
||||
SDANY(ScaleRowDown34_1_Box_Any_NEON, ScaleRowDown34_1_Box_NEON,
|
||||
ScaleRowDown34_1_Box_C, 4 / 3, 1, 23)
|
||||
#endif
|
||||
#ifdef HAS_SCALEROWDOWN38_SSSE3
|
||||
SDANY(ScaleRowDown38_Any_SSSE3, ScaleRowDown38_SSSE3,
|
||||
ScaleRowDown38_C, 8 / 3, 1, 11)
|
||||
SDANY(ScaleRowDown38_3_Box_Any_SSSE3, ScaleRowDown38_3_Box_SSSE3,
|
||||
ScaleRowDown38_3_Box_C, 8 / 3, 1, 5)
|
||||
SDANY(ScaleRowDown38_2_Box_Any_SSSE3, ScaleRowDown38_2_Box_SSSE3,
|
||||
ScaleRowDown38_2_Box_C, 8 / 3, 1, 5)
|
||||
#endif
|
||||
#ifdef HAS_SCALEROWDOWN38_NEON
|
||||
SDANY(ScaleRowDown38_Any_NEON, ScaleRowDown38_NEON,
|
||||
ScaleRowDown38_C, 8 / 3, 1, 11)
|
||||
SDANY(ScaleRowDown38_3_Box_Any_NEON, ScaleRowDown38_3_Box_NEON,
|
||||
ScaleRowDown38_3_Box_C, 8 / 3, 1, 11)
|
||||
SDANY(ScaleRowDown38_2_Box_Any_NEON, ScaleRowDown38_2_Box_NEON,
|
||||
ScaleRowDown38_2_Box_C, 8 / 3, 1, 11)
|
||||
#endif
|
||||
|
||||
#ifdef HAS_SCALEARGBROWDOWN2_SSE2
|
||||
SDANY(ScaleARGBRowDown2_Any_SSE2, ScaleARGBRowDown2_SSE2,
|
||||
ScaleARGBRowDown2_C, 2, 4, 3)
|
||||
SDANY(ScaleARGBRowDown2Linear_Any_SSE2, ScaleARGBRowDown2Linear_SSE2,
|
||||
ScaleARGBRowDown2Linear_C, 2, 4, 3)
|
||||
SDANY(ScaleARGBRowDown2Box_Any_SSE2, ScaleARGBRowDown2Box_SSE2,
|
||||
ScaleARGBRowDown2Box_C, 2, 4, 3)
|
||||
#endif
|
||||
#ifdef HAS_SCALEARGBROWDOWN2_NEON
|
||||
SDANY(ScaleARGBRowDown2_Any_NEON, ScaleARGBRowDown2_NEON,
|
||||
ScaleARGBRowDown2_C, 2, 4, 7)
|
||||
SDANY(ScaleARGBRowDown2Linear_Any_NEON, ScaleARGBRowDown2Linear_NEON,
|
||||
ScaleARGBRowDown2Linear_C, 2, 4, 7)
|
||||
SDANY(ScaleARGBRowDown2Box_Any_NEON, ScaleARGBRowDown2Box_NEON,
|
||||
ScaleARGBRowDown2Box_C, 2, 4, 7)
|
||||
#endif
|
||||
#undef SDANY
|
||||
|
||||
// Scale down by even scale factor.
|
||||
#define SDAANY(NAMEANY, SCALEROWDOWN_SIMD, SCALEROWDOWN_C, BPP, MASK) \
|
||||
void NAMEANY(const uint8* src_ptr, ptrdiff_t src_stride, int src_stepx, \
|
||||
uint8* dst_ptr, int dst_width) { \
|
||||
int r = (int)((unsigned int)dst_width % (MASK + 1)); \
|
||||
int n = dst_width - r; \
|
||||
if (n > 0) { \
|
||||
SCALEROWDOWN_SIMD(src_ptr, src_stride, src_stepx, dst_ptr, n); \
|
||||
} \
|
||||
SCALEROWDOWN_C(src_ptr + (n * src_stepx) * BPP, src_stride, \
|
||||
src_stepx, dst_ptr + n * BPP, r); \
|
||||
}
|
||||
|
||||
#ifdef HAS_SCALEARGBROWDOWNEVEN_SSE2
|
||||
SDAANY(ScaleARGBRowDownEven_Any_SSE2, ScaleARGBRowDownEven_SSE2,
|
||||
ScaleARGBRowDownEven_C, 4, 3)
|
||||
SDAANY(ScaleARGBRowDownEvenBox_Any_SSE2, ScaleARGBRowDownEvenBox_SSE2,
|
||||
ScaleARGBRowDownEvenBox_C, 4, 3)
|
||||
#endif
|
||||
#ifdef HAS_SCALEARGBROWDOWNEVEN_NEON
|
||||
SDAANY(ScaleARGBRowDownEven_Any_NEON, ScaleARGBRowDownEven_NEON,
|
||||
ScaleARGBRowDownEven_C, 4, 3)
|
||||
SDAANY(ScaleARGBRowDownEvenBox_Any_NEON, ScaleARGBRowDownEvenBox_NEON,
|
||||
ScaleARGBRowDownEvenBox_C, 4, 3)
|
||||
#endif
|
||||
|
||||
// Add rows box filter scale down.
|
||||
#define SAANY(NAMEANY, SCALEADDROW_SIMD, SCALEADDROW_C, MASK) \
|
||||
void NAMEANY(const uint8* src_ptr, uint16* dst_ptr, int src_width) { \
|
||||
int n = src_width & ~MASK; \
|
||||
if (n > 0) { \
|
||||
SCALEADDROW_SIMD(src_ptr, dst_ptr, n); \
|
||||
} \
|
||||
SCALEADDROW_C(src_ptr + n, dst_ptr + n, src_width & MASK); \
|
||||
}
|
||||
|
||||
#ifdef HAS_SCALEADDROW_SSE2
|
||||
SAANY(ScaleAddRow_Any_SSE2, ScaleAddRow_SSE2, ScaleAddRow_C, 15)
|
||||
#endif
|
||||
#ifdef HAS_SCALEADDROW_AVX2
|
||||
SAANY(ScaleAddRow_Any_AVX2, ScaleAddRow_AVX2, ScaleAddRow_C, 31)
|
||||
#endif
|
||||
#ifdef HAS_SCALEADDROW_NEON
|
||||
SAANY(ScaleAddRow_Any_NEON, ScaleAddRow_NEON, ScaleAddRow_C, 15)
|
||||
#endif
|
||||
#undef SAANY
|
||||
|
||||
#ifdef __cplusplus
|
||||
} // extern "C"
|
||||
} // namespace libyuv
|
||||
#endif
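All of the *_Any_* wrappers defined above share one shape: run the SIMD kernel over the largest width its mask allows, then let the C kernel finish the leftover columns. A standalone sketch of what an SDANY expansion amounts to (hypothetical names, plain C stand-ins for both kernels, 1 byte per pixel, factor 2, mask 15):

#include <stddef.h>
#include <stdint.h>

/* Stand-in for the full-speed kernel; the real one only accepts widths that are multiples of 16. */
static void RowDown2_Simd(const uint8_t* src, ptrdiff_t stride,
                          uint8_t* dst, int dst_width) {
  int i;
  (void)stride;
  for (i = 0; i < dst_width; ++i) dst[i] = src[2 * i];
}

/* C kernel: handles any width. */
static void RowDown2_C(const uint8_t* src, ptrdiff_t stride,
                       uint8_t* dst, int dst_width) {
  int i;
  (void)stride;
  for (i = 0; i < dst_width; ++i) dst[i] = src[2 * i];
}

/* The SDANY(..., FACTOR = 2, BPP = 1, MASK = 15) pattern, written out. */
void RowDown2_Any(const uint8_t* src, ptrdiff_t stride,
                  uint8_t* dst, int dst_width) {
  int r = dst_width & 15;        /* columns left over for the C tail */
  int n = dst_width - r;         /* largest multiple of 16           */
  if (n > 0) {
    RowDown2_Simd(src, stride, dst, n);
  }
  /* n output pixels consumed n * FACTOR source pixels, so offset both pointers. */
  RowDown2_C(src + n * 2, stride, dst + n, r);
}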
|
||||
|
||||
|
||||
|
||||
|
||||
|
116
third_party/libyuv/source/scale_argb.cc
vendored
@ -53,16 +53,27 @@ static void ScaleARGBDown2(int src_width, int src_height,
|
||||
}
|
||||
|
||||
#if defined(HAS_SCALEARGBROWDOWN2_SSE2)
|
||||
if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 4)) {
|
||||
ScaleARGBRowDown2 = filtering == kFilterNone ? ScaleARGBRowDown2_SSE2 :
|
||||
(filtering == kFilterLinear ? ScaleARGBRowDown2Linear_SSE2 :
|
||||
ScaleARGBRowDown2Box_SSE2);
|
||||
if (TestCpuFlag(kCpuHasSSE2)) {
|
||||
ScaleARGBRowDown2 = filtering == kFilterNone ? ScaleARGBRowDown2_Any_SSE2 :
|
||||
(filtering == kFilterLinear ? ScaleARGBRowDown2Linear_Any_SSE2 :
|
||||
ScaleARGBRowDown2Box_Any_SSE2);
|
||||
if (IS_ALIGNED(dst_width, 4)) {
|
||||
ScaleARGBRowDown2 = filtering == kFilterNone ? ScaleARGBRowDown2_SSE2 :
|
||||
(filtering == kFilterLinear ? ScaleARGBRowDown2Linear_SSE2 :
|
||||
ScaleARGBRowDown2Box_SSE2);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_SCALEARGBROWDOWN2_NEON)
|
||||
if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(dst_width, 8)) {
|
||||
ScaleARGBRowDown2 = filtering ? ScaleARGBRowDown2Box_NEON :
|
||||
ScaleARGBRowDown2_NEON;
|
||||
if (TestCpuFlag(kCpuHasNEON)) {
|
||||
ScaleARGBRowDown2 = filtering == kFilterNone ? ScaleARGBRowDown2_Any_NEON :
|
||||
(filtering == kFilterLinear ? ScaleARGBRowDown2Linear_Any_NEON :
|
||||
ScaleARGBRowDown2Box_Any_NEON);
|
||||
if (IS_ALIGNED(dst_width, 8)) {
|
||||
ScaleARGBRowDown2 = filtering == kFilterNone ? ScaleARGBRowDown2_NEON :
|
||||
(filtering == kFilterLinear ? ScaleARGBRowDown2Linear_NEON :
|
||||
ScaleARGBRowDown2Box_NEON);
|
||||
}
|
||||
}
|
||||
#endif
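The dispatch blocks here now work in two steps: if the CPU flag is set, take the *_Any_* wrapper (safe for any dst_width), and only switch to the unguarded SIMD kernel when dst_width meets its alignment mask. A condensed sketch of that selection logic (hypothetical generic names, not the actual libyuv pointers):

#include <stddef.h>
#include <stdint.h>

typedef void (*RowFn)(const uint8_t* src, ptrdiff_t stride,
                      uint8_t* dst, int dst_width);

/* Two-level pick: C fallback -> Any wrapper -> fully aligned SIMD kernel. */
static RowFn PickRowFn(int has_simd, int dst_width, int align_mask,
                       RowFn c_fn, RowFn any_fn, RowFn simd_fn) {
  RowFn fn = c_fn;
  if (has_simd) {
    fn = any_fn;                          /* SIMD body plus C tail, any width */
    if ((dst_width & align_mask) == 0) {
      fn = simd_fn;                       /* whole row in SIMD, no tail       */
    }
  }
  return fn;
}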
|
||||
|
||||
@ -86,7 +97,7 @@ static void ScaleARGBDown4Box(int src_width, int src_height,
|
||||
int x, int dx, int y, int dy) {
|
||||
int j;
|
||||
// Allocate 2 rows of ARGB.
|
||||
const int kRowSize = (dst_width * 2 * 4 + 15) & ~15;
|
||||
const int kRowSize = (dst_width * 2 * 4 + 31) & ~31;
|
||||
align_buffer_64(row, kRowSize * 2);
|
||||
int row_stride = src_stride * (dy >> 16);
|
||||
void (*ScaleARGBRowDown2)(const uint8* src_argb, ptrdiff_t src_stride,
|
||||
@ -96,15 +107,22 @@ static void ScaleARGBDown4Box(int src_width, int src_height,
|
||||
assert(dx == 65536 * 4); // Test scale factor of 4.
|
||||
assert((dy & 0x3ffff) == 0); // Test vertical scale is multiple of 4.
|
||||
#if defined(HAS_SCALEARGBROWDOWN2_SSE2)
|
||||
if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 4)) {
|
||||
ScaleARGBRowDown2 = ScaleARGBRowDown2Box_SSE2;
|
||||
if (TestCpuFlag(kCpuHasSSE2)) {
|
||||
ScaleARGBRowDown2 = ScaleARGBRowDown2Box_Any_SSE2;
|
||||
if (IS_ALIGNED(dst_width, 4)) {
|
||||
ScaleARGBRowDown2 = ScaleARGBRowDown2Box_SSE2;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_SCALEARGBROWDOWN2_NEON)
|
||||
if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(dst_width, 8)) {
|
||||
ScaleARGBRowDown2 = ScaleARGBRowDown2Box_NEON;
|
||||
if (TestCpuFlag(kCpuHasNEON)) {
|
||||
ScaleARGBRowDown2 = ScaleARGBRowDown2Box_Any_NEON;
|
||||
if (IS_ALIGNED(dst_width, 8)) {
|
||||
ScaleARGBRowDown2 = ScaleARGBRowDown2Box_NEON;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
for (j = 0; j < dst_height; ++j) {
|
||||
ScaleARGBRowDown2(src_argb, src_stride, row, dst_width * 2);
|
||||
ScaleARGBRowDown2(src_argb + src_stride * 2, src_stride,
|
||||
@ -135,15 +153,23 @@ static void ScaleARGBDownEven(int src_width, int src_height,
|
||||
assert(IS_ALIGNED(src_height, 2));
|
||||
src_argb += (y >> 16) * src_stride + (x >> 16) * 4;
|
||||
#if defined(HAS_SCALEARGBROWDOWNEVEN_SSE2)
|
||||
if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 4)) {
|
||||
ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_SSE2 :
|
||||
ScaleARGBRowDownEven_SSE2;
|
||||
if (TestCpuFlag(kCpuHasSSE2)) {
|
||||
ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_Any_SSE2 :
|
||||
ScaleARGBRowDownEven_Any_SSE2;
|
||||
if (IS_ALIGNED(dst_width, 4)) {
|
||||
ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_SSE2 :
|
||||
ScaleARGBRowDownEven_SSE2;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_SCALEARGBROWDOWNEVEN_NEON)
|
||||
if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(dst_width, 4)) {
|
||||
ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_NEON :
|
||||
ScaleARGBRowDownEven_NEON;
|
||||
if (TestCpuFlag(kCpuHasNEON)) {
|
||||
ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_Any_NEON :
|
||||
ScaleARGBRowDownEven_Any_NEON;
|
||||
if (IS_ALIGNED(dst_width, 4)) {
|
||||
ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_NEON :
|
||||
ScaleARGBRowDownEven_NEON;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
@ -229,6 +255,14 @@ static void ScaleARGBBilinearDown(int src_width, int src_height,
|
||||
if (TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) {
|
||||
ScaleARGBFilterCols = ScaleARGBFilterCols_SSSE3;
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_SCALEARGBFILTERCOLS_NEON)
|
||||
if (TestCpuFlag(kCpuHasNEON)) {
|
||||
ScaleARGBFilterCols = ScaleARGBFilterCols_Any_NEON;
|
||||
if (IS_ALIGNED(dst_width, 4)) {
|
||||
ScaleARGBFilterCols = ScaleARGBFilterCols_NEON;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
// TODO(fbarchard): Consider not allocating row buffer for kFilterLinear.
|
||||
// Allocate a row of ARGB.
|
||||
@ -321,10 +355,26 @@ static void ScaleARGBBilinearUp(int src_width, int src_height,
|
||||
ScaleARGBFilterCols = ScaleARGBFilterCols_SSSE3;
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_SCALEARGBFILTERCOLS_NEON)
|
||||
if (filtering && TestCpuFlag(kCpuHasNEON)) {
|
||||
ScaleARGBFilterCols = ScaleARGBFilterCols_Any_NEON;
|
||||
if (IS_ALIGNED(dst_width, 4)) {
|
||||
ScaleARGBFilterCols = ScaleARGBFilterCols_NEON;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_SCALEARGBCOLS_SSE2)
|
||||
if (!filtering && TestCpuFlag(kCpuHasSSE2) && src_width < 32768) {
|
||||
ScaleARGBFilterCols = ScaleARGBCols_SSE2;
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_SCALEARGBCOLS_NEON)
|
||||
if (!filtering && TestCpuFlag(kCpuHasNEON)) {
|
||||
ScaleARGBFilterCols = ScaleARGBCols_Any_NEON;
|
||||
if (IS_ALIGNED(dst_width, 8)) {
|
||||
ScaleARGBFilterCols = ScaleARGBCols_NEON;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
if (!filtering && src_width * 2 == dst_width && x < 0x8000) {
|
||||
ScaleARGBFilterCols = ScaleARGBColsUp2_C;
|
||||
@ -344,7 +394,7 @@ static void ScaleARGBBilinearUp(int src_width, int src_height,
|
||||
const uint8* src = src_argb + yi * src_stride;
|
||||
|
||||
// Allocate 2 rows of ARGB.
|
||||
const int kRowSize = (dst_width * 4 + 15) & ~15;
|
||||
const int kRowSize = (dst_width * 4 + 31) & ~31;
|
||||
align_buffer_64(row, kRowSize * 2);
|
||||
|
||||
uint8* rowptr = row;
|
||||
@ -495,10 +545,26 @@ static void ScaleYUVToARGBBilinearUp(int src_width, int src_height,
|
||||
ScaleARGBFilterCols = ScaleARGBFilterCols_SSSE3;
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_SCALEARGBFILTERCOLS_NEON)
|
||||
if (filtering && TestCpuFlag(kCpuHasNEON)) {
|
||||
ScaleARGBFilterCols = ScaleARGBFilterCols_Any_NEON;
|
||||
if (IS_ALIGNED(dst_width, 4)) {
|
||||
ScaleARGBFilterCols = ScaleARGBFilterCols_NEON;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_SCALEARGBCOLS_SSE2)
|
||||
if (!filtering && TestCpuFlag(kCpuHasSSE2) && src_width < 32768) {
|
||||
ScaleARGBFilterCols = ScaleARGBCols_SSE2;
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_SCALEARGBCOLS_NEON)
|
||||
if (!filtering && TestCpuFlag(kCpuHasNEON)) {
|
||||
ScaleARGBFilterCols = ScaleARGBCols_Any_NEON;
|
||||
if (IS_ALIGNED(dst_width, 8)) {
|
||||
ScaleARGBFilterCols = ScaleARGBCols_NEON;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
if (!filtering && src_width * 2 == dst_width && x < 0x8000) {
|
||||
ScaleARGBFilterCols = ScaleARGBColsUp2_C;
|
||||
@ -521,7 +587,7 @@ static void ScaleYUVToARGBBilinearUp(int src_width, int src_height,
|
||||
const uint8* src_row_v = src_v + uv_yi * src_stride_v;
|
||||
|
||||
// Allocate 2 rows of ARGB.
|
||||
const int kRowSize = (dst_width * 4 + 15) & ~15;
|
||||
const int kRowSize = (dst_width * 4 + 31) & ~31;
|
||||
align_buffer_64(row, kRowSize * 2);
|
||||
|
||||
// Allocate 1 row of ARGB for source conversion.
|
||||
@ -606,6 +672,14 @@ static void ScaleARGBSimple(int src_width, int src_height,
|
||||
if (TestCpuFlag(kCpuHasSSE2) && src_width < 32768) {
|
||||
ScaleARGBCols = ScaleARGBCols_SSE2;
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_SCALEARGBCOLS_NEON)
|
||||
if (TestCpuFlag(kCpuHasNEON)) {
|
||||
ScaleARGBCols = ScaleARGBCols_Any_NEON;
|
||||
if (IS_ALIGNED(dst_width, 8)) {
|
||||
ScaleARGBCols = ScaleARGBCols_NEON;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
if (src_width * 2 == dst_width && x < 0x8000) {
|
||||
ScaleARGBCols = ScaleARGBColsUp2_C;
|
||||
@ -744,6 +818,7 @@ int ARGBScaleClip(const uint8* src_argb, int src_stride_argb,
|
||||
if (!src_argb || src_width == 0 || src_height == 0 ||
|
||||
!dst_argb || dst_width <= 0 || dst_height <= 0 ||
|
||||
clip_x < 0 || clip_y < 0 ||
|
||||
clip_width > 32768 || clip_height > 32768 ||
|
||||
(clip_x + clip_width) > dst_width ||
|
||||
(clip_y + clip_height) > dst_height) {
|
||||
return -1;
|
||||
@ -762,6 +837,7 @@ int ARGBScale(const uint8* src_argb, int src_stride_argb,
|
||||
int dst_width, int dst_height,
|
||||
enum FilterMode filtering) {
|
||||
if (!src_argb || src_width == 0 || src_height == 0 ||
|
||||
src_width > 32768 || src_height > 32768 ||
|
||||
!dst_argb || dst_width <= 0 || dst_height <= 0) {
|
||||
return -1;
|
||||
}
|
||||
|
48
third_party/libyuv/source/scale_common.cc
vendored
@ -621,39 +621,31 @@ void ScaleRowDown38_2_Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
|
||||
}
|
||||
}
|
||||
|
||||
void ScaleAddRows_C(const uint8* src_ptr, ptrdiff_t src_stride,
|
||||
uint16* dst_ptr, int src_width, int src_height) {
|
||||
void ScaleAddRow_C(const uint8* src_ptr, uint16* dst_ptr, int src_width) {
|
||||
int x;
|
||||
assert(src_width > 0);
|
||||
assert(src_height > 0);
|
||||
for (x = 0; x < src_width; ++x) {
|
||||
const uint8* s = src_ptr + x;
|
||||
unsigned int sum = 0u;
|
||||
int y;
|
||||
for (y = 0; y < src_height; ++y) {
|
||||
sum += s[0];
|
||||
s += src_stride;
|
||||
}
|
||||
// TODO(fbarchard): Consider limitting height to 256 to avoid overflow.
|
||||
dst_ptr[x] = sum < 65535u ? sum : 65535u;
|
||||
for (x = 0; x < src_width - 1; x += 2) {
|
||||
dst_ptr[0] += src_ptr[0];
|
||||
dst_ptr[1] += src_ptr[1];
|
||||
src_ptr += 2;
|
||||
dst_ptr += 2;
|
||||
}
|
||||
if (src_width & 1) {
|
||||
dst_ptr[0] += src_ptr[0];
|
||||
}
|
||||
}
|
||||
|
||||
void ScaleAddRows_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
|
||||
uint32* dst_ptr, int src_width, int src_height) {
|
||||
void ScaleAddRow_16_C(const uint16* src_ptr, uint32* dst_ptr, int src_width) {
|
||||
int x;
|
||||
assert(src_width > 0);
|
||||
assert(src_height > 0);
|
||||
for (x = 0; x < src_width; ++x) {
|
||||
const uint16* s = src_ptr + x;
|
||||
unsigned int sum = 0u;
|
||||
int y;
|
||||
for (y = 0; y < src_height; ++y) {
|
||||
sum += s[0];
|
||||
s += src_stride;
|
||||
}
|
||||
// No risk of overflow here now
|
||||
dst_ptr[x] = sum;
|
||||
for (x = 0; x < src_width - 1; x += 2) {
|
||||
dst_ptr[0] += src_ptr[0];
|
||||
dst_ptr[1] += src_ptr[1];
|
||||
src_ptr += 2;
|
||||
dst_ptr += 2;
|
||||
}
|
||||
if (src_width & 1) {
|
||||
dst_ptr[0] += src_ptr[0];
|
||||
}
|
||||
}
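The old helpers summed a whole column strip internally; with the new single-row ScaleAddRow_C / ScaleAddRow_16_C, the box-filter caller clears a widened accumulator row once and then adds source rows one at a time. This mirrors the memset/ScaleAddRow loop visible in scale.cc above; the following is a sketch under that reading, not the exact ScalePlaneBox code:

#include <stddef.h>
#include <stdint.h>
#include <string.h>

static void ScaleAddRowRef(const uint8_t* src, uint16_t* dst, int src_width) {
  int x;
  for (x = 0; x < src_width; ++x) {
    dst[x] = (uint16_t)(dst[x] + src[x]);   /* same accumulation as ScaleAddRow_C */
  }
}

/* Sum 'boxheight' source rows into row16; a later column pass averages them. */
static void AccumulateBox(const uint8_t* src, ptrdiff_t src_stride,
                          uint16_t* row16, int src_width, int boxheight) {
  int k;
  memset(row16, 0, (size_t)src_width * sizeof(uint16_t));
  for (k = 0; k < boxheight; ++k) {
    ScaleAddRowRef(src, row16, src_width);
    src += src_stride;
  }
}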
|
||||
|
||||
@ -1030,10 +1022,6 @@ enum FilterMode ScaleFilterReduce(int src_width, int src_height,
|
||||
if (dst_width * 2 >= src_width && dst_height * 2 >= src_height) {
|
||||
filtering = kFilterBilinear;
|
||||
}
|
||||
// If scaling to larger, switch from Box to Bilinear.
|
||||
if (dst_width >= src_width || dst_height >= src_height) {
|
||||
filtering = kFilterBilinear;
|
||||
}
|
||||
}
|
||||
if (filtering == kFilterBilinear) {
|
||||
if (src_height == 1) {
|
||||
|
@ -573,44 +573,38 @@ void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr,
|
||||
);
|
||||
}
|
||||
|
||||
// Reads 16xN bytes and produces 16 shorts at a time.
|
||||
void ScaleAddRows_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
|
||||
uint16* dst_ptr, int src_width, int src_height) {
|
||||
int tmp_height = 0;
|
||||
intptr_t tmp_src = 0;
|
||||
asm volatile (
|
||||
"mov %0,%3 \n" // row pointer
|
||||
"mov %5,%2 \n" // height
|
||||
"pxor %%xmm0,%%xmm0 \n" // clear accumulators
|
||||
"pxor %%xmm1,%%xmm1 \n"
|
||||
"pxor %%xmm4,%%xmm4 \n"
|
||||
"sub $0x1,%5 \n"
|
||||
|
||||
LABELALIGN
|
||||
"1: \n"
|
||||
"movdqu " MEMACCESS(0) ",%%xmm0 \n"
|
||||
"mov %0,%3 \n"
|
||||
"add %6,%0 \n"
|
||||
"movdqa %%xmm0,%%xmm1 \n"
|
||||
"punpcklbw %%xmm4,%%xmm0 \n"
|
||||
"punpckhbw %%xmm4,%%xmm1 \n"
|
||||
"mov %5,%2 \n"
|
||||
"test %2,%2 \n"
|
||||
"je 3f \n"
|
||||
|
||||
LABELALIGN
|
||||
"2: \n"
|
||||
"movdqu " MEMACCESS(0) ",%%xmm2 \n"
|
||||
"add %6,%0 \n"
|
||||
"movdqu " MEMACCESS(3) ",%%xmm2 \n"
|
||||
"add %6,%3 \n"
|
||||
"movdqa %%xmm2,%%xmm3 \n"
|
||||
"punpcklbw %%xmm4,%%xmm2 \n"
|
||||
"punpckhbw %%xmm4,%%xmm3 \n"
|
||||
"paddusw %%xmm2,%%xmm0 \n"
|
||||
"paddusw %%xmm3,%%xmm1 \n"
|
||||
"sub $0x1,%2 \n"
|
||||
"jg 2b \n"
|
||||
"jg 1b \n"
|
||||
|
||||
LABELALIGN
|
||||
"3: \n"
|
||||
"movdqu %%xmm0," MEMACCESS(1) " \n"
|
||||
"movdqu %%xmm1," MEMACCESS2(0x10,1) " \n"
|
||||
"lea " MEMLEA(0x10,3) ",%0 \n"
|
||||
"lea " MEMLEA(0x20,1) ",%1 \n"
|
||||
"lea " MEMLEA(0x10,0) ",%0 \n" // src_ptr += 16
|
||||
"mov %0,%3 \n" // row pointer
|
||||
"mov %5,%2 \n" // height
|
||||
"pxor %%xmm0,%%xmm0 \n" // clear accumulators
|
||||
"pxor %%xmm1,%%xmm1 \n"
|
||||
"sub $0x10,%4 \n"
|
||||
"jg 1b \n"
|
||||
: "+r"(src_ptr), // %0
|
||||
@ -799,8 +793,7 @@ void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb,
|
||||
// Reads 4 pixels at a time.
|
||||
// Alignment requirement: dst_argb 16 byte aligned.
|
||||
void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
|
||||
int src_stepx,
|
||||
uint8* dst_argb, int dst_width) {
|
||||
int src_stepx, uint8* dst_argb, int dst_width) {
|
||||
intptr_t src_stepx_x4 = (intptr_t)(src_stepx);
|
||||
intptr_t src_stepx_x12 = 0;
|
||||
asm volatile (
|
272
third_party/libyuv/source/scale_neon.cc
vendored
@ -43,6 +43,30 @@ void ScaleRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
|
||||
);
|
||||
}
|
||||
|
||||
// Read 32x1 average down and write 16x1.
|
||||
void ScaleRowDown2Linear_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
|
||||
uint8* dst, int dst_width) {
|
||||
asm volatile (
|
||||
".p2align 2 \n"
|
||||
"1: \n"
|
||||
MEMACCESS(0)
|
||||
"vld1.8 {q0, q1}, [%0]! \n" // load pixels and post inc
|
||||
"subs %2, %2, #16 \n" // 16 processed per loop
|
||||
"vpaddl.u8 q0, q0 \n" // add adjacent
|
||||
"vpaddl.u8 q1, q1 \n"
|
||||
"vrshrn.u16 d0, q0, #1 \n" // downshift, round and pack
|
||||
"vrshrn.u16 d1, q1, #1 \n"
|
||||
MEMACCESS(1)
|
||||
"vst1.8 {q0}, [%1]! \n"
|
||||
"bgt 1b \n"
|
||||
: "+r"(src_ptr), // %0
|
||||
"+r"(dst), // %1
|
||||
"+r"(dst_width) // %2
|
||||
:
|
||||
: "q0", "q1" // Clobber List
|
||||
);
|
||||
}
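What the new ScaleRowDown2Linear_NEON computes, in scalar form: each output byte is the rounded average of one horizontal pair (vpaddl.u8 adds adjacent bytes, vrshrn #1 is the rounding narrow shift). Reference sketch:

#include <stdint.h>

void RowDown2LinearRef(const uint8_t* src, uint8_t* dst, int dst_width) {
  int x;
  for (x = 0; x < dst_width; ++x) {
    dst[x] = (uint8_t)((src[2 * x] + src[2 * x + 1] + 1) >> 1);  /* round half up */
  }
}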
|
||||
|
||||
// Read 32x2 average down and write 16x1.
|
||||
void ScaleRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
|
||||
uint8* dst, int dst_width) {
|
||||
@ -517,6 +541,112 @@ void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr,
|
||||
);
|
||||
}
|
||||
|
||||
void ScaleAddRows_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
|
||||
uint16* dst_ptr, int src_width, int src_height) {
|
||||
const uint8* src_tmp = NULL;
|
||||
asm volatile (
|
||||
".p2align 2 \n"
|
||||
"1: \n"
|
||||
"mov %0, %1 \n"
|
||||
"mov r12, %5 \n"
|
||||
"veor q2, q2, q2 \n"
|
||||
"veor q3, q3, q3 \n"
|
||||
"2: \n"
|
||||
// load 16 pixels into q0
|
||||
MEMACCESS(0)
|
||||
"vld1.8 {q0}, [%0], %3 \n"
|
||||
"vaddw.u8 q3, q3, d1 \n"
|
||||
"vaddw.u8 q2, q2, d0 \n"
|
||||
"subs r12, r12, #1 \n"
|
||||
"bgt 2b \n"
|
||||
MEMACCESS(2)
|
||||
"vst1.16 {q2, q3}, [%2]! \n" // store pixels
|
||||
"add %1, %1, #16 \n"
|
||||
"subs %4, %4, #16 \n" // 16 processed per loop
|
||||
"bgt 1b \n"
|
||||
: "+r"(src_tmp), // %0
|
||||
"+r"(src_ptr), // %1
|
||||
"+r"(dst_ptr), // %2
|
||||
"+r"(src_stride), // %3
|
||||
"+r"(src_width), // %4
|
||||
"+r"(src_height) // %5
|
||||
:
|
||||
: "memory", "cc", "r12", "q0", "q1", "q2", "q3" // Clobber List
|
||||
);
|
||||
}
|
||||
|
||||
// TODO(Yang Zhang): Investigate less load instructions for
|
||||
// the x/dx stepping
|
||||
#define LOAD2_DATA8_LANE(n) \
|
||||
"lsr %5, %3, #16 \n" \
|
||||
"add %6, %1, %5 \n" \
|
||||
"add %3, %3, %4 \n" \
|
||||
MEMACCESS(6) \
|
||||
"vld2.8 {d6["#n"], d7["#n"]}, [%6] \n"
|
||||
|
||||
void ScaleFilterCols_NEON(uint8* dst_ptr, const uint8* src_ptr,
|
||||
int dst_width, int x, int dx) {
|
||||
int dx_offset[4] = {0, 1, 2, 3};
|
||||
int* tmp = dx_offset;
|
||||
const uint8* src_tmp = src_ptr;
|
||||
asm volatile (
|
||||
".p2align 2 \n"
|
||||
"vdup.32 q0, %3 \n" // x
|
||||
"vdup.32 q1, %4 \n" // dx
|
||||
"vld1.32 {q2}, [%5] \n" // 0 1 2 3
|
||||
"vshl.i32 q3, q1, #2 \n" // 4 * dx
|
||||
"vmul.s32 q1, q1, q2 \n"
|
||||
// x , x + 1 * dx, x + 2 * dx, x + 3 * dx
|
||||
"vadd.s32 q1, q1, q0 \n"
|
||||
// x + 4 * dx, x + 5 * dx, x + 6 * dx, x + 7 * dx
|
||||
"vadd.s32 q2, q1, q3 \n"
|
||||
"vshl.i32 q0, q3, #1 \n" // 8 * dx
|
||||
"1: \n"
|
||||
LOAD2_DATA8_LANE(0)
|
||||
LOAD2_DATA8_LANE(1)
|
||||
LOAD2_DATA8_LANE(2)
|
||||
LOAD2_DATA8_LANE(3)
|
||||
LOAD2_DATA8_LANE(4)
|
||||
LOAD2_DATA8_LANE(5)
|
||||
LOAD2_DATA8_LANE(6)
|
||||
LOAD2_DATA8_LANE(7)
|
||||
"vmov q10, q1 \n"
|
||||
"vmov q11, q2 \n"
|
||||
"vuzp.16 q10, q11 \n"
|
||||
"vmovl.u8 q8, d6 \n"
|
||||
"vmovl.u8 q9, d7 \n"
|
||||
"vsubl.s16 q11, d18, d16 \n"
|
||||
"vsubl.s16 q12, d19, d17 \n"
|
||||
"vmovl.u16 q13, d20 \n"
|
||||
"vmovl.u16 q10, d21 \n"
|
||||
"vmul.s32 q11, q11, q13 \n"
|
||||
"vmul.s32 q12, q12, q10 \n"
|
||||
"vshrn.s32 d18, q11, #16 \n"
|
||||
"vshrn.s32 d19, q12, #16 \n"
|
||||
"vadd.s16 q8, q8, q9 \n"
|
||||
"vmovn.s16 d6, q8 \n"
|
||||
|
||||
MEMACCESS(0)
|
||||
"vst1.8 {d6}, [%0]! \n" // store pixels
|
||||
"vadd.s32 q1, q1, q0 \n"
|
||||
"vadd.s32 q2, q2, q0 \n"
|
||||
"subs %2, %2, #8 \n" // 8 processed per loop
|
||||
"bgt 1b \n"
|
||||
: "+r"(dst_ptr), // %0
|
||||
"+r"(src_ptr), // %1
|
||||
"+r"(dst_width), // %2
|
||||
"+r"(x), // %3
|
||||
"+r"(dx), // %4
|
||||
"+r"(tmp), // %5
|
||||
"+r"(src_tmp) // %6
|
||||
:
|
||||
: "memory", "cc", "q0", "q1", "q2", "q3",
|
||||
"q8", "q9", "q10", "q11", "q12", "q13"
|
||||
);
|
||||
}
|
||||
|
||||
#undef LOAD2_DATA8_LANE
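ScaleFilterCols_NEON above is a vectorized form of the usual 16.16 fixed-point horizontal blend: take the two source samples around x >> 16 and mix them by the low 16 fraction bits. A scalar sketch of that blend (the C path follows the same formula; treat this as illustrative rather than the exact ScaleFilterCols_C body):

#include <stdint.h>

void FilterColsRef(uint8_t* dst, const uint8_t* src, int dst_width,
                   int x, int dx) {
  int j;
  for (j = 0; j < dst_width; ++j) {
    int xi = x >> 16;                 /* integer source index         */
    int a = src[xi];
    int b = src[xi + 1];
    int f = x & 0xffff;               /* 16-bit fraction between a, b */
    dst[j] = (uint8_t)(a + ((f * (b - a)) >> 16));
    x += dx;
  }
}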
|
||||
|
||||
// 16x2 -> 16x1
|
||||
void ScaleFilterRows_NEON(uint8* dst_ptr,
|
||||
const uint8* src_ptr, ptrdiff_t src_stride,
|
||||
@ -640,6 +770,35 @@ void ScaleARGBRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
|
||||
);
|
||||
}
|
||||
|
||||
void ScaleARGBRowDown2Linear_NEON(const uint8* src_argb, ptrdiff_t src_stride,
|
||||
uint8* dst_argb, int dst_width) {
|
||||
asm volatile (
|
||||
".p2align 2 \n"
|
||||
"1: \n"
|
||||
MEMACCESS(0)
|
||||
"vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
|
||||
MEMACCESS(0)
|
||||
"vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels.
|
||||
"subs %2, %2, #8 \n" // 8 processed per loop
|
||||
"vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts.
|
||||
"vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
|
||||
"vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts.
|
||||
"vpaddl.u8 q3, q3 \n" // A 16 bytes -> 8 shorts.
|
||||
"vrshrn.u16 d0, q0, #1 \n" // downshift, round and pack
|
||||
"vrshrn.u16 d1, q1, #1 \n"
|
||||
"vrshrn.u16 d2, q2, #1 \n"
|
||||
"vrshrn.u16 d3, q3, #1 \n"
|
||||
MEMACCESS(1)
|
||||
"vst4.8 {d0, d1, d2, d3}, [%1]! \n"
|
||||
"bgt 1b \n"
|
||||
: "+r"(src_argb), // %0
|
||||
"+r"(dst_argb), // %1
|
||||
"+r"(dst_width) // %2
|
||||
:
|
||||
: "memory", "cc", "q0", "q1", "q2", "q3" // Clobber List
|
||||
);
|
||||
}
|
||||
|
||||
void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
|
||||
uint8* dst, int dst_width) {
|
||||
asm volatile (
|
||||
@ -757,6 +916,119 @@ void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb, ptrdiff_t src_stride,
|
||||
);
|
||||
}
|
||||
|
||||
// TODO(Yang Zhang): Investigate less load instructions for
|
||||
// the x/dx stepping
|
||||
#define LOAD1_DATA32_LANE(dn, n) \
|
||||
"lsr %5, %3, #16 \n" \
|
||||
"add %6, %1, %5, lsl #2 \n" \
|
||||
"add %3, %3, %4 \n" \
|
||||
MEMACCESS(6) \
|
||||
"vld1.32 {"#dn"["#n"]}, [%6] \n"
|
||||
|
||||
void ScaleARGBCols_NEON(uint8* dst_argb, const uint8* src_argb,
|
||||
int dst_width, int x, int dx) {
|
||||
int tmp = 0;
|
||||
const uint8* src_tmp = src_argb;
|
||||
asm volatile (
|
||||
".p2align 2 \n"
|
||||
"1: \n"
|
||||
LOAD1_DATA32_LANE(d0, 0)
|
||||
LOAD1_DATA32_LANE(d0, 1)
|
||||
LOAD1_DATA32_LANE(d1, 0)
|
||||
LOAD1_DATA32_LANE(d1, 1)
|
||||
LOAD1_DATA32_LANE(d2, 0)
|
||||
LOAD1_DATA32_LANE(d2, 1)
|
||||
LOAD1_DATA32_LANE(d3, 0)
|
||||
LOAD1_DATA32_LANE(d3, 1)
|
||||
|
||||
MEMACCESS(0)
|
||||
"vst1.32 {q0, q1}, [%0]! \n" // store pixels
|
||||
"subs %2, %2, #8 \n" // 8 processed per loop
|
||||
"bgt 1b \n"
|
||||
: "+r"(dst_argb), // %0
|
||||
"+r"(src_argb), // %1
|
||||
"+r"(dst_width), // %2
|
||||
"+r"(x), // %3
|
||||
"+r"(dx), // %4
|
||||
"+r"(tmp), // %5
|
||||
"+r"(src_tmp) // %6
|
||||
:
|
||||
: "memory", "cc", "q0", "q1"
|
||||
);
|
||||
}
|
||||
|
||||
#undef LOAD1_DATA32_LANE
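ScaleARGBCols_NEON is the nearest-neighbour column step for 32-bit ARGB pixels: copy the whole pixel at x >> 16 and advance x by dx for each output pixel. Scalar sketch (assumes the ARGB pointers are 4-byte aligned):

#include <stdint.h>

void ARGBColsRef(uint8_t* dst_argb, const uint8_t* src_argb,
                 int dst_width, int x, int dx) {
  const uint32_t* src = (const uint32_t*)src_argb;  /* 1 pixel == 4 bytes */
  uint32_t* dst = (uint32_t*)dst_argb;
  int j;
  for (j = 0; j < dst_width; ++j) {
    dst[j] = src[x >> 16];
    x += dx;
  }
}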
|
||||
|
||||
// TODO(Yang Zhang): Investigate less load instructions for
|
||||
// the x/dx stepping
|
||||
#define LOAD2_DATA32_LANE(dn1, dn2, n) \
|
||||
"lsr %5, %3, #16 \n" \
|
||||
"add %6, %1, %5, lsl #2 \n" \
|
||||
"add %3, %3, %4 \n" \
|
||||
MEMACCESS(6) \
|
||||
"vld2.32 {"#dn1"["#n"], "#dn2"["#n"]}, [%6] \n"
|
||||
|
||||
void ScaleARGBFilterCols_NEON(uint8* dst_argb, const uint8* src_argb,
|
||||
int dst_width, int x, int dx) {
|
||||
int dx_offset[4] = {0, 1, 2, 3};
|
||||
int* tmp = dx_offset;
|
||||
const uint8* src_tmp = src_argb;
|
||||
asm volatile (
|
||||
".p2align 2 \n"
|
||||
"vdup.32 q0, %3 \n" // x
|
||||
"vdup.32 q1, %4 \n" // dx
|
||||
"vld1.32 {q2}, [%5] \n" // 0 1 2 3
|
||||
"vshl.i32 q9, q1, #2 \n" // 4 * dx
|
||||
"vmul.s32 q1, q1, q2 \n"
|
||||
"vmov.i8 q3, #0x7f \n" // 0x7F
|
||||
"vmov.i16 q15, #0x7f \n" // 0x7F
|
||||
// x , x + 1 * dx, x + 2 * dx, x + 3 * dx
|
||||
"vadd.s32 q8, q1, q0 \n"
|
||||
"1: \n"
|
||||
// d0, d1: a
|
||||
// d2, d3: b
|
||||
LOAD2_DATA32_LANE(d0, d2, 0)
|
||||
LOAD2_DATA32_LANE(d0, d2, 1)
|
||||
LOAD2_DATA32_LANE(d1, d3, 0)
|
||||
LOAD2_DATA32_LANE(d1, d3, 1)
|
||||
"vshrn.i32 d22, q8, #9 \n"
|
||||
"vand.16 d22, d22, d30 \n"
|
||||
"vdup.8 d24, d22[0] \n"
|
||||
"vdup.8 d25, d22[2] \n"
|
||||
"vdup.8 d26, d22[4] \n"
|
||||
"vdup.8 d27, d22[6] \n"
|
||||
"vext.8 d4, d24, d25, #4 \n"
|
||||
"vext.8 d5, d26, d27, #4 \n" // f
|
||||
"veor.8 q10, q2, q3 \n" // 0x7f ^ f
|
||||
"vmull.u8 q11, d0, d20 \n"
|
||||
"vmull.u8 q12, d1, d21 \n"
|
||||
"vmull.u8 q13, d2, d4 \n"
|
||||
"vmull.u8 q14, d3, d5 \n"
|
||||
"vadd.i16 q11, q11, q13 \n"
|
||||
"vadd.i16 q12, q12, q14 \n"
|
||||
"vshrn.i16 d0, q11, #7 \n"
|
||||
"vshrn.i16 d1, q12, #7 \n"
|
||||
|
||||
MEMACCESS(0)
|
||||
"vst1.32 {d0, d1}, [%0]! \n" // store pixels
|
||||
"vadd.s32 q8, q8, q9 \n"
|
||||
"subs %2, %2, #4 \n" // 4 processed per loop
|
||||
"bgt 1b \n"
|
||||
: "+r"(dst_argb), // %0
|
||||
"+r"(src_argb), // %1
|
||||
"+r"(dst_width), // %2
|
||||
"+r"(x), // %3
|
||||
"+r"(dx), // %4
|
||||
"+r"(tmp), // %5
|
||||
"+r"(src_tmp) // %6
|
||||
:
|
||||
: "memory", "cc", "q0", "q1", "q2", "q3", "q8", "q9",
|
||||
"q10", "q11", "q12", "q13", "q14", "q15"
|
||||
);
|
||||
}
|
||||
|
||||
#undef LOAD2_DATA32_LANE
|
||||
|
||||
#endif // defined(__ARM_NEON__) && !defined(__aarch64__)
|
||||
|
||||
#ifdef __cplusplus
|
||||
|
374
third_party/libyuv/source/scale_neon64.cc
vendored
@ -27,8 +27,8 @@ void ScaleRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
|
||||
"1: \n"
|
||||
// load even pixels into v0, odd into v1
|
||||
MEMACCESS(0)
|
||||
"ld2 {v0.16b,v1.16b}, [%0], #32 \n"
|
||||
"subs %2, %2, #16 \n" // 16 processed per loop
|
||||
"ld2 {v0.16b,v1.16b}, [%0], #32 \n"
|
||||
"subs %w2, %w2, #16 \n" // 16 processed per loop
|
||||
MEMACCESS(1)
|
||||
"st1 {v1.16b}, [%1], #16 \n" // store odd pixels
|
||||
"b.gt 1b \n"
|
||||
@ -40,6 +40,29 @@ void ScaleRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
|
||||
);
|
||||
}
|
||||
|
||||
// Read 32x1 average down and write 16x1.
|
||||
void ScaleRowDown2Linear_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
|
||||
uint8* dst, int dst_width) {
|
||||
asm volatile (
|
||||
"1: \n"
|
||||
MEMACCESS(0)
|
||||
"ld1 {v0.16b,v1.16b}, [%0], #32 \n" // load pixels and post inc
|
||||
"subs %w2, %w2, #16 \n" // 16 processed per loop
|
||||
"uaddlp v0.8h, v0.16b \n" // add adjacent
|
||||
"uaddlp v1.8h, v1.16b \n"
|
||||
"rshrn v0.8b, v0.8h, #1 \n" // downshift, round and pack
|
||||
"rshrn2 v0.16b, v1.8h, #1 \n"
|
||||
MEMACCESS(1)
|
||||
"st1 {v0.16b}, [%1], #16 \n"
|
||||
"b.gt 1b \n"
|
||||
: "+r"(src_ptr), // %0
|
||||
"+r"(dst), // %1
|
||||
"+r"(dst_width) // %2
|
||||
:
|
||||
: "v0", "v1" // Clobber List
|
||||
);
|
||||
}
|
||||
|
||||
// Read 32x2 average down and write 16x1.
|
||||
void ScaleRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
|
||||
uint8* dst, int dst_width) {
|
||||
@ -51,7 +74,7 @@ void ScaleRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
|
||||
"ld1 {v0.16b,v1.16b}, [%0], #32 \n" // load row 1 and post inc
|
||||
MEMACCESS(1)
|
||||
"ld1 {v2.16b, v3.16b}, [%1], #32 \n" // load row 2 and post inc
|
||||
"subs %3, %3, #16 \n" // 16 processed per loop
|
||||
"subs %w3, %w3, #16 \n" // 16 processed per loop
|
||||
"uaddlp v0.8h, v0.16b \n" // row 1 add adjacent
|
||||
"uaddlp v1.8h, v1.16b \n"
|
||||
"uadalp v0.8h, v2.16b \n" // row 2 add adjacent + row1
|
||||
@ -76,7 +99,7 @@ void ScaleRowDown4_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
|
||||
"1: \n"
|
||||
MEMACCESS(0)
|
||||
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0
|
||||
"subs %2, %2, #8 \n" // 8 processed per loop
|
||||
"subs %w2, %w2, #8 \n" // 8 processed per loop
|
||||
MEMACCESS(1)
|
||||
"st1 {v2.8b}, [%1], #8 \n"
|
||||
"b.gt 1b \n"
|
||||
@ -103,7 +126,7 @@ asm volatile (
|
||||
"ld1 {v2.16b}, [%3], #16 \n"
|
||||
MEMACCESS(5)
|
||||
"ld1 {v3.16b}, [%4], #16 \n"
|
||||
"subs %5, %5, #4 \n"
|
||||
"subs %w5, %w5, #4 \n"
|
||||
"uaddlp v0.8h, v0.16b \n"
|
||||
"uadalp v0.8h, v1.16b \n"
|
||||
"uadalp v0.8h, v2.16b \n"
|
||||
@ -134,7 +157,7 @@ void ScaleRowDown34_NEON(const uint8* src_ptr,
|
||||
"1: \n"
|
||||
MEMACCESS(0)
|
||||
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0
|
||||
"subs %2, %2, #24 \n"
|
||||
"subs %w2, %w2, #24 \n"
|
||||
"orr v2.16b, v3.16b, v3.16b \n" // order v0, v1, v2
|
||||
MEMACCESS(1)
|
||||
"st3 {v0.8b,v1.8b,v2.8b}, [%1], #24 \n"
|
||||
@ -158,7 +181,7 @@ void ScaleRowDown34_0_Box_NEON(const uint8* src_ptr,
|
||||
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0
|
||||
MEMACCESS(3)
|
||||
"ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%3], #32 \n" // src line 1
|
||||
"subs %2, %2, #24 \n"
|
||||
"subs %w2, %w2, #24 \n"
|
||||
|
||||
// filter src line 0 with src line 1
|
||||
// expand chars to shorts to allow for room
|
||||
@ -218,7 +241,7 @@ void ScaleRowDown34_1_Box_NEON(const uint8* src_ptr,
|
||||
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0
|
||||
MEMACCESS(3)
|
||||
"ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%3], #32 \n" // src line 1
|
||||
"subs %2, %2, #24 \n"
|
||||
"subs %w2, %w2, #24 \n"
|
||||
// average src line 0 with src line 1
|
||||
"urhadd v0.8b, v0.8b, v4.8b \n"
|
||||
"urhadd v1.8b, v1.8b, v5.8b \n"
|
||||
@ -271,7 +294,7 @@ void ScaleRowDown38_NEON(const uint8* src_ptr,
|
||||
"1: \n"
|
||||
MEMACCESS(0)
|
||||
"ld1 {v0.16b,v1.16b}, [%0], #32 \n"
|
||||
"subs %2, %2, #12 \n"
|
||||
"subs %w2, %w2, #12 \n"
|
||||
"tbl v2.16b, {v0.16b,v1.16b}, v3.16b \n"
|
||||
MEMACCESS(1)
|
||||
"st1 {v2.8b}, [%1], #8 \n"
|
||||
@ -313,7 +336,7 @@ void OMITFP ScaleRowDown38_3_Box_NEON(const uint8* src_ptr,
|
||||
"ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%2], #32 \n"
|
||||
MEMACCESS(4)
|
||||
"ld4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%3], #32 \n"
|
||||
"subs %4, %4, #12 \n"
|
||||
"subs %w4, %w4, #12 \n"
|
||||
|
||||
// Shuffle the input data around to get align the data
|
||||
// so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7
|
||||
@ -437,7 +460,7 @@ void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr,
|
||||
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"
|
||||
MEMACCESS(3)
|
||||
"ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%2], #32 \n"
|
||||
"subs %3, %3, #12 \n"
|
||||
"subs %w3, %w3, #12 \n"
|
||||
|
||||
// Shuffle the input data around to get align the data
|
||||
// so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7
|
||||
@ -522,20 +545,127 @@ void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr,
|
||||
);
|
||||
}
|
||||
|
||||
void ScaleAddRows_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
|
||||
uint16* dst_ptr, int src_width, int src_height) {
|
||||
const uint8* src_tmp = NULL;
|
||||
asm volatile (
|
||||
"1: \n"
|
||||
"mov %0, %1 \n"
|
||||
"mov w12, %w5 \n"
|
||||
"eor v2.16b, v2.16b, v2.16b \n"
|
||||
"eor v3.16b, v3.16b, v3.16b \n"
|
||||
"2: \n"
|
||||
// load 16 pixels into q0
|
||||
MEMACCESS(0)
|
||||
"ld1 {v0.16b}, [%0], %3 \n"
|
||||
"uaddw2 v3.8h, v3.8h, v0.16b \n"
|
||||
"uaddw v2.8h, v2.8h, v0.8b \n"
|
||||
"subs w12, w12, #1 \n"
|
||||
"b.gt 2b \n"
|
||||
MEMACCESS(2)
|
||||
"st1 {v2.8h, v3.8h}, [%2], #32 \n" // store pixels
|
||||
"add %1, %1, #16 \n"
|
||||
"subs %w4, %w4, #16 \n" // 16 processed per loop
|
||||
"b.gt 1b \n"
|
||||
: "+r"(src_tmp), // %0
|
||||
"+r"(src_ptr), // %1
|
||||
"+r"(dst_ptr), // %2
|
||||
"+r"(src_stride), // %3
|
||||
"+r"(src_width), // %4
|
||||
"+r"(src_height) // %5
|
||||
:
|
||||
: "memory", "cc", "w12", "v0", "v1", "v2", "v3" // Clobber List
|
||||
);
|
||||
}
|
||||
|
||||
// TODO(Yang Zhang): Investigate less load instructions for
|
||||
// the x/dx stepping
|
||||
#define LOAD2_DATA8_LANE(n) \
|
||||
"lsr %5, %3, #16 \n" \
|
||||
"add %6, %1, %5 \n" \
|
||||
"add %3, %3, %4 \n" \
|
||||
MEMACCESS(6) \
|
||||
"ld2 {v4.b, v5.b}["#n"], [%6] \n"
|
||||
|
||||
void ScaleFilterCols_NEON(uint8* dst_ptr, const uint8* src_ptr,
|
||||
int dst_width, int x, int dx) {
|
||||
int dx_offset[4] = {0, 1, 2, 3};
|
||||
int* tmp = dx_offset;
|
||||
const uint8* src_tmp = src_ptr;
|
||||
int64 dst_width64 = (int64) dst_width; // Work around ios 64 bit warning.
|
||||
int64 x64 = (int64) x;
|
||||
int64 dx64 = (int64) dx;
|
||||
asm volatile (
|
||||
"dup v0.4s, %w3 \n" // x
|
||||
"dup v1.4s, %w4 \n" // dx
|
||||
"ld1 {v2.4s}, [%5] \n" // 0 1 2 3
|
||||
"shl v3.4s, v1.4s, #2 \n" // 4 * dx
|
||||
"mul v1.4s, v1.4s, v2.4s \n"
|
||||
// x , x + 1 * dx, x + 2 * dx, x + 3 * dx
|
||||
"add v1.4s, v1.4s, v0.4s \n"
|
||||
// x + 4 * dx, x + 5 * dx, x + 6 * dx, x + 7 * dx
|
||||
"add v2.4s, v1.4s, v3.4s \n"
|
||||
"shl v0.4s, v3.4s, #1 \n" // 8 * dx
|
||||
"1: \n"
|
||||
LOAD2_DATA8_LANE(0)
|
||||
LOAD2_DATA8_LANE(1)
|
||||
LOAD2_DATA8_LANE(2)
|
||||
LOAD2_DATA8_LANE(3)
|
||||
LOAD2_DATA8_LANE(4)
|
||||
LOAD2_DATA8_LANE(5)
|
||||
LOAD2_DATA8_LANE(6)
|
||||
LOAD2_DATA8_LANE(7)
|
||||
"mov v6.16b, v1.16b \n"
|
||||
"mov v7.16b, v2.16b \n"
|
||||
"uzp1 v6.8h, v6.8h, v7.8h \n"
|
||||
"ushll v4.8h, v4.8b, #0 \n"
|
||||
"ushll v5.8h, v5.8b, #0 \n"
|
||||
"ssubl v16.4s, v5.4h, v4.4h \n"
|
||||
"ssubl2 v17.4s, v5.8h, v4.8h \n"
|
||||
"ushll v7.4s, v6.4h, #0 \n"
|
||||
"ushll2 v6.4s, v6.8h, #0 \n"
|
||||
"mul v16.4s, v16.4s, v7.4s \n"
|
||||
"mul v17.4s, v17.4s, v6.4s \n"
|
||||
"shrn v6.4h, v16.4s, #16 \n"
|
||||
"shrn2 v6.8h, v17.4s, #16 \n"
|
||||
"add v4.8h, v4.8h, v6.8h \n"
|
||||
"xtn v4.8b, v4.8h \n"
|
||||
|
||||
MEMACCESS(0)
|
||||
"st1 {v4.8b}, [%0], #8 \n" // store pixels
|
||||
"add v1.4s, v1.4s, v0.4s \n"
|
||||
"add v2.4s, v2.4s, v0.4s \n"
|
||||
"subs %w2, %w2, #8 \n" // 8 processed per loop
|
||||
"b.gt 1b \n"
|
||||
: "+r"(dst_ptr), // %0
|
||||
"+r"(src_ptr), // %1
|
||||
"+r"(dst_width64), // %2
|
||||
"+r"(x64), // %3
|
||||
"+r"(dx64), // %4
|
||||
"+r"(tmp), // %5
|
||||
"+r"(src_tmp) // %6
|
||||
:
|
||||
: "memory", "cc", "v0", "v1", "v2", "v3",
|
||||
"v4", "v5", "v6", "v7", "v16", "v17"
|
||||
);
|
||||
}
|
||||
|
||||
#undef LOAD2_DATA8_LANE
|
||||
|
||||
// 16x2 -> 16x1
|
||||
void ScaleFilterRows_NEON(uint8* dst_ptr,
|
||||
const uint8* src_ptr, ptrdiff_t src_stride,
|
||||
int dst_width, int source_y_fraction) {
|
||||
int y_fraction = 256 - source_y_fraction;
|
||||
asm volatile (
|
||||
"cmp %4, #0 \n"
|
||||
"cmp %w4, #0 \n"
|
||||
"b.eq 100f \n"
|
||||
"add %2, %2, %1 \n"
|
||||
"cmp %4, #64 \n"
|
||||
"cmp %w4, #64 \n"
|
||||
"b.eq 75f \n"
|
||||
"cmp %4, #128 \n"
|
||||
"cmp %w4, #128 \n"
|
||||
"b.eq 50f \n"
|
||||
"cmp %4, #192 \n"
|
||||
"cmp %w4, #192 \n"
|
||||
"b.eq 25f \n"
|
||||
|
||||
"dup v5.8b, %w4 \n"
|
||||
@ -546,7 +676,7 @@ void ScaleFilterRows_NEON(uint8* dst_ptr,
|
||||
"ld1 {v0.16b}, [%1], #16 \n"
|
||||
MEMACCESS(2)
|
||||
"ld1 {v1.16b}, [%2], #16 \n"
|
||||
"subs %3, %3, #16 \n"
|
||||
"subs %w3, %w3, #16 \n"
|
||||
"umull v6.8h, v0.8b, v4.8b \n"
|
||||
"umull2 v7.8h, v0.16b, v4.16b \n"
|
||||
"umlal v6.8h, v1.8b, v5.8b \n"
|
||||
@ -564,7 +694,7 @@ void ScaleFilterRows_NEON(uint8* dst_ptr,
|
||||
"ld1 {v0.16b}, [%1], #16 \n"
|
||||
MEMACCESS(2)
|
||||
"ld1 {v1.16b}, [%2], #16 \n"
|
||||
"subs %3, %3, #16 \n"
|
||||
"subs %w3, %w3, #16 \n"
|
||||
"urhadd v0.16b, v0.16b, v1.16b \n"
|
||||
"urhadd v0.16b, v0.16b, v1.16b \n"
|
||||
MEMACCESS(0)
|
||||
@ -578,7 +708,7 @@ void ScaleFilterRows_NEON(uint8* dst_ptr,
|
||||
"ld1 {v0.16b}, [%1], #16 \n"
|
||||
MEMACCESS(2)
|
||||
"ld1 {v1.16b}, [%2], #16 \n"
|
||||
"subs %3, %3, #16 \n"
|
||||
"subs %w3, %w3, #16 \n"
|
||||
"urhadd v0.16b, v0.16b, v1.16b \n"
|
||||
MEMACCESS(0)
|
||||
"st1 {v0.16b}, [%0], #16 \n"
|
||||
@ -591,7 +721,7 @@ void ScaleFilterRows_NEON(uint8* dst_ptr,
|
||||
"ld1 {v1.16b}, [%1], #16 \n"
|
||||
MEMACCESS(2)
|
||||
"ld1 {v0.16b}, [%2], #16 \n"
|
||||
"subs %3, %3, #16 \n"
|
||||
"subs %w3, %w3, #16 \n"
|
||||
"urhadd v0.16b, v0.16b, v1.16b \n"
|
||||
"urhadd v0.16b, v0.16b, v1.16b \n"
|
||||
MEMACCESS(0)
|
||||
@ -603,7 +733,7 @@ void ScaleFilterRows_NEON(uint8* dst_ptr,
|
||||
"100: \n"
|
||||
MEMACCESS(1)
|
||||
"ld1 {v0.16b}, [%1], #16 \n"
|
||||
"subs %3, %3, #16 \n"
|
||||
"subs %w3, %w3, #16 \n"
|
||||
MEMACCESS(0)
|
||||
"st1 {v0.16b}, [%0], #16 \n"
|
||||
"b.gt 100b \n"
|
||||
@ -631,7 +761,7 @@ void ScaleARGBRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
|
||||
"ld2 {v0.4s, v1.4s}, [%0], #32 \n"
|
||||
MEMACCESS (0)
|
||||
"ld2 {v2.4s, v3.4s}, [%0], #32 \n"
|
||||
"subs %2, %2, #8 \n" // 8 processed per loop
|
||||
"subs %w2, %w2, #8 \n" // 8 processed per loop
|
||||
MEMACCESS (1)
|
||||
"st1 {v1.16b}, [%1], #16 \n" // store odd pixels
|
||||
MEMACCESS (1)
|
||||
@ -645,6 +775,33 @@ void ScaleARGBRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
|
||||
);
|
||||
}
|
||||
|
||||
void ScaleARGBRowDown2Linear_NEON(const uint8* src_argb, ptrdiff_t src_stride,
|
||||
uint8* dst_argb, int dst_width) {
|
||||
asm volatile (
|
||||
"1: \n"
|
||||
MEMACCESS (0)
|
||||
// load 8 ARGB pixels.
|
||||
"ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n"
|
||||
"subs %w2, %w2, #8 \n" // 8 processed per loop.
|
||||
"uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts.
|
||||
"uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
|
||||
"uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts.
|
||||
"uaddlp v3.8h, v3.16b \n" // A 16 bytes -> 8 shorts.
|
||||
"rshrn v0.8b, v0.8h, #1 \n" // downshift, round and pack
|
||||
"rshrn v1.8b, v1.8h, #1 \n"
|
||||
"rshrn v2.8b, v2.8h, #1 \n"
|
||||
"rshrn v3.8b, v3.8h, #1 \n"
|
||||
MEMACCESS (1)
|
||||
"st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n"
|
||||
"b.gt 1b \n"
|
||||
: "+r"(src_argb), // %0
|
||||
"+r"(dst_argb), // %1
|
||||
"+r"(dst_width) // %2
|
||||
:
|
||||
: "memory", "cc", "v0", "v1", "v2", "v3" // Clobber List
|
||||
);
|
||||
}
|
||||
|
||||
void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
|
||||
uint8* dst, int dst_width) {
|
||||
asm volatile (
|
||||
@ -653,7 +810,7 @@ void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
|
||||
"1: \n"
|
||||
MEMACCESS (0)
|
||||
"ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 8 ARGB pixels.
|
||||
"subs %3, %3, #8 \n" // 8 processed per loop.
|
||||
"subs %w3, %w3, #8 \n" // 8 processed per loop.
|
||||
"uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts.
|
||||
"uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
|
||||
"uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts.
|
||||
@ -694,21 +851,21 @@ void ScaleARGBRowDownEven_NEON(const uint8* src_argb, ptrdiff_t src_stride,
|
||||
"ld1 {v0.s}[2], [%0], %3 \n"
|
||||
MEMACCESS(0)
|
||||
"ld1 {v0.s}[3], [%0], %3 \n"
|
||||
"subs %2, %2, #4 \n" // 4 pixels per loop.
|
||||
"subs %w2, %w2, #4 \n" // 4 pixels per loop.
|
||||
MEMACCESS(1)
|
||||
"st1 {v0.16b}, [%1], #16 \n"
|
||||
"b.gt 1b \n"
|
||||
: "+r"(src_argb), // %0
|
||||
"+r"(dst_argb), // %1
|
||||
"+r"(dst_width) // %2
|
||||
: "r"(static_cast<ptrdiff_t>(src_stepx * 4)) // %3
|
||||
: "r"((int64)(src_stepx * 4)) // %3
|
||||
: "memory", "cc", "v0"
|
||||
);
|
||||
}
|
||||
|
||||
// Reads 4 pixels at a time.
|
||||
// Alignment requirement: src_argb 4 byte aligned.
|
||||
// TODO, might be worth another optimization pass in future.
|
||||
// TODO(Yang Zhang): Might be worth another optimization pass in future.
|
||||
// It could be upgraded to 8 pixels at a time to start with.
|
||||
void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb, ptrdiff_t src_stride,
|
||||
int src_stepx,
|
||||
@ -717,36 +874,36 @@ void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb, ptrdiff_t src_stride,
|
||||
"add %1, %1, %0 \n"
|
||||
"1: \n"
|
||||
MEMACCESS(0)
|
||||
"ld1 {v0.8b}, [%0], %4 \n" // Read 4 2x2 blocks -> 2x1
|
||||
"ld1 {v0.8b}, [%0], %4 \n" // Read 4 2x2 blocks -> 2x1
|
||||
MEMACCESS(1)
|
||||
"ld1 {v1.8b}, [%1], %4 \n"
|
||||
"ld1 {v1.8b}, [%1], %4 \n"
|
||||
MEMACCESS(0)
|
||||
"ld1 {v2.8b}, [%0], %4 \n"
|
||||
"ld1 {v2.8b}, [%0], %4 \n"
|
||||
MEMACCESS(1)
|
||||
"ld1 {v3.8b}, [%1], %4 \n"
|
||||
"ld1 {v3.8b}, [%1], %4 \n"
|
||||
MEMACCESS(0)
|
||||
"ld1 {v4.8b}, [%0], %4 \n"
|
||||
"ld1 {v4.8b}, [%0], %4 \n"
|
||||
MEMACCESS(1)
|
||||
"ld1 {v5.8b}, [%1], %4 \n"
|
||||
"ld1 {v5.8b}, [%1], %4 \n"
|
||||
MEMACCESS(0)
|
||||
"ld1 {v6.8b}, [%0], %4 \n"
|
||||
"ld1 {v6.8b}, [%0], %4 \n"
|
||||
MEMACCESS(1)
|
||||
"ld1 {v7.8b}, [%1], %4 \n"
|
||||
"uaddl v0.8h, v0.8b, v1.8b \n"
|
||||
"uaddl v2.8h, v2.8b, v3.8b \n"
|
||||
"uaddl v4.8h, v4.8b, v5.8b \n"
|
||||
"uaddl v6.8h, v6.8b, v7.8b \n"
|
||||
"mov v16.d[1], v0.d[1] \n" // ab_cd -> ac_bd
|
||||
"mov v0.d[1], v2.d[0] \n"
|
||||
"mov v2.d[0], v16.d[1] \n"
|
||||
"mov v16.d[1], v4.d[1] \n" // ef_gh -> eg_fh
|
||||
"mov v4.d[1], v6.d[0] \n"
|
||||
"mov v6.d[0], v16.d[1] \n"
|
||||
"add v0.8h, v0.8h, v2.8h \n" // (a+b)_(c+d)
|
||||
"add v4.8h, v4.8h, v6.8h \n" // (e+f)_(g+h)
|
||||
"rshrn v0.8b, v0.8h, #2 \n" // first 2 pixels.
|
||||
"rshrn2 v0.16b, v4.8h, #2 \n" // next 2 pixels.
|
||||
"subs %3, %3, #4 \n" // 4 pixels per loop.
|
||||
"ld1 {v7.8b}, [%1], %4 \n"
|
||||
"uaddl v0.8h, v0.8b, v1.8b \n"
|
||||
"uaddl v2.8h, v2.8b, v3.8b \n"
|
||||
"uaddl v4.8h, v4.8b, v5.8b \n"
|
||||
"uaddl v6.8h, v6.8b, v7.8b \n"
|
||||
"mov v16.d[1], v0.d[1] \n" // ab_cd -> ac_bd
|
||||
"mov v0.d[1], v2.d[0] \n"
|
||||
"mov v2.d[0], v16.d[1] \n"
|
||||
"mov v16.d[1], v4.d[1] \n" // ef_gh -> eg_fh
|
||||
"mov v4.d[1], v6.d[0] \n"
|
||||
"mov v6.d[0], v16.d[1] \n"
|
||||
"add v0.8h, v0.8h, v2.8h \n" // (a+b)_(c+d)
|
||||
"add v4.8h, v4.8h, v6.8h \n" // (e+f)_(g+h)
|
||||
"rshrn v0.8b, v0.8h, #2 \n" // first 2 pixels.
|
||||
"rshrn2 v0.16b, v4.8h, #2 \n" // next 2 pixels.
|
||||
"subs %w3, %w3, #4 \n" // 4 pixels per loop.
|
||||
MEMACCESS(2)
|
||||
"st1 {v0.16b}, [%2], #16 \n"
|
||||
"b.gt 1b \n"
|
||||
@ -754,10 +911,129 @@ void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb, ptrdiff_t src_stride,
|
||||
"+r"(src_stride), // %1
|
||||
"+r"(dst_argb), // %2
|
||||
"+r"(dst_width) // %3
|
||||
: "r"(src_stepx * 4) // %4
|
||||
: "r"((int64)(src_stepx * 4)) // %4
|
||||
: "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"
|
||||
);
|
||||
}
|
||||
|
||||
// TODO(Yang Zhang): Investigate less load instructions for
|
||||
// the x/dx stepping
|
||||
#define LOAD1_DATA32_LANE(vn, n) \
|
||||
"lsr %5, %3, #16 \n" \
|
||||
"add %6, %1, %5, lsl #2 \n" \
|
||||
"add %3, %3, %4 \n" \
|
||||
MEMACCESS(6) \
|
||||
"ld1 {"#vn".s}["#n"], [%6] \n"
|
||||
|
||||
void ScaleARGBCols_NEON(uint8* dst_argb, const uint8* src_argb,
|
||||
int dst_width, int x, int dx) {
|
||||
const uint8* src_tmp = src_argb;
|
||||
int64 dst_width64 = (int64) dst_width; // Work around ios 64 bit warning.
|
||||
int64 x64 = (int64) x;
|
||||
int64 dx64 = (int64) dx;
|
||||
int64 tmp64 = 0;
|
||||
asm volatile (
|
||||
"1: \n"
|
||||
LOAD1_DATA32_LANE(v0, 0)
|
||||
LOAD1_DATA32_LANE(v0, 1)
|
||||
LOAD1_DATA32_LANE(v0, 2)
|
||||
LOAD1_DATA32_LANE(v0, 3)
|
||||
LOAD1_DATA32_LANE(v1, 0)
|
||||
LOAD1_DATA32_LANE(v1, 1)
|
||||
LOAD1_DATA32_LANE(v1, 2)
|
||||
LOAD1_DATA32_LANE(v1, 3)
|
||||
|
||||
MEMACCESS(0)
|
||||
"st1 {v0.4s, v1.4s}, [%0], #32 \n" // store pixels
|
||||
"subs %w2, %w2, #8 \n" // 8 processed per loop
|
||||
"b.gt 1b \n"
|
||||
: "+r"(dst_argb), // %0
|
||||
"+r"(src_argb), // %1
|
||||
"+r"(dst_width64), // %2
|
||||
"+r"(x64), // %3
|
||||
"+r"(dx64), // %4
|
||||
"+r"(tmp64), // %5
|
||||
"+r"(src_tmp) // %6
|
||||
:
|
||||
: "memory", "cc", "v0", "v1"
|
||||
);
|
||||
}
|
||||
|
||||
#undef LOAD1_DATA32_LANE
|
||||
|
||||
// TODO(Yang Zhang): Investigate less load instructions for
|
||||
// the x/dx stepping
|
||||
#define LOAD2_DATA32_LANE(vn1, vn2, n) \
|
||||
"lsr %5, %3, #16 \n" \
|
||||
"add %6, %1, %5, lsl #2 \n" \
|
||||
"add %3, %3, %4 \n" \
|
||||
MEMACCESS(6) \
|
||||
"ld2 {"#vn1".s, "#vn2".s}["#n"], [%6] \n"
|
||||
|
||||
void ScaleARGBFilterCols_NEON(uint8* dst_argb, const uint8* src_argb,
|
||||
int dst_width, int x, int dx) {
|
||||
int dx_offset[4] = {0, 1, 2, 3};
|
||||
int* tmp = dx_offset;
|
||||
const uint8* src_tmp = src_argb;
|
||||
int64 dst_width64 = (int64) dst_width; // Work around ios 64 bit warning.
|
||||
int64 x64 = (int64) x;
|
||||
int64 dx64 = (int64) dx;
|
||||
asm volatile (
|
||||
"dup v0.4s, %w3 \n" // x
|
||||
"dup v1.4s, %w4 \n" // dx
|
||||
"ld1 {v2.4s}, [%5] \n" // 0 1 2 3
|
||||
"shl v6.4s, v1.4s, #2 \n" // 4 * dx
|
||||
"mul v1.4s, v1.4s, v2.4s \n"
|
||||
"movi v3.16b, #0x7f \n" // 0x7F
|
||||
"movi v4.8h, #0x7f \n" // 0x7F
|
||||
// x , x + 1 * dx, x + 2 * dx, x + 3 * dx
|
||||
"add v5.4s, v1.4s, v0.4s \n"
|
||||
"1: \n"
|
||||
// d0, d1: a
|
||||
// d2, d3: b
|
||||
LOAD2_DATA32_LANE(v0, v1, 0)
|
||||
LOAD2_DATA32_LANE(v0, v1, 1)
|
||||
LOAD2_DATA32_LANE(v0, v1, 2)
|
||||
LOAD2_DATA32_LANE(v0, v1, 3)
|
||||
"shrn v2.4h, v5.4s, #9 \n"
|
||||
"and v2.8b, v2.8b, v4.8b \n"
|
||||
"dup v16.8b, v2.b[0] \n"
|
||||
"dup v17.8b, v2.b[2] \n"
|
||||
"dup v18.8b, v2.b[4] \n"
|
||||
"dup v19.8b, v2.b[6] \n"
|
||||
"ext v2.8b, v16.8b, v17.8b, #4 \n"
|
||||
"ext v17.8b, v18.8b, v19.8b, #4 \n"
|
||||
"ins v2.d[1], v17.d[0] \n" // f
|
||||
"eor v7.16b, v2.16b, v3.16b \n" // 0x7f ^ f
|
||||
"umull v16.8h, v0.8b, v7.8b \n"
|
||||
"umull2 v17.8h, v0.16b, v7.16b \n"
|
||||
"umull v18.8h, v1.8b, v2.8b \n"
|
||||
"umull2 v19.8h, v1.16b, v2.16b \n"
|
||||
"add v16.8h, v16.8h, v18.8h \n"
|
||||
"add v17.8h, v17.8h, v19.8h \n"
|
||||
"shrn v0.8b, v16.8h, #7 \n"
|
||||
"shrn2 v0.16b, v17.8h, #7 \n"
|
||||
|
||||
MEMACCESS(0)
|
||||
"st1 {v0.4s}, [%0], #16 \n" // store pixels
|
||||
"add v5.4s, v5.4s, v6.4s \n"
|
||||
"subs %w2, %w2, #4 \n" // 4 processed per loop
|
||||
"b.gt 1b \n"
|
||||
: "+r"(dst_argb), // %0
|
||||
"+r"(src_argb), // %1
|
||||
"+r"(dst_width64), // %2
|
||||
"+r"(x64), // %3
|
||||
"+r"(dx64), // %4
|
||||
"+r"(tmp), // %5
|
||||
"+r"(src_tmp) // %6
|
||||
:
|
||||
: "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5",
|
||||
"v6", "v7", "v16", "v17", "v18", "v19"
|
||||
);
|
||||
}
|
||||
|
||||
#undef LOAD2_DATA32_LANE
|
||||
|
||||
#endif // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
|
||||
|
||||
#ifdef __cplusplus
|
||||
|
378
third_party/libyuv/source/scale_win.cc
vendored
@ -9,6 +9,7 @@
*/

#include "libyuv/row.h"
#include "libyuv/scale_row.h"

#ifdef __cplusplus
namespace libyuv {
@ -16,7 +17,8 @@ extern "C" {
#endif

// This module is for Visual C x86.
#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER)
#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && \
defined(_MSC_VER) && !defined(__clang__)

// Offsets for source bytes 0 to 9
static uvec8 kShuf0 =
@ -93,8 +95,7 @@ static uvec16 kScaleAb2 =
{ 65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3, 65536 / 3, 65536 / 2, 0, 0 };

// Reads 32 pixels, throws half away and writes 16 pixels.
// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned.
__declspec(naked) __declspec(align(16))
__declspec(naked)
void ScaleRowDown2_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
__asm {
@ -120,8 +121,7 @@ void ScaleRowDown2_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
}

// Blends 32x1 rectangle to 16x1.
// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned.
__declspec(naked) __declspec(align(16))
__declspec(naked)
void ScaleRowDown2Linear_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
__asm {
@ -157,8 +157,7 @@ void ScaleRowDown2Linear_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
}

// Blends 32x2 rectangle to 16x1.
// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned.
__declspec(naked) __declspec(align(16))
__declspec(naked)
void ScaleRowDown2Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
__asm {
@ -199,9 +198,116 @@ void ScaleRowDown2Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
}
}

#ifdef HAS_SCALEROWDOWN2_AVX2
// Reads 64 pixels, throws half away and writes 32 pixels.
__declspec(naked)
void ScaleRowDown2_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
__asm {
mov eax, [esp + 4] // src_ptr
// src_stride ignored
mov edx, [esp + 12] // dst_ptr
mov ecx, [esp + 16] // dst_width

wloop:
vmovdqu ymm0, [eax]
vmovdqu ymm1, [eax + 32]
lea eax, [eax + 64]
vpsrlw ymm0, ymm0, 8 // isolate odd pixels.
vpsrlw ymm1, ymm1, 8
vpackuswb ymm0, ymm0, ymm1
vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb
vmovdqu [edx], ymm0
lea edx, [edx + 32]
sub ecx, 32
jg wloop

vzeroupper
ret
}
}

// Blends 64x1 rectangle to 32x1.
__declspec(naked)
void ScaleRowDown2Linear_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
__asm {
mov eax, [esp + 4] // src_ptr
// src_stride
mov edx, [esp + 12] // dst_ptr
mov ecx, [esp + 16] // dst_width

vpcmpeqb ymm4, ymm4, ymm4 // '1' constant, 8b
vpsrlw ymm4, ymm4, 15
vpackuswb ymm4, ymm4, ymm4
vpxor ymm5, ymm5, ymm5 // constant 0

wloop:
vmovdqu ymm0, [eax]
vmovdqu ymm1, [eax + 32]
lea eax, [eax + 64]

vpmaddubsw ymm0, ymm0, ymm4 // average horizontally
vpmaddubsw ymm1, ymm1, ymm4
vpavgw ymm0, ymm0, ymm5 // (x + 1) / 2
vpavgw ymm1, ymm1, ymm5
vpackuswb ymm0, ymm0, ymm1
vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb

vmovdqu [edx], ymm0
lea edx, [edx + 32]
sub ecx, 32
jg wloop

vzeroupper
ret
}
}

// Blends 64x2 rectangle to 32x1.
__declspec(naked)
void ScaleRowDown2Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
__asm {
push esi
mov eax, [esp + 4 + 4] // src_ptr
mov esi, [esp + 4 + 8] // src_stride
mov edx, [esp + 4 + 12] // dst_ptr
mov ecx, [esp + 4 + 16] // dst_width

vpcmpeqb ymm4, ymm4, ymm4 // '1' constant, 8b
vpsrlw ymm4, ymm4, 15
vpackuswb ymm4, ymm4, ymm4
vpxor ymm5, ymm5, ymm5 // constant 0

wloop:
vmovdqu ymm0, [eax] // average rows
vmovdqu ymm1, [eax + 32]
vpavgb ymm0, ymm0, [eax + esi]
vpavgb ymm1, ymm1, [eax + esi + 32]
lea eax, [eax + 64]

vpmaddubsw ymm0, ymm0, ymm4 // average horizontally
vpmaddubsw ymm1, ymm1, ymm4
vpavgw ymm0, ymm0, ymm5 // (x + 1) / 2
vpavgw ymm1, ymm1, ymm5
vpackuswb ymm0, ymm0, ymm1
vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb

vmovdqu [edx], ymm0
lea edx, [edx + 32]
sub ecx, 32
jg wloop

pop esi
vzeroupper
ret
}
}
#endif // HAS_SCALEROWDOWN2_AVX2
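ScaleRowDown2Box_AVX2 above halves both dimensions with a small box filter: vpavgb folds the two source rows together, then vpmaddubsw with a 0x01 multiplier sums each horizontal pair and vpavgw against zero supplies the rounded "(x + 1) / 2". A scalar sketch of that two-stage rounding, one output byte at a time (the function name and stdint.h types are illustrative assumptions, not libyuv's):

#include <stddef.h>
#include <stdint.h>

// Sketch only: 2x2 box filter, rounding in the same two stages as the
// vector code (rows first, then columns).
static void ScaleRowDown2BoxSketch(const uint8_t* src_ptr, ptrdiff_t src_stride,
                                   uint8_t* dst_ptr, int dst_width) {
  const uint8_t* s = src_ptr;
  const uint8_t* t = src_ptr + src_stride;
  for (int x = 0; x < dst_width; ++x) {
    uint8_t r0 = (uint8_t)((s[0] + t[0] + 1) >> 1);  // vpavgb: average the rows
    uint8_t r1 = (uint8_t)((s[1] + t[1] + 1) >> 1);
    dst_ptr[x] = (uint8_t)((r0 + r1 + 1) >> 1);      // vpmaddubsw + vpavgw: average the columns
    s += 2;
    t += 2;
  }
}

Because the rounding happens per stage, results can differ by one from a single "(sum + 2) >> 2" average; the sketch mirrors what the instructions actually compute.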

// Point samples 32 pixels to 8 pixels.
// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
__declspec(naked) __declspec(align(16))
__declspec(naked)
void ScaleRowDown4_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
__asm {
@ -232,8 +338,7 @@ void ScaleRowDown4_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
}

// Blends 32x4 rectangle to 8x1.
// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
__declspec(naked) __declspec(align(16))
__declspec(naked)
void ScaleRowDown4Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
__asm {
@ -248,11 +353,11 @@ void ScaleRowDown4Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
psrlw xmm7, 8

wloop:
movdqu xmm0, [eax]
movdqu xmm0, [eax] // average rows
movdqu xmm1, [eax + 16]
movdqu xmm2, [eax + esi]
movdqu xmm3, [eax + esi + 16]
pavgb xmm0, xmm2 // average rows
pavgb xmm0, xmm2
pavgb xmm1, xmm3
movdqu xmm2, [eax + esi * 2]
movdqu xmm3, [eax + esi * 2 + 16]
@ -291,13 +396,102 @@ void ScaleRowDown4Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
}
}

#ifdef HAS_SCALEROWDOWN4_AVX2
// Point samples 64 pixels to 16 pixels.
__declspec(naked)
void ScaleRowDown4_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
__asm {
mov eax, [esp + 4] // src_ptr
// src_stride ignored
mov edx, [esp + 12] // dst_ptr
mov ecx, [esp + 16] // dst_width
vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff0000
vpsrld ymm5, ymm5, 24
vpslld ymm5, ymm5, 16

wloop:
vmovdqu ymm0, [eax]
vmovdqu ymm1, [eax + 32]
lea eax, [eax + 64]
vpand ymm0, ymm0, ymm5
vpand ymm1, ymm1, ymm5
vpackuswb ymm0, ymm0, ymm1
vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb
vpsrlw ymm0, ymm0, 8
vpackuswb ymm0, ymm0, ymm0
vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb
vmovdqu [edx], xmm0
lea edx, [edx + 16]
sub ecx, 16
jg wloop

vzeroupper
ret
}
}

// Blends 64x4 rectangle to 16x1.
__declspec(naked)
void ScaleRowDown4Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
__asm {
push esi
push edi
mov eax, [esp + 8 + 4] // src_ptr
mov esi, [esp + 8 + 8] // src_stride
mov edx, [esp + 8 + 12] // dst_ptr
mov ecx, [esp + 8 + 16] // dst_width
lea edi, [esi + esi * 2] // src_stride * 3
vpcmpeqb ymm7, ymm7, ymm7 // generate mask 0x00ff00ff
vpsrlw ymm7, ymm7, 8

wloop:
vmovdqu ymm0, [eax] // average rows
vmovdqu ymm1, [eax + 32]
vpavgb ymm0, ymm0, [eax + esi]
vpavgb ymm1, ymm1, [eax + esi + 32]
vmovdqu ymm2, [eax + esi * 2]
vmovdqu ymm3, [eax + esi * 2 + 32]
vpavgb ymm2, ymm2, [eax + edi]
vpavgb ymm3, ymm3, [eax + edi + 32]
lea eax, [eax + 64]
vpavgb ymm0, ymm0, ymm2
vpavgb ymm1, ymm1, ymm3

vpand ymm2, ymm0, ymm7 // average columns (64 to 32 pixels)
vpand ymm3, ymm1, ymm7
vpsrlw ymm0, ymm0, 8
vpsrlw ymm1, ymm1, 8
vpavgw ymm0, ymm0, ymm2
vpavgw ymm1, ymm1, ymm3
vpackuswb ymm0, ymm0, ymm1
vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb

vpand ymm2, ymm0, ymm7 // average columns (32 to 16 pixels)
vpsrlw ymm0, ymm0, 8
vpavgw ymm0, ymm0, ymm2
vpackuswb ymm0, ymm0, ymm0
vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb

vmovdqu [edx], xmm0
lea edx, [edx + 16]
sub ecx, 16
jg wloop

pop edi
pop esi
vzeroupper
ret
}
}
#endif // HAS_SCALEROWDOWN4_AVX2
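ScaleRowDown4Box_AVX2 above reduces 4x4 blocks to a single pixel: vpavgb folds the four source rows pairwise, then the columns are averaged twice with the mask/shift/vpavgw idiom (vpand 0x00ff keeps the even bytes, vpsrlw 8 exposes the odd bytes, vpavgw averages them with rounding). A scalar sketch of that 4-to-1 column reduction, for one output byte (the helper name and stdint.h types are illustrative assumptions):

#include <stdint.h>

// Sketch only: reduce four already row-averaged bytes to one, rounding at
// each stage the way the vpand / vpsrlw / vpavgw sequence does.
static uint8_t AverageFourColumnsSketch(const uint8_t p[4]) {
  uint8_t ab = (uint8_t)((p[0] + p[1] + 1) >> 1);  // 64 to 32 pixels
  uint8_t cd = (uint8_t)((p[2] + p[3] + 1) >> 1);
  return (uint8_t)((ab + cd + 1) >> 1);            // 32 to 16 pixels
}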

// Point samples 32 pixels to 24 pixels.
// Produces three 8 byte values. For each 8 bytes, 16 bytes are read.
// Then shuffled to do the scaling.

// Note that movdqa+palign may be better than movdqu.
// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
__declspec(naked) __declspec(align(16))
__declspec(naked)
void ScaleRowDown34_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
__asm {
@ -344,8 +538,7 @@ void ScaleRowDown34_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
// xmm7 kRound34

// Note that movdqa+palign may be better than movdqu.
// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
__declspec(naked) __declspec(align(16))
__declspec(naked)
void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
@ -402,8 +595,7 @@ void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr,
}

// Note that movdqa+palign may be better than movdqu.
// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
__declspec(naked) __declspec(align(16))
__declspec(naked)
void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
@ -465,7 +657,7 @@ void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr,
// 3/8 point sampler

// Scale 32 pixels to 12
__declspec(naked) __declspec(align(16))
__declspec(naked)
void ScaleRowDown38_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
__asm {
@ -496,7 +688,7 @@ void ScaleRowDown38_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
}

// Scale 16x3 pixels to 6x1 with interpolation
__declspec(naked) __declspec(align(16))
__declspec(naked)
void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
@ -561,7 +753,7 @@ void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr,
}

// Scale 16x2 pixels to 6x1 with interpolation
__declspec(naked) __declspec(align(16))
__declspec(naked)
void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
@ -605,76 +797,68 @@ void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr,
}
}

// Reads 16xN bytes and produces 16 shorts at a time.
// TODO(fbarchard): Make this handle 4xN bytes for any width ARGB.
__declspec(naked) __declspec(align(16))
void ScaleAddRows_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
uint16* dst_ptr, int src_width,
int src_height) {
// Reads 16 bytes and accumulates to 16 shorts at a time.
__declspec(naked)
void ScaleAddRow_SSE2(const uint8* src_ptr, uint16* dst_ptr, int src_width) {
__asm {
push esi
push edi
push ebx
push ebp
mov esi, [esp + 16 + 4] // src_ptr
mov edx, [esp + 16 + 8] // src_stride
mov edi, [esp + 16 + 12] // dst_ptr
mov ecx, [esp + 16 + 16] // dst_width
mov ebx, [esp + 16 + 20] // height
pxor xmm4, xmm4
dec ebx
mov eax, [esp + 4] // src_ptr
mov edx, [esp + 8] // dst_ptr
mov ecx, [esp + 12] // src_width
pxor xmm5, xmm5

// sum rows
xloop:
// first row
movdqu xmm0, [esi]
lea eax, [esi + edx]
movdqa xmm1, xmm0
punpcklbw xmm0, xmm4
punpckhbw xmm1, xmm4
lea esi, [esi + 16]
mov ebp, ebx
test ebp, ebp
je ydone

// sum remaining rows
yloop:
movdqu xmm2, [eax] // read 16 pixels
lea eax, [eax + edx] // advance to next row
movdqa xmm3, xmm2
punpcklbw xmm2, xmm4
punpckhbw xmm3, xmm4
movdqu xmm3, [eax] // read 16 bytes
lea eax, [eax + 16]
movdqu xmm0, [edx] // read 16 words from destination
movdqu xmm1, [edx + 16]
movdqa xmm2, xmm3
punpcklbw xmm2, xmm5
punpckhbw xmm3, xmm5
paddusw xmm0, xmm2 // sum 16 words
paddusw xmm1, xmm3
sub ebp, 1
jg yloop

ydone:
movdqu [edi], xmm0
movdqu [edi + 16], xmm1
lea edi, [edi + 32]

movdqu [edx], xmm0 // write 16 words to destination
movdqu [edx + 16], xmm1
lea edx, [edx + 32]
sub ecx, 16
jg xloop

pop ebp
pop ebx
pop edi
pop esi
ret
}
}

// Bilinear column filtering. SSSE3 version.
// TODO(fbarchard): Port to Neon
// TODO(fbarchard): Switch the following:
// xor ebx, ebx
// mov bx, word ptr [esi + eax] // 2 source x0 pixels
// To
// movzx ebx, word ptr [esi + eax] // 2 source x0 pixels
// when drmemory bug fixed.
// https://code.google.com/p/drmemory/issues/detail?id=1396
#ifdef HAS_SCALEADDROW_AVX2
// Reads 32 bytes and accumulates to 32 shorts at a time.
__declspec(naked)
void ScaleAddRow_AVX2(const uint8* src_ptr, uint16* dst_ptr, int src_width) {
__asm {
mov eax, [esp + 4] // src_ptr
mov edx, [esp + 8] // dst_ptr
mov ecx, [esp + 12] // src_width
vpxor ymm5, ymm5, ymm5

__declspec(naked) __declspec(align(16))
// sum rows
xloop:
vmovdqu ymm3, [eax] // read 32 bytes
lea eax, [eax + 32]
vpermq ymm3, ymm3, 0xd8 // unmutate for vpunpck
vpunpcklbw ymm2, ymm3, ymm5
vpunpckhbw ymm3, ymm3, ymm5
vpaddusw ymm0, ymm2, [edx] // sum 16 words
vpaddusw ymm1, ymm3, [edx + 32]
vmovdqu [edx], ymm0 // write 32 words to destination
vmovdqu [edx + 32], ymm1
lea edx, [edx + 64]
sub ecx, 32
jg xloop

vzeroupper
ret
}
}
#endif // HAS_SCALEADDROW_AVX2
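The hunk above replaces ScaleAddRows_SSE2, which looped over src_height internally, with ScaleAddRow_SSE2 and ScaleAddRow_AVX2, which widen a single row of bytes and add it into a uint16 accumulator with unsigned saturation; presumably the caller now invokes the routine once per source row. A scalar sketch of that per-row contract (the function name and stdint.h types are illustrative assumptions):

#include <stdint.h>

// Sketch only: accumulate one row of bytes into a uint16 row, saturating
// the way paddusw / vpaddusw do.
static void ScaleAddRowSketch(const uint8_t* src_ptr, uint16_t* dst_ptr,
                              int src_width) {
  for (int x = 0; x < src_width; ++x) {
    uint32_t sum = (uint32_t)dst_ptr[x] + src_ptr[x];
    dst_ptr[x] = (uint16_t)(sum > 0xffff ? 0xffff : sum);
  }
}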

// Bilinear column filtering. SSSE3 version.
__declspec(naked)
void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
int dst_width, int x, int dx) {
__asm {
@ -751,8 +935,7 @@ void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
}

// Reads 16 pixels, duplicates them and writes 32 pixels.
// Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned.
__declspec(naked) __declspec(align(16))
__declspec(naked)
void ScaleColsUp2_SSE2(uint8* dst_ptr, const uint8* src_ptr,
int dst_width, int x, int dx) {
__asm {
@ -777,8 +960,7 @@ void ScaleColsUp2_SSE2(uint8* dst_ptr, const uint8* src_ptr,
}

// Reads 8 pixels, throws half away and writes 4 even pixels (0, 2, 4, 6)
// Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned.
__declspec(naked) __declspec(align(16))
__declspec(naked)
void ScaleARGBRowDown2_SSE2(const uint8* src_argb,
ptrdiff_t src_stride,
uint8* dst_argb, int dst_width) {
@ -803,8 +985,7 @@ void ScaleARGBRowDown2_SSE2(const uint8* src_argb,
}

// Blends 8x1 rectangle to 4x1.
// Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned.
__declspec(naked) __declspec(align(16))
__declspec(naked)
void ScaleARGBRowDown2Linear_SSE2(const uint8* src_argb,
ptrdiff_t src_stride,
uint8* dst_argb, int dst_width) {
@ -832,8 +1013,7 @@ void ScaleARGBRowDown2Linear_SSE2(const uint8* src_argb,
}

// Blends 8x2 rectangle to 4x1.
// Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned.
__declspec(naked) __declspec(align(16))
__declspec(naked)
void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb,
ptrdiff_t src_stride,
uint8* dst_argb, int dst_width) {
@ -867,8 +1047,7 @@ void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb,
}

// Reads 4 pixels at a time.
// Alignment requirement: dst_argb 16 byte aligned.
__declspec(naked) __declspec(align(16))
__declspec(naked)
void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
int src_stepx,
uint8* dst_argb, int dst_width) {
@ -904,8 +1083,7 @@ void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
}

// Blends four 2x2 to 4x1.
// Alignment requirement: dst_argb 16 byte aligned.
__declspec(naked) __declspec(align(16))
__declspec(naked)
void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb,
ptrdiff_t src_stride,
int src_stepx,
@ -953,7 +1131,7 @@ void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb,
}

// Column scaling unfiltered. SSE2 version.
__declspec(naked) __declspec(align(16))
__declspec(naked)
void ScaleARGBCols_SSE2(uint8* dst_argb, const uint8* src_argb,
int dst_width, int x, int dx) {
__asm {
@ -1044,7 +1222,7 @@ static uvec8 kShuffleFractions = {
0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, 4u,
};

__declspec(naked) __declspec(align(16))
__declspec(naked)
void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb,
int dst_width, int x, int dx) {
__asm {
@ -1115,8 +1293,7 @@ void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb,
}

// Reads 4 pixels, duplicates them and writes 8 pixels.
// Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned.
__declspec(naked) __declspec(align(16))
__declspec(naked)
void ScaleARGBColsUp2_SSE2(uint8* dst_argb, const uint8* src_argb,
int dst_width, int x, int dx) {
__asm {
@ -1141,7 +1318,7 @@ void ScaleARGBColsUp2_SSE2(uint8* dst_argb, const uint8* src_argb,
}

// Divide num by div and return as 16.16 fixed point result.
__declspec(naked) __declspec(align(16))
__declspec(naked)
int FixedDiv_X86(int num, int div) {
__asm {
mov eax, [esp + 4] // num
@ -1154,7 +1331,7 @@ int FixedDiv_X86(int num, int div) {
}

// Divide num by div and return as 16.16 fixed point result.
__declspec(naked) __declspec(align(16))
__declspec(naked)
int FixedDiv1_X86(int num, int div) {
__asm {
mov eax, [esp + 4] // num
@ -1169,8 +1346,7 @@ int FixedDiv1_X86(int num, int div) {
ret
}
}
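Per the comment in the source, FixedDiv_X86 and FixedDiv1_X86 above return num / div as a 16.16 fixed-point result. A portable sketch of the plain FixedDiv case follows (the function name is an illustrative assumption; div is assumed non-zero and the quotient assumed to fit in 32 bits):

#include <stdint.h>

// Sketch only: 16.16 fixed-point quotient of num / div.
static int FixedDivSketch(int num, int div) {
  return (int)((((int64_t)num) << 16) / div);
}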

#endif // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER)
#endif // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86)

#ifdef __cplusplus
} // extern "C"