Merge pull request #5810 from mshabunin:hal_interface

This commit is contained in:
Vadim Pisarevsky
2015-12-17 16:48:01 +00:00
65 changed files with 3782 additions and 3437 deletions

View File

@@ -1,6 +1,5 @@
set(the_description "The Core Functionality")
ocv_add_module(core
opencv_hal
PRIVATE_REQUIRED ${ZLIB_LIBRARIES} "${OPENCL_LIBRARIES}" "${VA_LIBRARIES}"
OPTIONAL opencv_cudev
WRAP java python)

View File

@@ -72,6 +72,7 @@
@defgroup core_cluster Clustering
@defgroup core_utils Utility and system functions and macros
@{
@defgroup core_utils_sse SSE utilities
@defgroup core_utils_neon NEON utilities
@}
@defgroup core_opengl OpenGL interoperability
@@ -80,6 +81,16 @@
@defgroup core_directx DirectX interoperability
@defgroup core_eigen Eigen support
@defgroup core_opencl OpenCL support
@defgroup core_va_intel Intel VA-API/OpenCL (CL-VA) interoperability
@defgroup core_hal Hardware Acceleration Layer
@{
@defgroup core_hal_functions Functions
@defgroup core_hal_interface Interface
@defgroup core_hal_intrin Universal intrinsics
@{
@defgroup core_hal_intrin_impl Private implementation helpers
@}
@}
@}
*/

View File

@@ -50,10 +50,10 @@
#endif
#include <climits>
#include <algorithm>
#include "opencv2/core/cvdef.h"
#include "opencv2/core/cvstd.hpp"
#include "opencv2/hal.hpp"
namespace cv
{
@@ -679,8 +679,11 @@ CV_EXPORTS void setUseIPP(bool flag);
//! @} core_utils
} // cv
#include "opencv2/hal/neon_utils.hpp"
#include "opencv2/core/neon_utils.hpp"
#endif //__OPENCV_CORE_BASE_HPP__

View File

@@ -45,6 +45,9 @@
#ifndef __OPENCV_CORE_CVDEF_H__
#define __OPENCV_CORE_CVDEF_H__
//! @addtogroup core_utils
//! @{
#if !defined _CRT_SECURE_NO_DEPRECATE && defined _MSC_VER && _MSC_VER > 1300
# define _CRT_SECURE_NO_DEPRECATE /* to avoid multiple Visual Studio warnings */
#endif
@@ -56,7 +59,265 @@
#undef abs
#undef Complex
#include "opencv2/hal/defs.h"
#if !defined _CRT_SECURE_NO_DEPRECATE && defined _MSC_VER && _MSC_VER > 1300
# define _CRT_SECURE_NO_DEPRECATE /* to avoid multiple Visual Studio warnings */
#endif
#include <limits.h>
#include "opencv2/core/hal/interface.h"
#if defined __ICL
# define CV_ICC __ICL
#elif defined __ICC
# define CV_ICC __ICC
#elif defined __ECL
# define CV_ICC __ECL
#elif defined __ECC
# define CV_ICC __ECC
#elif defined __INTEL_COMPILER
# define CV_ICC __INTEL_COMPILER
#endif
#ifndef CV_INLINE
# if defined __cplusplus
# define CV_INLINE static inline
# elif defined _MSC_VER
# define CV_INLINE __inline
# else
# define CV_INLINE static
# endif
#endif
#if defined CV_ICC && !defined CV_ENABLE_UNROLLED
# define CV_ENABLE_UNROLLED 0
#else
# define CV_ENABLE_UNROLLED 1
#endif
#ifdef __GNUC__
# define CV_DECL_ALIGNED(x) __attribute__ ((aligned (x)))
#elif defined _MSC_VER
# define CV_DECL_ALIGNED(x) __declspec(align(x))
#else
# define CV_DECL_ALIGNED(x)
#endif
/* CPU features and intrinsics support */
#define CV_CPU_NONE 0
#define CV_CPU_MMX 1
#define CV_CPU_SSE 2
#define CV_CPU_SSE2 3
#define CV_CPU_SSE3 4
#define CV_CPU_SSSE3 5
#define CV_CPU_SSE4_1 6
#define CV_CPU_SSE4_2 7
#define CV_CPU_POPCNT 8
#define CV_CPU_AVX 10
#define CV_CPU_AVX2 11
#define CV_CPU_FMA3 12
#define CV_CPU_AVX_512F 13
#define CV_CPU_AVX_512BW 14
#define CV_CPU_AVX_512CD 15
#define CV_CPU_AVX_512DQ 16
#define CV_CPU_AVX_512ER 17
#define CV_CPU_AVX_512IFMA512 18
#define CV_CPU_AVX_512PF 19
#define CV_CPU_AVX_512VBMI 20
#define CV_CPU_AVX_512VL 21
#define CV_CPU_NEON 100
// when adding to this list remember to update the following enum
#define CV_HARDWARE_MAX_FEATURE 255
/** @brief Available CPU features.
*/
enum CpuFeatures {
CPU_MMX = 1,
CPU_SSE = 2,
CPU_SSE2 = 3,
CPU_SSE3 = 4,
CPU_SSSE3 = 5,
CPU_SSE4_1 = 6,
CPU_SSE4_2 = 7,
CPU_POPCNT = 8,
CPU_AVX = 10,
CPU_AVX2 = 11,
CPU_FMA3 = 12,
CPU_AVX_512F = 13,
CPU_AVX_512BW = 14,
CPU_AVX_512CD = 15,
CPU_AVX_512DQ = 16,
CPU_AVX_512ER = 17,
CPU_AVX_512IFMA512 = 18,
CPU_AVX_512PF = 19,
CPU_AVX_512VBMI = 20,
CPU_AVX_512VL = 21,
CPU_NEON = 100
};
// do not include SSE/AVX/NEON headers for NVCC compiler
#ifndef __CUDACC__
#if defined __SSE2__ || defined _M_X64 || (defined _M_IX86_FP && _M_IX86_FP >= 2)
# include <emmintrin.h>
# define CV_MMX 1
# define CV_SSE 1
# define CV_SSE2 1
# if defined __SSE3__ || (defined _MSC_VER && _MSC_VER >= 1500)
# include <pmmintrin.h>
# define CV_SSE3 1
# endif
# if defined __SSSE3__ || (defined _MSC_VER && _MSC_VER >= 1500)
# include <tmmintrin.h>
# define CV_SSSE3 1
# endif
# if defined __SSE4_1__ || (defined _MSC_VER && _MSC_VER >= 1500)
# include <smmintrin.h>
# define CV_SSE4_1 1
# endif
# if defined __SSE4_2__ || (defined _MSC_VER && _MSC_VER >= 1500)
# include <nmmintrin.h>
# define CV_SSE4_2 1
# endif
# if defined __POPCNT__ || (defined _MSC_VER && _MSC_VER >= 1500)
# ifdef _MSC_VER
# include <nmmintrin.h>
# else
# include <popcntintrin.h>
# endif
# define CV_POPCNT 1
# endif
# if defined __AVX__ || (defined _MSC_VER && _MSC_VER >= 1600 && 0)
// MS Visual Studio 2010 (2012?) has no macro pre-defined to identify the use of /arch:AVX
// See: http://connect.microsoft.com/VisualStudio/feedback/details/605858/arch-avx-should-define-a-predefined-macro-in-x64-and-set-a-unique-value-for-m-ix86-fp-in-win32
# include <immintrin.h>
# define CV_AVX 1
# if defined(_XCR_XFEATURE_ENABLED_MASK)
# define __xgetbv() _xgetbv(_XCR_XFEATURE_ENABLED_MASK)
# else
# define __xgetbv() 0
# endif
# endif
# if defined __AVX2__ || (defined _MSC_VER && _MSC_VER >= 1800 && 0)
# include <immintrin.h>
# define CV_AVX2 1
# if defined __FMA__
# define CV_FMA3 1
# endif
# endif
#endif
#if (defined WIN32 || defined _WIN32) && defined(_M_ARM)
# include <Intrin.h>
# include "arm_neon.h"
# define CV_NEON 1
# define CPU_HAS_NEON_FEATURE (true)
#elif defined(__ARM_NEON__) || (defined (__ARM_NEON) && defined(__aarch64__))
# include <arm_neon.h>
# define CV_NEON 1
#endif
#if defined __GNUC__ && defined __arm__ && (defined __ARM_PCS_VFP || defined __ARM_VFPV3__ || defined __ARM_NEON__) && !defined __SOFTFP__
# define CV_VFP 1
#endif
#endif // __CUDACC__
#ifndef CV_POPCNT
#define CV_POPCNT 0
#endif
#ifndef CV_MMX
# define CV_MMX 0
#endif
#ifndef CV_SSE
# define CV_SSE 0
#endif
#ifndef CV_SSE2
# define CV_SSE2 0
#endif
#ifndef CV_SSE3
# define CV_SSE3 0
#endif
#ifndef CV_SSSE3
# define CV_SSSE3 0
#endif
#ifndef CV_SSE4_1
# define CV_SSE4_1 0
#endif
#ifndef CV_SSE4_2
# define CV_SSE4_2 0
#endif
#ifndef CV_AVX
# define CV_AVX 0
#endif
#ifndef CV_AVX2
# define CV_AVX2 0
#endif
#ifndef CV_FMA3
# define CV_FMA3 0
#endif
#ifndef CV_AVX_512F
# define CV_AVX_512F 0
#endif
#ifndef CV_AVX_512BW
# define CV_AVX_512BW 0
#endif
#ifndef CV_AVX_512CD
# define CV_AVX_512CD 0
#endif
#ifndef CV_AVX_512DQ
# define CV_AVX_512DQ 0
#endif
#ifndef CV_AVX_512ER
# define CV_AVX_512ER 0
#endif
#ifndef CV_AVX_512IFMA512
# define CV_AVX_512IFMA512 0
#endif
#ifndef CV_AVX_512PF
# define CV_AVX_512PF 0
#endif
#ifndef CV_AVX_512VBMI
# define CV_AVX_512VBMI 0
#endif
#ifndef CV_AVX_512VL
# define CV_AVX_512VL 0
#endif
#ifndef CV_NEON
# define CV_NEON 0
#endif
#ifndef CV_VFP
# define CV_VFP 0
#endif
/* fundamental constants */
#define CV_PI 3.1415926535897932384626433832795
#define CV_2PI 6.283185307179586476925286766559
#define CV_LOG2 0.69314718055994530941723212145818
typedef union Cv32suf
{
int i;
unsigned u;
float f;
}
Cv32suf;
typedef union Cv64suf
{
int64 i;
uint64 u;
double f;
}
Cv64suf;
#define OPENCV_ABI_COMPATIBILITY 300
@@ -169,12 +430,12 @@
#define CV_SUBMAT_FLAG (1 << CV_SUBMAT_FLAG_SHIFT)
#define CV_IS_SUBMAT(flags) ((flags) & CV_MAT_SUBMAT_FLAG)
/* Size of each channel item,
/** Size of each channel item,
0x8442211 = 1000 0100 0100 0010 0010 0001 0001 ~ array of sizeof(arr_type_elem) */
#define CV_ELEM_SIZE1(type) \
((((sizeof(size_t)<<28)|0x8442211) >> CV_MAT_DEPTH(type)*4) & 15)
/* 0x3a50 = 11 10 10 01 01 00 00 ~ array of log2(sizeof(arr_type_elem)) */
/** 0x3a50 = 11 10 10 01 01 00 00 ~ array of log2(sizeof(arr_type_elem)) */
#define CV_ELEM_SIZE(type) \
(CV_MAT_CN(type) << ((((sizeof(size_t)/4+1)*16384|0x3a50) >> CV_MAT_DEPTH(type)*2) & 3))
@@ -249,4 +510,6 @@
# endif
#endif
//! @}
#endif // __OPENCV_CORE_CVDEF_H__
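
As a point of reference, here is a minimal sketch of how the CPU-feature IDs and the element-size macros above are typically consumed at run time; the program is illustrative only and not part of this patch (it assumes an OpenCV 3.x build and uses cv::checkHardwareSupport from opencv2/core/utility.hpp):

#include <cstdio>
#include "opencv2/core/utility.hpp"   // cv::checkHardwareSupport; pulls in cvdef.h

int main()
{
    // CV_CPU_* constants index the run-time feature table filled by the library.
    std::printf("SSE2: %d  AVX2: %d  NEON: %d\n",
                (int)cv::checkHardwareSupport(CV_CPU_SSE2),
                (int)cv::checkHardwareSupport(CV_CPU_AVX2),
                (int)cv::checkHardwareSupport(CV_CPU_NEON));

    // CV_ELEM_SIZE1 gives bytes per channel, CV_ELEM_SIZE bytes per element.
    std::printf("CV_32FC3: %d bytes/channel, %d bytes/element\n",
                (int)CV_ELEM_SIZE1(CV_32FC3),    // 4
                (int)CV_ELEM_SIZE(CV_32FC3));    // 12
    return 0;
}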

View File

@@ -0,0 +1,302 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
// Copyright (C) 2015, Itseez Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#ifndef __OPENCV_CORE_FAST_MATH_HPP__
#define __OPENCV_CORE_FAST_MATH_HPP__
#include "opencv2/core/cvdef.h"
//! @addtogroup core_utils
//! @{
/****************************************************************************************\
* fast math *
\****************************************************************************************/
#if defined __BORLANDC__
# include <fastmath.h>
#elif defined __cplusplus
# include <cmath>
#else
# include <math.h>
#endif
#ifdef HAVE_TEGRA_OPTIMIZATION
# include "tegra_round.hpp"
#endif
#if CV_VFP
// 1. general scheme
#define ARM_ROUND(_value, _asm_string) \
int res; \
float temp; \
asm(_asm_string : [res] "=r" (res), [temp] "=w" (temp) : [value] "w" (_value)); \
return res
// 2. version for double
#ifdef __clang__
#define ARM_ROUND_DBL(value) ARM_ROUND(value, "vcvtr.s32.f64 %[temp], %[value] \n vmov %[res], %[temp]")
#else
#define ARM_ROUND_DBL(value) ARM_ROUND(value, "vcvtr.s32.f64 %[temp], %P[value] \n vmov %[res], %[temp]")
#endif
// 3. version for float
#define ARM_ROUND_FLT(value) ARM_ROUND(value, "vcvtr.s32.f32 %[temp], %[value]\n vmov %[res], %[temp]")
#endif // CV_VFP
/** @brief Rounds floating-point number to the nearest integer
@param value floating-point number. If the value is outside of INT_MIN ... INT_MAX range, the
result is not defined.
*/
CV_INLINE int
cvRound( double value )
{
#if ((defined _MSC_VER && defined _M_X64) || (defined __GNUC__ && defined __x86_64__ \
&& defined __SSE2__ && !defined __APPLE__)) && !defined(__CUDACC__)
__m128d t = _mm_set_sd( value );
return _mm_cvtsd_si32(t);
#elif defined _MSC_VER && defined _M_IX86
int t;
__asm
{
fld value;
fistp t;
}
return t;
#elif ((defined _MSC_VER && defined _M_ARM) || defined CV_ICC || \
defined __GNUC__) && defined HAVE_TEGRA_OPTIMIZATION
TEGRA_ROUND_DBL(value);
#elif defined CV_ICC || defined __GNUC__
# if CV_VFP
ARM_ROUND_DBL(value);
# else
return (int)lrint(value);
# endif
#else
/* it's ok if round does not comply with IEEE754 standard;
the tests should allow +/-1 difference when the tested functions use round */
return (int)(value + (value >= 0 ? 0.5 : -0.5));
#endif
}
/** @brief Rounds floating-point number to the nearest integer not larger than the original.
The function computes an integer i such that:
\f[i \le \texttt{value} < i+1\f]
@param value floating-point number. If the value is outside of INT_MIN ... INT_MAX range, the
result is not defined.
*/
CV_INLINE int cvFloor( double value )
{
#if (defined _MSC_VER && defined _M_X64 || (defined __GNUC__ && defined __SSE2__ && !defined __APPLE__)) && !defined(__CUDACC__)
__m128d t = _mm_set_sd( value );
int i = _mm_cvtsd_si32(t);
return i - _mm_movemask_pd(_mm_cmplt_sd(t, _mm_cvtsi32_sd(t,i)));
#elif defined __GNUC__
int i = (int)value;
return i - (i > value);
#else
int i = cvRound(value);
float diff = (float)(value - i);
return i - (diff < 0);
#endif
}
/** @brief Rounds floating-point number to the nearest integer not smaller than the original.
The function computes an integer i such that:
\f[i - 1 < \texttt{value} \le i\f]
@param value floating-point number. If the value is outside of INT_MIN ... INT_MAX range, the
result is not defined.
*/
CV_INLINE int cvCeil( double value )
{
#if (defined _MSC_VER && defined _M_X64 || (defined __GNUC__ && defined __SSE2__&& !defined __APPLE__)) && !defined(__CUDACC__)
__m128d t = _mm_set_sd( value );
int i = _mm_cvtsd_si32(t);
return i + _mm_movemask_pd(_mm_cmplt_sd(_mm_cvtsi32_sd(t,i), t));
#elif defined __GNUC__
int i = (int)value;
return i + (i < value);
#else
int i = cvRound(value);
float diff = (float)(i - value);
return i + (diff < 0);
#endif
}
/** @brief Determines if the argument is Not A Number.
@param value The input floating-point value
The function returns 1 if the argument is Not A Number (as defined by the IEEE 754 standard), 0
otherwise. */
CV_INLINE int cvIsNaN( double value )
{
Cv64suf ieee754;
ieee754.f = value;
return ((unsigned)(ieee754.u >> 32) & 0x7fffffff) +
((unsigned)ieee754.u != 0) > 0x7ff00000;
}
/** @brief Determines if the argument is Infinity.
@param value The input floating-point value
The function returns 1 if the argument is plus or minus infinity (as defined by the IEEE 754
standard) and 0 otherwise. */
CV_INLINE int cvIsInf( double value )
{
Cv64suf ieee754;
ieee754.f = value;
return ((unsigned)(ieee754.u >> 32) & 0x7fffffff) == 0x7ff00000 &&
(unsigned)ieee754.u == 0;
}
#ifdef __cplusplus
/** @overload */
CV_INLINE int cvRound(float value)
{
#if ((defined _MSC_VER && defined _M_X64) || (defined __GNUC__ && defined __x86_64__ && \
defined __SSE2__ && !defined __APPLE__)) && !defined(__CUDACC__)
__m128 t = _mm_set_ss( value );
return _mm_cvtss_si32(t);
#elif defined _MSC_VER && defined _M_IX86
int t;
__asm
{
fld value;
fistp t;
}
return t;
#elif ((defined _MSC_VER && defined _M_ARM) || defined CV_ICC || \
defined __GNUC__) && defined HAVE_TEGRA_OPTIMIZATION
TEGRA_ROUND_FLT(value);
#elif defined CV_ICC || defined __GNUC__
# if CV_VFP
ARM_ROUND_FLT(value);
# else
return (int)lrintf(value);
# endif
#else
/* it's ok if round does not comply with IEEE754 standard;
the tests should allow +/-1 difference when the tested functions use round */
return (int)(value + (value >= 0 ? 0.5f : -0.5f));
#endif
}
/** @overload */
CV_INLINE int cvRound( int value )
{
return value;
}
/** @overload */
CV_INLINE int cvFloor( float value )
{
#if (defined _MSC_VER && defined _M_X64 || (defined __GNUC__ && defined __SSE2__ && !defined __APPLE__)) && !defined(__CUDACC__)
__m128 t = _mm_set_ss( value );
int i = _mm_cvtss_si32(t);
return i - _mm_movemask_ps(_mm_cmplt_ss(t, _mm_cvtsi32_ss(t,i)));
#elif defined __GNUC__
int i = (int)value;
return i - (i > value);
#else
int i = cvRound(value);
float diff = (float)(value - i);
return i - (diff < 0);
#endif
}
/** @overload */
CV_INLINE int cvFloor( int value )
{
return value;
}
/** @overload */
CV_INLINE int cvCeil( float value )
{
#if (defined _MSC_VER && defined _M_X64 || (defined __GNUC__ && defined __SSE2__&& !defined __APPLE__)) && !defined(__CUDACC__)
__m128 t = _mm_set_ss( value );
int i = _mm_cvtss_si32(t);
return i + _mm_movemask_ps(_mm_cmplt_ss(_mm_cvtsi32_ss(t,i), t));
#elif defined __GNUC__
int i = (int)value;
return i + (i < value);
#else
int i = cvRound(value);
float diff = (float)(i - value);
return i + (diff < 0);
#endif
}
/** @overload */
CV_INLINE int cvCeil( int value )
{
return value;
}
/** @overload */
CV_INLINE int cvIsNaN( float value )
{
Cv32suf ieee754;
ieee754.f = value;
return (ieee754.u & 0x7fffffff) > 0x7f800000;
}
/** @overload */
CV_INLINE int cvIsInf( float value )
{
Cv32suf ieee754;
ieee754.f = value;
return (ieee754.u & 0x7fffffff) == 0x7f800000;
}
#endif // __cplusplus
//! @} core_utils
#endif
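
For reference, a short illustration of the rounding helpers declared above; this is a hypothetical test program, not part of the patch, and the half-way behaviour of cvRound depends on the active FPU/SSE rounding mode:

#include <cassert>
#include "opencv2/core/fast_math.hpp"

int main()
{
    assert(cvFloor( 2.7) ==  2 && cvCeil( 2.1) ==  3);
    assert(cvFloor(-2.1) == -3 && cvCeil(-2.7) == -2);
    assert(cvRound(2.3) == 2 && cvRound(2.7) == 3);
    // On the SSE2 path cvRound follows round-half-to-even,
    // so cvRound(2.5) == 2 while cvRound(3.5) == 4.
    assert(!cvIsNaN(1.0) && !cvIsInf(1.0));
    return 0;
}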

View File

@@ -0,0 +1,218 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
// Copyright (C) 2015, Itseez Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#ifndef __OPENCV_HAL_HPP__
#define __OPENCV_HAL_HPP__
#include "opencv2/core/cvdef.h"
#include "opencv2/core/hal/interface.h"
//! @cond IGNORED
#define CALL_HAL(name, fun, ...) \
int res = fun(__VA_ARGS__); \
if (res == CV_HAL_ERROR_OK) \
return; \
else if (res != CV_HAL_ERROR_NOT_IMPLEMENTED) \
CV_Error_(cv::Error::StsInternal, \
("HAL implementation " CVAUX_STR(name) " ==> " CVAUX_STR(fun) " returned %d (0x%08x)", res, res));
//! @endcond
namespace cv { namespace hal {
//! @addtogroup core_hal_functions
//! @{
CV_EXPORTS int normHamming(const uchar* a, int n);
CV_EXPORTS int normHamming(const uchar* a, const uchar* b, int n);
CV_EXPORTS int normHamming(const uchar* a, int n, int cellSize);
CV_EXPORTS int normHamming(const uchar* a, const uchar* b, int n, int cellSize);
CV_EXPORTS int LU32f(float* A, size_t astep, int m, float* b, size_t bstep, int n);
CV_EXPORTS int LU64f(double* A, size_t astep, int m, double* b, size_t bstep, int n);
CV_EXPORTS bool Cholesky32f(float* A, size_t astep, int m, float* b, size_t bstep, int n);
CV_EXPORTS bool Cholesky64f(double* A, size_t astep, int m, double* b, size_t bstep, int n);
CV_EXPORTS int normL1_(const uchar* a, const uchar* b, int n);
CV_EXPORTS float normL1_(const float* a, const float* b, int n);
CV_EXPORTS float normL2Sqr_(const float* a, const float* b, int n);
CV_EXPORTS void exp32f(const float* src, float* dst, int n);
CV_EXPORTS void exp64f(const double* src, double* dst, int n);
CV_EXPORTS void log32f(const float* src, float* dst, int n);
CV_EXPORTS void log64f(const double* src, double* dst, int n);
CV_EXPORTS void fastAtan2(const float* y, const float* x, float* dst, int n, bool angleInDegrees);
CV_EXPORTS void magnitude32f(const float* x, const float* y, float* dst, int n);
CV_EXPORTS void magnitude64f(const double* x, const double* y, double* dst, int n);
CV_EXPORTS void sqrt32f(const float* src, float* dst, int len);
CV_EXPORTS void sqrt64f(const double* src, double* dst, int len);
CV_EXPORTS void invSqrt32f(const float* src, float* dst, int len);
CV_EXPORTS void invSqrt64f(const double* src, double* dst, int len);
CV_EXPORTS void split8u(const uchar* src, uchar** dst, int len, int cn );
CV_EXPORTS void split16u(const ushort* src, ushort** dst, int len, int cn );
CV_EXPORTS void split32s(const int* src, int** dst, int len, int cn );
CV_EXPORTS void split64s(const int64* src, int64** dst, int len, int cn );
CV_EXPORTS void merge8u(const uchar** src, uchar* dst, int len, int cn );
CV_EXPORTS void merge16u(const ushort** src, ushort* dst, int len, int cn );
CV_EXPORTS void merge32s(const int** src, int* dst, int len, int cn );
CV_EXPORTS void merge64s(const int64** src, int64* dst, int len, int cn );
CV_EXPORTS void add8u( const uchar* src1, size_t step1, const uchar* src2, size_t step2, uchar* dst, size_t step, int width, int height, void* );
CV_EXPORTS void add8s( const schar* src1, size_t step1, const schar* src2, size_t step2, schar* dst, size_t step, int width, int height, void* );
CV_EXPORTS void add16u( const ushort* src1, size_t step1, const ushort* src2, size_t step2, ushort* dst, size_t step, int width, int height, void* );
CV_EXPORTS void add16s( const short* src1, size_t step1, const short* src2, size_t step2, short* dst, size_t step, int width, int height, void* );
CV_EXPORTS void add32s( const int* src1, size_t step1, const int* src2, size_t step2, int* dst, size_t step, int width, int height, void* );
CV_EXPORTS void add32f( const float* src1, size_t step1, const float* src2, size_t step2, float* dst, size_t step, int width, int height, void* );
CV_EXPORTS void add64f( const double* src1, size_t step1, const double* src2, size_t step2, double* dst, size_t step, int width, int height, void* );
CV_EXPORTS void sub8u( const uchar* src1, size_t step1, const uchar* src2, size_t step2, uchar* dst, size_t step, int width, int height, void* );
CV_EXPORTS void sub8s( const schar* src1, size_t step1, const schar* src2, size_t step2, schar* dst, size_t step, int width, int height, void* );
CV_EXPORTS void sub16u( const ushort* src1, size_t step1, const ushort* src2, size_t step2, ushort* dst, size_t step, int width, int height, void* );
CV_EXPORTS void sub16s( const short* src1, size_t step1, const short* src2, size_t step2, short* dst, size_t step, int width, int height, void* );
CV_EXPORTS void sub32s( const int* src1, size_t step1, const int* src2, size_t step2, int* dst, size_t step, int width, int height, void* );
CV_EXPORTS void sub32f( const float* src1, size_t step1, const float* src2, size_t step2, float* dst, size_t step, int width, int height, void* );
CV_EXPORTS void sub64f( const double* src1, size_t step1, const double* src2, size_t step2, double* dst, size_t step, int width, int height, void* );
CV_EXPORTS void max8u( const uchar* src1, size_t step1, const uchar* src2, size_t step2, uchar* dst, size_t step, int width, int height, void* );
CV_EXPORTS void max8s( const schar* src1, size_t step1, const schar* src2, size_t step2, schar* dst, size_t step, int width, int height, void* );
CV_EXPORTS void max16u( const ushort* src1, size_t step1, const ushort* src2, size_t step2, ushort* dst, size_t step, int width, int height, void* );
CV_EXPORTS void max16s( const short* src1, size_t step1, const short* src2, size_t step2, short* dst, size_t step, int width, int height, void* );
CV_EXPORTS void max32s( const int* src1, size_t step1, const int* src2, size_t step2, int* dst, size_t step, int width, int height, void* );
CV_EXPORTS void max32f( const float* src1, size_t step1, const float* src2, size_t step2, float* dst, size_t step, int width, int height, void* );
CV_EXPORTS void max64f( const double* src1, size_t step1, const double* src2, size_t step2, double* dst, size_t step, int width, int height, void* );
CV_EXPORTS void min8u( const uchar* src1, size_t step1, const uchar* src2, size_t step2, uchar* dst, size_t step, int width, int height, void* );
CV_EXPORTS void min8s( const schar* src1, size_t step1, const schar* src2, size_t step2, schar* dst, size_t step, int width, int height, void* );
CV_EXPORTS void min16u( const ushort* src1, size_t step1, const ushort* src2, size_t step2, ushort* dst, size_t step, int width, int height, void* );
CV_EXPORTS void min16s( const short* src1, size_t step1, const short* src2, size_t step2, short* dst, size_t step, int width, int height, void* );
CV_EXPORTS void min32s( const int* src1, size_t step1, const int* src2, size_t step2, int* dst, size_t step, int width, int height, void* );
CV_EXPORTS void min32f( const float* src1, size_t step1, const float* src2, size_t step2, float* dst, size_t step, int width, int height, void* );
CV_EXPORTS void min64f( const double* src1, size_t step1, const double* src2, size_t step2, double* dst, size_t step, int width, int height, void* );
CV_EXPORTS void absdiff8u( const uchar* src1, size_t step1, const uchar* src2, size_t step2, uchar* dst, size_t step, int width, int height, void* );
CV_EXPORTS void absdiff8s( const schar* src1, size_t step1, const schar* src2, size_t step2, schar* dst, size_t step, int width, int height, void* );
CV_EXPORTS void absdiff16u( const ushort* src1, size_t step1, const ushort* src2, size_t step2, ushort* dst, size_t step, int width, int height, void* );
CV_EXPORTS void absdiff16s( const short* src1, size_t step1, const short* src2, size_t step2, short* dst, size_t step, int width, int height, void* );
CV_EXPORTS void absdiff32s( const int* src1, size_t step1, const int* src2, size_t step2, int* dst, size_t step, int width, int height, void* );
CV_EXPORTS void absdiff32f( const float* src1, size_t step1, const float* src2, size_t step2, float* dst, size_t step, int width, int height, void* );
CV_EXPORTS void absdiff64f( const double* src1, size_t step1, const double* src2, size_t step2, double* dst, size_t step, int width, int height, void* );
CV_EXPORTS void and8u( const uchar* src1, size_t step1, const uchar* src2, size_t step2, uchar* dst, size_t step, int width, int height, void* );
CV_EXPORTS void or8u( const uchar* src1, size_t step1, const uchar* src2, size_t step2, uchar* dst, size_t step, int width, int height, void* );
CV_EXPORTS void xor8u( const uchar* src1, size_t step1, const uchar* src2, size_t step2, uchar* dst, size_t step, int width, int height, void* );
CV_EXPORTS void not8u( const uchar* src1, size_t step1, const uchar* src2, size_t step2, uchar* dst, size_t step, int width, int height, void* );
CV_EXPORTS void cmp8u(const uchar* src1, size_t step1, const uchar* src2, size_t step2, uchar* dst, size_t step, int width, int height, void* _cmpop);
CV_EXPORTS void cmp8s(const schar* src1, size_t step1, const schar* src2, size_t step2, uchar* dst, size_t step, int width, int height, void* _cmpop);
CV_EXPORTS void cmp16u(const ushort* src1, size_t step1, const ushort* src2, size_t step2, uchar* dst, size_t step, int width, int height, void* _cmpop);
CV_EXPORTS void cmp16s(const short* src1, size_t step1, const short* src2, size_t step2, uchar* dst, size_t step, int width, int height, void* _cmpop);
CV_EXPORTS void cmp32s(const int* src1, size_t step1, const int* src2, size_t step2, uchar* dst, size_t step, int width, int height, void* _cmpop);
CV_EXPORTS void cmp32f(const float* src1, size_t step1, const float* src2, size_t step2, uchar* dst, size_t step, int width, int height, void* _cmpop);
CV_EXPORTS void cmp64f(const double* src1, size_t step1, const double* src2, size_t step2, uchar* dst, size_t step, int width, int height, void* _cmpop);
CV_EXPORTS void mul8u( const uchar* src1, size_t step1, const uchar* src2, size_t step2, uchar* dst, size_t step, int width, int height, void* scale);
CV_EXPORTS void mul8s( const schar* src1, size_t step1, const schar* src2, size_t step2, schar* dst, size_t step, int width, int height, void* scale);
CV_EXPORTS void mul16u( const ushort* src1, size_t step1, const ushort* src2, size_t step2, ushort* dst, size_t step, int width, int height, void* scale);
CV_EXPORTS void mul16s( const short* src1, size_t step1, const short* src2, size_t step2, short* dst, size_t step, int width, int height, void* scale);
CV_EXPORTS void mul32s( const int* src1, size_t step1, const int* src2, size_t step2, int* dst, size_t step, int width, int height, void* scale);
CV_EXPORTS void mul32f( const float* src1, size_t step1, const float* src2, size_t step2, float* dst, size_t step, int width, int height, void* scale);
CV_EXPORTS void mul64f( const double* src1, size_t step1, const double* src2, size_t step2, double* dst, size_t step, int width, int height, void* scale);
CV_EXPORTS void div8u( const uchar* src1, size_t step1, const uchar* src2, size_t step2, uchar* dst, size_t step, int width, int height, void* scale);
CV_EXPORTS void div8s( const schar* src1, size_t step1, const schar* src2, size_t step2, schar* dst, size_t step, int width, int height, void* scale);
CV_EXPORTS void div16u( const ushort* src1, size_t step1, const ushort* src2, size_t step2, ushort* dst, size_t step, int width, int height, void* scale);
CV_EXPORTS void div16s( const short* src1, size_t step1, const short* src2, size_t step2, short* dst, size_t step, int width, int height, void* scale);
CV_EXPORTS void div32s( const int* src1, size_t step1, const int* src2, size_t step2, int* dst, size_t step, int width, int height, void* scale);
CV_EXPORTS void div32f( const float* src1, size_t step1, const float* src2, size_t step2, float* dst, size_t step, int width, int height, void* scale);
CV_EXPORTS void div64f( const double* src1, size_t step1, const double* src2, size_t step2, double* dst, size_t step, int width, int height, void* scale);
CV_EXPORTS void recip8u( const uchar* src1, size_t step1, const uchar* src2, size_t step2, uchar* dst, size_t step, int width, int height, void* scale);
CV_EXPORTS void recip8s( const schar* src1, size_t step1, const schar* src2, size_t step2, schar* dst, size_t step, int width, int height, void* scale);
CV_EXPORTS void recip16u( const ushort* src1, size_t step1, const ushort* src2, size_t step2, ushort* dst, size_t step, int width, int height, void* scale);
CV_EXPORTS void recip16s( const short* src1, size_t step1, const short* src2, size_t step2, short* dst, size_t step, int width, int height, void* scale);
CV_EXPORTS void recip32s( const int* src1, size_t step1, const int* src2, size_t step2, int* dst, size_t step, int width, int height, void* scale);
CV_EXPORTS void recip32f( const float* src1, size_t step1, const float* src2, size_t step2, float* dst, size_t step, int width, int height, void* scale);
CV_EXPORTS void recip64f( const double* src1, size_t step1, const double* src2, size_t step2, double* dst, size_t step, int width, int height, void* scale);
CV_EXPORTS void addWeighted8u( const uchar* src1, size_t step1, const uchar* src2, size_t step2, uchar* dst, size_t step, int width, int height, void* _scalars );
CV_EXPORTS void addWeighted8s( const schar* src1, size_t step1, const schar* src2, size_t step2, schar* dst, size_t step, int width, int height, void* scalars );
CV_EXPORTS void addWeighted16u( const ushort* src1, size_t step1, const ushort* src2, size_t step2, ushort* dst, size_t step, int width, int height, void* scalars );
CV_EXPORTS void addWeighted16s( const short* src1, size_t step1, const short* src2, size_t step2, short* dst, size_t step, int width, int height, void* scalars );
CV_EXPORTS void addWeighted32s( const int* src1, size_t step1, const int* src2, size_t step2, int* dst, size_t step, int width, int height, void* scalars );
CV_EXPORTS void addWeighted32f( const float* src1, size_t step1, const float* src2, size_t step2, float* dst, size_t step, int width, int height, void* scalars );
CV_EXPORTS void addWeighted64f( const double* src1, size_t step1, const double* src2, size_t step2, double* dst, size_t step, int width, int height, void* scalars );
//! @} core_hal_functions
//=============================================================================
// for binary compatibility with 3.0
//! @cond IGNORED
CV_EXPORTS int LU(float* A, size_t astep, int m, float* b, size_t bstep, int n);
CV_EXPORTS int LU(double* A, size_t astep, int m, double* b, size_t bstep, int n);
CV_EXPORTS bool Cholesky(float* A, size_t astep, int m, float* b, size_t bstep, int n);
CV_EXPORTS bool Cholesky(double* A, size_t astep, int m, double* b, size_t bstep, int n);
CV_EXPORTS void exp(const float* src, float* dst, int n);
CV_EXPORTS void exp(const double* src, double* dst, int n);
CV_EXPORTS void log(const float* src, float* dst, int n);
CV_EXPORTS void log(const double* src, double* dst, int n);
CV_EXPORTS void magnitude(const float* x, const float* y, float* dst, int n);
CV_EXPORTS void magnitude(const double* x, const double* y, double* dst, int n);
CV_EXPORTS void sqrt(const float* src, float* dst, int len);
CV_EXPORTS void sqrt(const double* src, double* dst, int len);
CV_EXPORTS void invSqrt(const float* src, float* dst, int len);
CV_EXPORTS void invSqrt(const double* src, double* dst, int len);
//! @endcond
}} //cv::hal
#endif //__OPENCV_HAL_HPP__
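
A hedged usage sketch of the low-level entry points declared above (illustrative only; the include path is assumed to match where this header is installed, and the program links against opencv_core):

#include <cstdio>
#include <vector>
#include "opencv2/core/hal/hal.hpp"   // assumed install path of this header

int main()
{
    // Hamming distance over raw byte buffers: 0xAA ^ 0xFF has 4 set bits per byte.
    std::vector<uchar> a(64, 0xAA), b(64, 0xFF);
    int dist = cv::hal::normHamming(a.data(), b.data(), (int)a.size());
    std::printf("Hamming distance: %d\n", dist);     // 4 * 64 = 256

    // Element-wise exponent on a plain float array.
    float x[4] = { 0.f, 1.f, 2.f, 3.f }, y[4];
    cv::hal::exp32f(x, y, 4);
    std::printf("exp(1) ~= %g\n", y[1]);             // ~2.71828
    return 0;
}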

View File

@@ -0,0 +1,69 @@
#ifndef _HAL_INTERFACE_HPP_INCLUDED_
#define _HAL_INTERFACE_HPP_INCLUDED_
//! @addtogroup core_hal_interface
//! @{
#define CV_HAL_ERROR_OK 0
#define CV_HAL_ERROR_NOT_IMPLEMENTED 1
#define CV_HAL_ERROR_UNKNOWN -1
#define CV_HAL_CMP_EQ 0
#define CV_HAL_CMP_GT 1
#define CV_HAL_CMP_GE 2
#define CV_HAL_CMP_LT 3
#define CV_HAL_CMP_LE 4
#define CV_HAL_CMP_NE 5
#ifdef __cplusplus
#include <cstddef>
#else
#include <stddef.h>
#endif
/* primitive types */
/*
schar - signed 1 byte integer
uchar - unsigned 1 byte integer
short - signed 2 byte integer
ushort - unsigned 2 byte integer
int - signed 4 byte integer
uint - unsigned 4 byte integer
int64 - signed 8 byte integer
uint64 - unsigned 8 byte integer
*/
#if !defined _MSC_VER && !defined __BORLANDC__
# if defined __cplusplus && __cplusplus >= 201103L && !defined __APPLE__
# include <cstdint>
typedef std::uint32_t uint;
# else
# include <stdint.h>
typedef uint32_t uint;
# endif
#else
typedef unsigned uint;
#endif
typedef signed char schar;
#ifndef __IPL_H__
typedef unsigned char uchar;
typedef unsigned short ushort;
#endif
#if defined _MSC_VER || defined __BORLANDC__
typedef __int64 int64;
typedef unsigned __int64 uint64;
# define CV_BIG_INT(n) n##I64
# define CV_BIG_UINT(n) n##UI64
#else
typedef int64_t int64;
typedef uint64_t uint64;
# define CV_BIG_INT(n) n##LL
# define CV_BIG_UINT(n) n##ULL
#endif
//! @}
#endif
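
To make the return-code contract above concrete, here is a hedged sketch of what a user-side accelerated routine could look like; the function name and the way it would be wired into OpenCV are hypothetical, only the status codes and typedefs come from this header:

#include "opencv2/core/hal/interface.h"

// Hypothetical accelerated 8-bit addition. Returning CV_HAL_ERROR_NOT_IMPLEMENTED
// tells the caller to fall back to OpenCV's built-in code path.
static int my_hal_add8u(const uchar* src1, size_t step1,
                        const uchar* src2, size_t step2,
                        uchar* dst, size_t step, int width, int height)
{
    if (width < 64)                    // too small to be worth a custom path
        return CV_HAL_ERROR_NOT_IMPLEMENTED;

    for (int y = 0; y < height; ++y)
    {
        const uchar* r1 = src1 + y * step1;
        const uchar* r2 = src2 + y * step2;
        uchar* d = dst + y * step;
        for (int x = 0; x < width; ++x)
        {
            int v = r1[x] + r2[x];
            d[x] = (uchar)(v > 255 ? 255 : v);   // saturate, as cv::add does
        }
    }
    return CV_HAL_ERROR_OK;
}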

View File

@@ -0,0 +1,320 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
// Copyright (C) 2015, Itseez Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#ifndef __OPENCV_HAL_INTRIN_HPP__
#define __OPENCV_HAL_INTRIN_HPP__
#include <cmath>
#include <float.h>
#include <stdlib.h>
#include "opencv2/core/cvdef.h"
#define OPENCV_HAL_ADD(a, b) ((a) + (b))
#define OPENCV_HAL_AND(a, b) ((a) & (b))
#define OPENCV_HAL_NOP(a) (a)
#define OPENCV_HAL_1ST(a, b) (a)
// Unlike the HAL API, which lives in cv::hal,
// the universal intrinsics are put directly into the cv namespace
// so they are easier to use from within OpenCV code
namespace cv {
//! @addtogroup core_hal_intrin
//! @{
//! @cond IGNORED
template<typename _Tp> struct V_TypeTraits
{
typedef _Tp int_type;
typedef _Tp uint_type;
typedef _Tp abs_type;
typedef _Tp sum_type;
enum { delta = 0, shift = 0 };
static int_type reinterpret_int(_Tp x) { return x; }
static uint_type reinterpret_uint(_Tp x) { return x; }
static _Tp reinterpret_from_int(int_type x) { return (_Tp)x; }
};
template<> struct V_TypeTraits<uchar>
{
typedef uchar value_type;
typedef schar int_type;
typedef uchar uint_type;
typedef uchar abs_type;
typedef int sum_type;
typedef ushort w_type;
typedef unsigned q_type;
enum { delta = 128, shift = 8 };
static int_type reinterpret_int(value_type x) { return (int_type)x; }
static uint_type reinterpret_uint(value_type x) { return (uint_type)x; }
static value_type reinterpret_from_int(int_type x) { return (value_type)x; }
};
template<> struct V_TypeTraits<schar>
{
typedef schar value_type;
typedef schar int_type;
typedef uchar uint_type;
typedef uchar abs_type;
typedef int sum_type;
typedef short w_type;
typedef int q_type;
enum { delta = 128, shift = 8 };
static int_type reinterpret_int(value_type x) { return (int_type)x; }
static uint_type reinterpret_uint(value_type x) { return (uint_type)x; }
static value_type reinterpret_from_int(int_type x) { return (value_type)x; }
};
template<> struct V_TypeTraits<ushort>
{
typedef ushort value_type;
typedef short int_type;
typedef ushort uint_type;
typedef ushort abs_type;
typedef int sum_type;
typedef unsigned w_type;
typedef uchar nu_type;
enum { delta = 32768, shift = 16 };
static int_type reinterpret_int(value_type x) { return (int_type)x; }
static uint_type reinterpret_uint(value_type x) { return (uint_type)x; }
static value_type reinterpret_from_int(int_type x) { return (value_type)x; }
};
template<> struct V_TypeTraits<short>
{
typedef short value_type;
typedef short int_type;
typedef ushort uint_type;
typedef ushort abs_type;
typedef int sum_type;
typedef int w_type;
typedef uchar nu_type;
typedef schar n_type;
enum { delta = 128, shift = 8 };
static int_type reinterpret_int(value_type x) { return (int_type)x; }
static uint_type reinterpret_uint(value_type x) { return (uint_type)x; }
static value_type reinterpret_from_int(int_type x) { return (value_type)x; }
};
template<> struct V_TypeTraits<unsigned>
{
typedef unsigned value_type;
typedef int int_type;
typedef unsigned uint_type;
typedef unsigned abs_type;
typedef unsigned sum_type;
typedef uint64 w_type;
typedef ushort nu_type;
static int_type reinterpret_int(value_type x) { return (int_type)x; }
static uint_type reinterpret_uint(value_type x) { return (uint_type)x; }
static value_type reinterpret_from_int(int_type x) { return (value_type)x; }
};
template<> struct V_TypeTraits<int>
{
typedef int value_type;
typedef int int_type;
typedef unsigned uint_type;
typedef unsigned abs_type;
typedef int sum_type;
typedef int64 w_type;
typedef short n_type;
typedef ushort nu_type;
static int_type reinterpret_int(value_type x) { return (int_type)x; }
static uint_type reinterpret_uint(value_type x) { return (uint_type)x; }
static value_type reinterpret_from_int(int_type x) { return (value_type)x; }
};
template<> struct V_TypeTraits<uint64>
{
typedef uint64 value_type;
typedef int64 int_type;
typedef uint64 uint_type;
typedef uint64 abs_type;
typedef uint64 sum_type;
typedef unsigned nu_type;
static int_type reinterpret_int(value_type x) { return (int_type)x; }
static uint_type reinterpret_uint(value_type x) { return (uint_type)x; }
static value_type reinterpret_from_int(int_type x) { return (value_type)x; }
};
template<> struct V_TypeTraits<int64>
{
typedef int64 value_type;
typedef int64 int_type;
typedef uint64 uint_type;
typedef uint64 abs_type;
typedef int64 sum_type;
typedef int nu_type;
static int_type reinterpret_int(value_type x) { return (int_type)x; }
static uint_type reinterpret_uint(value_type x) { return (uint_type)x; }
static value_type reinterpret_from_int(int_type x) { return (value_type)x; }
};
template<> struct V_TypeTraits<float>
{
typedef float value_type;
typedef int int_type;
typedef unsigned uint_type;
typedef float abs_type;
typedef float sum_type;
typedef double w_type;
static int_type reinterpret_int(value_type x)
{
Cv32suf u;
u.f = x;
return u.i;
}
static uint_type reinterpret_uint(value_type x)
{
Cv32suf u;
u.f = x;
return u.u;
}
static value_type reinterpret_from_int(int_type x)
{
Cv32suf u;
u.i = x;
return u.f;
}
};
template<> struct V_TypeTraits<double>
{
typedef double value_type;
typedef int64 int_type;
typedef uint64 uint_type;
typedef double abs_type;
typedef double sum_type;
static int_type reinterpret_int(value_type x)
{
Cv64suf u;
u.f = x;
return u.i;
}
static uint_type reinterpret_uint(value_type x)
{
Cv64suf u;
u.f = x;
return u.u;
}
static value_type reinterpret_from_int(int_type x)
{
Cv64suf u;
u.i = x;
return u.f;
}
};
template <typename T> struct V_SIMD128Traits
{
enum { nlanes = 16 / sizeof(T) };
};
//! @endcond
//! @}
}
#ifdef CV_DOXYGEN
# undef CV_SSE2
# undef CV_NEON
#endif
#if CV_SSE2
#include "opencv2/core/hal/intrin_sse.hpp"
#elif CV_NEON
#include "opencv2/core/hal/intrin_neon.hpp"
#else
#include "opencv2/core/hal/intrin_cpp.hpp"
#endif
//! @addtogroup core_hal_intrin
//! @{
#ifndef CV_SIMD128
//! Set to 1 if the current compiler configuration supports vector extensions (NEON or SSE2 is enabled)
#define CV_SIMD128 0
#endif
#ifndef CV_SIMD128_64F
//! Set to 1 if current intrinsics implementation supports 64-bit float vectors
#define CV_SIMD128_64F 0
#endif
//! @}
#endif
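
A small usage sketch of the universal-intrinsics front end (hypothetical example; the same code builds against the SSE, NEON, or plain C++ backend selected at the end of this header):

#include <cstdio>
#include "opencv2/core/hal/intrin.hpp"

int main()
{
#if CV_SIMD128
    // 16 bytes per register, i.e. 4 float lanes (see V_SIMD128Traits above).
    cv::v_float32x4 a(1.f, 2.f, 3.f, 4.f), b(10.f, 20.f, 30.f, 40.f);
    cv::v_float32x4 c = a + b;               // lane-wise addition
    float buf[4];
    cv::v_store(buf, c);                     // store the 4 lanes back to memory
    std::printf("%g %g %g %g\n", buf[0], buf[1], buf[2], buf[3]);
#else
    std::printf("no 128-bit SIMD backend is enabled\n");
#endif
    return 0;
}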

File diff suppressed because it is too large

View File

@@ -0,0 +1,864 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
// Copyright (C) 2015, Itseez Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#ifndef __OPENCV_HAL_INTRIN_NEON_HPP__
#define __OPENCV_HAL_INTRIN_NEON_HPP__
#include <algorithm>
namespace cv
{
//! @cond IGNORED
#define CV_SIMD128 1
struct v_uint8x16
{
typedef uchar lane_type;
enum { nlanes = 16 };
v_uint8x16() {}
explicit v_uint8x16(uint8x16_t v) : val(v) {}
v_uint8x16(uchar v0, uchar v1, uchar v2, uchar v3, uchar v4, uchar v5, uchar v6, uchar v7,
uchar v8, uchar v9, uchar v10, uchar v11, uchar v12, uchar v13, uchar v14, uchar v15)
{
uchar v[] = {v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15};
val = vld1q_u8(v);
}
uchar get0() const
{
return vgetq_lane_u8(val, 0);
}
uint8x16_t val;
};
struct v_int8x16
{
typedef schar lane_type;
enum { nlanes = 16 };
v_int8x16() {}
explicit v_int8x16(int8x16_t v) : val(v) {}
v_int8x16(schar v0, schar v1, schar v2, schar v3, schar v4, schar v5, schar v6, schar v7,
schar v8, schar v9, schar v10, schar v11, schar v12, schar v13, schar v14, schar v15)
{
schar v[] = {v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15};
val = vld1q_s8(v);
}
schar get0() const
{
return vgetq_lane_s8(val, 0);
}
int8x16_t val;
};
struct v_uint16x8
{
typedef ushort lane_type;
enum { nlanes = 8 };
v_uint16x8() {}
explicit v_uint16x8(uint16x8_t v) : val(v) {}
v_uint16x8(ushort v0, ushort v1, ushort v2, ushort v3, ushort v4, ushort v5, ushort v6, ushort v7)
{
ushort v[] = {v0, v1, v2, v3, v4, v5, v6, v7};
val = vld1q_u16(v);
}
ushort get0() const
{
return vgetq_lane_u16(val, 0);
}
uint16x8_t val;
};
struct v_int16x8
{
typedef short lane_type;
enum { nlanes = 8 };
v_int16x8() {}
explicit v_int16x8(int16x8_t v) : val(v) {}
v_int16x8(short v0, short v1, short v2, short v3, short v4, short v5, short v6, short v7)
{
short v[] = {v0, v1, v2, v3, v4, v5, v6, v7};
val = vld1q_s16(v);
}
short get0() const
{
return vgetq_lane_s16(val, 0);
}
int16x8_t val;
};
struct v_uint32x4
{
typedef unsigned lane_type;
enum { nlanes = 4 };
v_uint32x4() {}
explicit v_uint32x4(uint32x4_t v) : val(v) {}
v_uint32x4(unsigned v0, unsigned v1, unsigned v2, unsigned v3)
{
unsigned v[] = {v0, v1, v2, v3};
val = vld1q_u32(v);
}
unsigned get0() const
{
return vgetq_lane_u32(val, 0);
}
uint32x4_t val;
};
struct v_int32x4
{
typedef int lane_type;
enum { nlanes = 4 };
v_int32x4() {}
explicit v_int32x4(int32x4_t v) : val(v) {}
v_int32x4(int v0, int v1, int v2, int v3)
{
int v[] = {v0, v1, v2, v3};
val = vld1q_s32(v);
}
int get0() const
{
return vgetq_lane_s32(val, 0);
}
int32x4_t val;
};
struct v_float32x4
{
typedef float lane_type;
enum { nlanes = 4 };
v_float32x4() {}
explicit v_float32x4(float32x4_t v) : val(v) {}
v_float32x4(float v0, float v1, float v2, float v3)
{
float v[] = {v0, v1, v2, v3};
val = vld1q_f32(v);
}
float get0() const
{
return vgetq_lane_f32(val, 0);
}
float32x4_t val;
};
struct v_uint64x2
{
typedef uint64 lane_type;
enum { nlanes = 2 };
v_uint64x2() {}
explicit v_uint64x2(uint64x2_t v) : val(v) {}
v_uint64x2(uint64 v0, uint64 v1)
{
uint64 v[] = {v0, v1};
val = vld1q_u64(v);
}
uint64 get0() const
{
return vgetq_lane_u64(val, 0);
}
uint64x2_t val;
};
struct v_int64x2
{
typedef int64 lane_type;
enum { nlanes = 2 };
v_int64x2() {}
explicit v_int64x2(int64x2_t v) : val(v) {}
v_int64x2(int64 v0, int64 v1)
{
int64 v[] = {v0, v1};
val = vld1q_s64(v);
}
int64 get0() const
{
return vgetq_lane_s64(val, 0);
}
int64x2_t val;
};
#define OPENCV_HAL_IMPL_NEON_INIT(_Tpv, _Tp, suffix) \
inline v_##_Tpv v_setzero_##suffix() { return v_##_Tpv(vdupq_n_##suffix((_Tp)0)); } \
inline v_##_Tpv v_setall_##suffix(_Tp v) { return v_##_Tpv(vdupq_n_##suffix(v)); } \
inline _Tpv##_t vreinterpretq_##suffix##_##suffix(_Tpv##_t v) { return v; } \
inline v_uint8x16 v_reinterpret_as_u8(const v_##_Tpv& v) { return v_uint8x16(vreinterpretq_u8_##suffix(v.val)); } \
inline v_int8x16 v_reinterpret_as_s8(const v_##_Tpv& v) { return v_int8x16(vreinterpretq_s8_##suffix(v.val)); } \
inline v_uint16x8 v_reinterpret_as_u16(const v_##_Tpv& v) { return v_uint16x8(vreinterpretq_u16_##suffix(v.val)); } \
inline v_int16x8 v_reinterpret_as_s16(const v_##_Tpv& v) { return v_int16x8(vreinterpretq_s16_##suffix(v.val)); } \
inline v_uint32x4 v_reinterpret_as_u32(const v_##_Tpv& v) { return v_uint32x4(vreinterpretq_u32_##suffix(v.val)); } \
inline v_int32x4 v_reinterpret_as_s32(const v_##_Tpv& v) { return v_int32x4(vreinterpretq_s32_##suffix(v.val)); } \
inline v_uint64x2 v_reinterpret_as_u64(const v_##_Tpv& v) { return v_uint64x2(vreinterpretq_u64_##suffix(v.val)); } \
inline v_int64x2 v_reinterpret_as_s64(const v_##_Tpv& v) { return v_int64x2(vreinterpretq_s64_##suffix(v.val)); } \
inline v_float32x4 v_reinterpret_as_f32(const v_##_Tpv& v) { return v_float32x4(vreinterpretq_f32_##suffix(v.val)); }
OPENCV_HAL_IMPL_NEON_INIT(uint8x16, uchar, u8)
OPENCV_HAL_IMPL_NEON_INIT(int8x16, schar, s8)
OPENCV_HAL_IMPL_NEON_INIT(uint16x8, ushort, u16)
OPENCV_HAL_IMPL_NEON_INIT(int16x8, short, s16)
OPENCV_HAL_IMPL_NEON_INIT(uint32x4, unsigned, u32)
OPENCV_HAL_IMPL_NEON_INIT(int32x4, int, s32)
OPENCV_HAL_IMPL_NEON_INIT(uint64x2, uint64, u64)
OPENCV_HAL_IMPL_NEON_INIT(int64x2, int64, s64)
OPENCV_HAL_IMPL_NEON_INIT(float32x4, float, f32)
#define OPENCV_HAL_IMPL_NEON_PACK(_Tpvec, _Tp, hreg, suffix, _Tpwvec, wsuffix, pack, op) \
inline _Tpvec v_##pack(const _Tpwvec& a, const _Tpwvec& b) \
{ \
hreg a1 = vqmov##op##_##wsuffix(a.val), b1 = vqmov##op##_##wsuffix(b.val); \
return _Tpvec(vcombine_##suffix(a1, b1)); \
} \
inline void v_##pack##_store(_Tp* ptr, const _Tpwvec& a) \
{ \
hreg a1 = vqmov##op##_##wsuffix(a.val); \
vst1_##suffix(ptr, a1); \
} \
template<int n> inline \
_Tpvec v_rshr_##pack(const _Tpwvec& a, const _Tpwvec& b) \
{ \
hreg a1 = vqrshr##op##_n_##wsuffix(a.val, n); \
hreg b1 = vqrshr##op##_n_##wsuffix(b.val, n); \
return _Tpvec(vcombine_##suffix(a1, b1)); \
} \
template<int n> inline \
void v_rshr_##pack##_store(_Tp* ptr, const _Tpwvec& a) \
{ \
hreg a1 = vqrshr##op##_n_##wsuffix(a.val, n); \
vst1_##suffix(ptr, a1); \
}
OPENCV_HAL_IMPL_NEON_PACK(v_uint8x16, uchar, uint8x8_t, u8, v_uint16x8, u16, pack, n)
OPENCV_HAL_IMPL_NEON_PACK(v_int8x16, schar, int8x8_t, s8, v_int16x8, s16, pack, n)
OPENCV_HAL_IMPL_NEON_PACK(v_uint16x8, ushort, uint16x4_t, u16, v_uint32x4, u32, pack, n)
OPENCV_HAL_IMPL_NEON_PACK(v_int16x8, short, int16x4_t, s16, v_int32x4, s32, pack, n)
OPENCV_HAL_IMPL_NEON_PACK(v_uint32x4, unsigned, uint32x2_t, u32, v_uint64x2, u64, pack, n)
OPENCV_HAL_IMPL_NEON_PACK(v_int32x4, int, int32x2_t, s32, v_int64x2, s64, pack, n)
OPENCV_HAL_IMPL_NEON_PACK(v_uint8x16, uchar, uint8x8_t, u8, v_int16x8, s16, pack_u, un)
OPENCV_HAL_IMPL_NEON_PACK(v_uint16x8, ushort, uint16x4_t, u16, v_int32x4, s32, pack_u, un)
inline v_float32x4 v_matmul(const v_float32x4& v, const v_float32x4& m0,
const v_float32x4& m1, const v_float32x4& m2,
const v_float32x4& m3)
{
float32x2_t vl = vget_low_f32(v.val), vh = vget_high_f32(v.val);
float32x4_t res = vmulq_lane_f32(m0.val, vl, 0);
res = vmlaq_lane_f32(res, m1.val, vl, 1);
res = vmlaq_lane_f32(res, m2.val, vh, 0);
res = vmlaq_lane_f32(res, m3.val, vh, 1);
return v_float32x4(res);
}
#define OPENCV_HAL_IMPL_NEON_BIN_OP(bin_op, _Tpvec, intrin) \
inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \
{ \
return _Tpvec(intrin(a.val, b.val)); \
} \
inline _Tpvec& operator bin_op##= (_Tpvec& a, const _Tpvec& b) \
{ \
a.val = intrin(a.val, b.val); \
return a; \
}
OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_uint8x16, vqaddq_u8)
OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_uint8x16, vqsubq_u8)
OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_int8x16, vqaddq_s8)
OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_int8x16, vqsubq_s8)
OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_uint16x8, vqaddq_u16)
OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_uint16x8, vqsubq_u16)
OPENCV_HAL_IMPL_NEON_BIN_OP(*, v_uint16x8, vmulq_u16)
OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_int16x8, vqaddq_s16)
OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_int16x8, vqsubq_s16)
OPENCV_HAL_IMPL_NEON_BIN_OP(*, v_int16x8, vmulq_s16)
OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_int32x4, vaddq_s32)
OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_int32x4, vsubq_s32)
OPENCV_HAL_IMPL_NEON_BIN_OP(*, v_int32x4, vmulq_s32)
OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_uint32x4, vaddq_u32)
OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_uint32x4, vsubq_u32)
OPENCV_HAL_IMPL_NEON_BIN_OP(*, v_uint32x4, vmulq_u32)
OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_float32x4, vaddq_f32)
OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_float32x4, vsubq_f32)
OPENCV_HAL_IMPL_NEON_BIN_OP(*, v_float32x4, vmulq_f32)
OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_int64x2, vaddq_s64)
OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_int64x2, vsubq_s64)
OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_uint64x2, vaddq_u64)
OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_uint64x2, vsubq_u64)
inline v_float32x4 operator / (const v_float32x4& a, const v_float32x4& b)
{
float32x4_t reciprocal = vrecpeq_f32(b.val);
reciprocal = vmulq_f32(vrecpsq_f32(b.val, reciprocal), reciprocal);
reciprocal = vmulq_f32(vrecpsq_f32(b.val, reciprocal), reciprocal);
return v_float32x4(vmulq_f32(a.val, reciprocal));
}
inline v_float32x4& operator /= (v_float32x4& a, const v_float32x4& b)
{
float32x4_t reciprocal = vrecpeq_f32(b.val);
reciprocal = vmulq_f32(vrecpsq_f32(b.val, reciprocal), reciprocal);
reciprocal = vmulq_f32(vrecpsq_f32(b.val, reciprocal), reciprocal);
a.val = vmulq_f32(a.val, reciprocal);
return a;
}
inline void v_mul_expand(const v_int16x8& a, const v_int16x8& b,
v_int32x4& c, v_int32x4& d)
{
c.val = vmull_s16(vget_low_s16(a.val), vget_low_s16(b.val));
d.val = vmull_s16(vget_high_s16(a.val), vget_high_s16(b.val));
}
inline void v_mul_expand(const v_uint16x8& a, const v_uint16x8& b,
v_uint32x4& c, v_uint32x4& d)
{
c.val = vmull_u16(vget_low_u16(a.val), vget_low_u16(b.val));
d.val = vmull_u16(vget_high_u16(a.val), vget_high_u16(b.val));
}
inline void v_mul_expand(const v_uint32x4& a, const v_uint32x4& b,
v_uint64x2& c, v_uint64x2& d)
{
c.val = vmull_u32(vget_low_u32(a.val), vget_low_u32(b.val));
d.val = vmull_u32(vget_high_u32(a.val), vget_high_u32(b.val));
}
inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b)
{
int32x4_t c = vmull_s16(vget_low_s16(a.val), vget_low_s16(b.val));
int32x4_t d = vmull_s16(vget_high_s16(a.val), vget_high_s16(b.val));
int32x4x2_t cd = vuzpq_s32(c, d);
return v_int32x4(vaddq_s32(cd.val[0], cd.val[1]));
}
#define OPENCV_HAL_IMPL_NEON_LOGIC_OP(_Tpvec, suffix) \
OPENCV_HAL_IMPL_NEON_BIN_OP(&, _Tpvec, vandq_##suffix) \
OPENCV_HAL_IMPL_NEON_BIN_OP(|, _Tpvec, vorrq_##suffix) \
OPENCV_HAL_IMPL_NEON_BIN_OP(^, _Tpvec, veorq_##suffix) \
inline _Tpvec operator ~ (const _Tpvec& a) \
{ \
return _Tpvec(vreinterpretq_##suffix##_u8(vmvnq_u8(vreinterpretq_u8_##suffix(a.val)))); \
}
OPENCV_HAL_IMPL_NEON_LOGIC_OP(v_uint8x16, u8)
OPENCV_HAL_IMPL_NEON_LOGIC_OP(v_int8x16, s8)
OPENCV_HAL_IMPL_NEON_LOGIC_OP(v_uint16x8, u16)
OPENCV_HAL_IMPL_NEON_LOGIC_OP(v_int16x8, s16)
OPENCV_HAL_IMPL_NEON_LOGIC_OP(v_uint32x4, u32)
OPENCV_HAL_IMPL_NEON_LOGIC_OP(v_int32x4, s32)
OPENCV_HAL_IMPL_NEON_LOGIC_OP(v_uint64x2, u64)
OPENCV_HAL_IMPL_NEON_LOGIC_OP(v_int64x2, s64)
#define OPENCV_HAL_IMPL_NEON_FLT_BIT_OP(bin_op, intrin) \
inline v_float32x4 operator bin_op (const v_float32x4& a, const v_float32x4& b) \
{ \
return v_float32x4(vreinterpretq_f32_s32(intrin(vreinterpretq_s32_f32(a.val), vreinterpretq_s32_f32(b.val)))); \
} \
inline v_float32x4& operator bin_op##= (v_float32x4& a, const v_float32x4& b) \
{ \
a.val = vreinterpretq_f32_s32(intrin(vreinterpretq_s32_f32(a.val), vreinterpretq_s32_f32(b.val))); \
return a; \
}
OPENCV_HAL_IMPL_NEON_FLT_BIT_OP(&, vandq_s32)
OPENCV_HAL_IMPL_NEON_FLT_BIT_OP(|, vorrq_s32)
OPENCV_HAL_IMPL_NEON_FLT_BIT_OP(^, veorq_s32)
inline v_float32x4 operator ~ (const v_float32x4& a)
{
return v_float32x4(vreinterpretq_f32_s32(vmvnq_s32(vreinterpretq_s32_f32(a.val))));
}
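// v_sqrt computes x * rsqrt(x) from the vrsqrteq_f32 estimate plus two
// Newton-Raphson refinements; the estimate input is clamped to FLT_MIN so that
// rsqrt(0) stays finite. v_invsqrt uses the same refinement without the clamp.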
inline v_float32x4 v_sqrt(const v_float32x4& x)
{
float32x4_t x1 = vmaxq_f32(x.val, vdupq_n_f32(FLT_MIN));
float32x4_t e = vrsqrteq_f32(x1);
e = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x1, e), e), e);
e = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x1, e), e), e);
return v_float32x4(vmulq_f32(x.val, e));
}
inline v_float32x4 v_invsqrt(const v_float32x4& x)
{
float32x4_t e = vrsqrteq_f32(x.val);
e = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x.val, e), e), e);
e = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x.val, e), e), e);
return v_float32x4(e);
}
inline v_float32x4 v_abs(v_float32x4 x)
{ return v_float32x4(vabsq_f32(x.val)); }
// TODO: exp, log, sin, cos
#define OPENCV_HAL_IMPL_NEON_BIN_FUNC(_Tpvec, func, intrin) \
inline _Tpvec func(const _Tpvec& a, const _Tpvec& b) \
{ \
return _Tpvec(intrin(a.val, b.val)); \
}
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint8x16, v_min, vminq_u8)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint8x16, v_max, vmaxq_u8)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int8x16, v_min, vminq_s8)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int8x16, v_max, vmaxq_s8)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint16x8, v_min, vminq_u16)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint16x8, v_max, vmaxq_u16)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int16x8, v_min, vminq_s16)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int16x8, v_max, vmaxq_s16)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint32x4, v_min, vminq_u32)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint32x4, v_max, vmaxq_u32)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int32x4, v_min, vminq_s32)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int32x4, v_max, vmaxq_s32)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_float32x4, v_min, vminq_f32)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_float32x4, v_max, vmaxq_f32)
#define OPENCV_HAL_IMPL_NEON_INT_CMP_OP(_Tpvec, cast, suffix, not_suffix) \
inline _Tpvec operator == (const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(cast(vceqq_##suffix(a.val, b.val))); } \
inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(cast(vmvnq_##not_suffix(vceqq_##suffix(a.val, b.val)))); } \
inline _Tpvec operator < (const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(cast(vcltq_##suffix(a.val, b.val))); } \
inline _Tpvec operator > (const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(cast(vcgtq_##suffix(a.val, b.val))); } \
inline _Tpvec operator <= (const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(cast(vcleq_##suffix(a.val, b.val))); } \
inline _Tpvec operator >= (const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(cast(vcgeq_##suffix(a.val, b.val))); }
OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_uint8x16, OPENCV_HAL_NOP, u8, u8)
OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_int8x16, vreinterpretq_s8_u8, s8, u8)
OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_uint16x8, OPENCV_HAL_NOP, u16, u16)
OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_int16x8, vreinterpretq_s16_u16, s16, u16)
OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_uint32x4, OPENCV_HAL_NOP, u32, u32)
OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_int32x4, vreinterpretq_s32_u32, s32, u32)
OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_float32x4, vreinterpretq_f32_u32, f32, u32)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint8x16, v_add_wrap, vaddq_u8)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int8x16, v_add_wrap, vaddq_s8)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint16x8, v_add_wrap, vaddq_u16)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int16x8, v_add_wrap, vaddq_s16)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint8x16, v_sub_wrap, vsubq_u8)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int8x16, v_sub_wrap, vsubq_s8)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint16x8, v_sub_wrap, vsubq_u16)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int16x8, v_sub_wrap, vsubq_s16)
// TODO: absdiff for signed integers
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint8x16, v_absdiff, vabdq_u8)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint16x8, v_absdiff, vabdq_u16)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint32x4, v_absdiff, vabdq_u32)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_float32x4, v_absdiff, vabdq_f32)
#define OPENCV_HAL_IMPL_NEON_BIN_FUNC2(_Tpvec, _Tpvec2, cast, func, intrin) \
inline _Tpvec2 func(const _Tpvec& a, const _Tpvec& b) \
{ \
return _Tpvec2(cast(intrin(a.val, b.val))); \
}
OPENCV_HAL_IMPL_NEON_BIN_FUNC2(v_int8x16, v_uint8x16, vreinterpretq_u8_s8, v_absdiff, vabdq_s8)
OPENCV_HAL_IMPL_NEON_BIN_FUNC2(v_int16x8, v_uint16x8, vreinterpretq_u16_s16, v_absdiff, vabdq_s16)
OPENCV_HAL_IMPL_NEON_BIN_FUNC2(v_int32x4, v_uint32x4, vreinterpretq_u32_s32, v_absdiff, vabdq_s32)
inline v_float32x4 v_magnitude(const v_float32x4& a, const v_float32x4& b)
{
v_float32x4 x(vmlaq_f32(vmulq_f32(a.val, a.val), b.val, b.val));
return v_sqrt(x);
}
inline v_float32x4 v_sqr_magnitude(const v_float32x4& a, const v_float32x4& b)
{
return v_float32x4(vmlaq_f32(vmulq_f32(a.val, a.val), b.val, b.val));
}
inline v_float32x4 v_muladd(const v_float32x4& a, const v_float32x4& b, const v_float32x4& c)
{
return v_float32x4(vmlaq_f32(c.val, a.val, b.val));
}
// shifts: the run-time count operators (<<, >>) splat the count and use vshlq
// (a negative count shifts right), trading efficiency for convenience; the
// template v_shl/v_shr/v_rshr forms compile to immediate-count shifts.
#define OPENCV_HAL_IMPL_NEON_SHIFT_OP(_Tpvec, suffix, _Tps, ssuffix) \
inline _Tpvec operator << (const _Tpvec& a, int n) \
{ return _Tpvec(vshlq_##suffix(a.val, vdupq_n_##ssuffix((_Tps)n))); } \
inline _Tpvec operator >> (const _Tpvec& a, int n) \
{ return _Tpvec(vshlq_##suffix(a.val, vdupq_n_##ssuffix((_Tps)-n))); } \
template<int n> inline _Tpvec v_shl(const _Tpvec& a) \
{ return _Tpvec(vshlq_n_##suffix(a.val, n)); } \
template<int n> inline _Tpvec v_shr(const _Tpvec& a) \
{ return _Tpvec(vshrq_n_##suffix(a.val, n)); } \
template<int n> inline _Tpvec v_rshr(const _Tpvec& a) \
{ return _Tpvec(vrshrq_n_##suffix(a.val, n)); }
OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_uint8x16, u8, schar, s8)
OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_int8x16, s8, schar, s8)
OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_uint16x8, u16, short, s16)
OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_int16x8, s16, short, s16)
OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_uint32x4, u32, int, s32)
OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_int32x4, s32, int, s32)
OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_uint64x2, u64, int64, s64)
OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_int64x2, s64, int64, s64)
#define OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(_Tpvec, _Tp, suffix) \
inline _Tpvec v_load(const _Tp* ptr) \
{ return _Tpvec(vld1q_##suffix(ptr)); } \
inline _Tpvec v_load_aligned(const _Tp* ptr) \
{ return _Tpvec(vld1q_##suffix(ptr)); } \
inline _Tpvec v_load_halves(const _Tp* ptr0, const _Tp* ptr1) \
{ return _Tpvec(vcombine_##suffix(vld1_##suffix(ptr0), vld1_##suffix(ptr1))); } \
inline void v_store(_Tp* ptr, const _Tpvec& a) \
{ vst1q_##suffix(ptr, a.val); } \
inline void v_store_aligned(_Tp* ptr, const _Tpvec& a) \
{ vst1q_##suffix(ptr, a.val); } \
inline void v_store_low(_Tp* ptr, const _Tpvec& a) \
{ vst1_##suffix(ptr, vget_low_##suffix(a.val)); } \
inline void v_store_high(_Tp* ptr, const _Tpvec& a) \
{ vst1_##suffix(ptr, vget_high_##suffix(a.val)); }
OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_uint8x16, uchar, u8)
OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_int8x16, schar, s8)
OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_uint16x8, ushort, u16)
OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_int16x8, short, s16)
OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_uint32x4, unsigned, u32)
OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_int32x4, int, s32)
OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_uint64x2, uint64, u64)
OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_int64x2, int64, s64)
OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_float32x4, float, f32)
#define OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(_Tpvec, scalartype, func, scalar_func) \
inline scalartype v_reduce_##func(const _Tpvec& a) \
{ \
scalartype CV_DECL_ALIGNED(16) buf[4]; \
v_store_aligned(buf, a); \
scalartype s0 = scalar_func(buf[0], buf[1]); \
scalartype s1 = scalar_func(buf[2], buf[3]); \
return scalar_func(s0, s1); \
}
OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_uint32x4, unsigned, sum, OPENCV_HAL_ADD)
OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_uint32x4, unsigned, max, std::max)
OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_uint32x4, unsigned, min, std::min)
OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_int32x4, int, sum, OPENCV_HAL_ADD)
OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_int32x4, int, max, std::max)
OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_int32x4, int, min, std::min)
OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_float32x4, float, sum, OPENCV_HAL_ADD)
OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_float32x4, float, max, std::max)
OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_float32x4, float, min, std::min)
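// v_signmask: move each lane's sign bit to bit 0, shift it left by the lane
// index (per-lane shift counts taken from m0), then pairwise-add (vpaddl) each
// 64-bit half into a bit mask and combine the halves.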
inline int v_signmask(const v_uint8x16& a)
{
int8x8_t m0 = vcreate_s8(CV_BIG_UINT(0x0706050403020100));
uint8x16_t v0 = vshlq_u8(vshrq_n_u8(a.val, 7), vcombine_s8(m0, m0));
uint64x2_t v1 = vpaddlq_u32(vpaddlq_u16(vpaddlq_u8(v0)));
return (int)vgetq_lane_u64(v1, 0) + ((int)vgetq_lane_u64(v1, 1) << 8);
}
inline int v_signmask(const v_int8x16& a)
{ return v_signmask(v_reinterpret_as_u8(a)); }
inline int v_signmask(const v_uint16x8& a)
{
int16x4_t m0 = vcreate_s16(CV_BIG_UINT(0x0003000200010000));
uint16x8_t v0 = vshlq_u16(vshrq_n_u16(a.val, 15), vcombine_s16(m0, m0));
uint64x2_t v1 = vpaddlq_u32(vpaddlq_u16(v0));
return (int)vgetq_lane_u64(v1, 0) + ((int)vgetq_lane_u64(v1, 1) << 4);
}
inline int v_signmask(const v_int16x8& a)
{ return v_signmask(v_reinterpret_as_u16(a)); }
inline int v_signmask(const v_uint32x4& a)
{
int32x2_t m0 = vcreate_s32(CV_BIG_UINT(0x0000000100000000));
uint32x4_t v0 = vshlq_u32(vshrq_n_u32(a.val, 31), vcombine_s32(m0, m0));
uint64x2_t v1 = vpaddlq_u32(v0);
return (int)vgetq_lane_u64(v1, 0) + ((int)vgetq_lane_u64(v1, 1) << 2);
}
inline int v_signmask(const v_int32x4& a)
{ return v_signmask(v_reinterpret_as_u32(a)); }
inline int v_signmask(const v_float32x4& a)
{ return v_signmask(v_reinterpret_as_u32(a)); }
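// v_check_all: invert, shift the sign bits down and verify both 64-bit halves
// are zero (i.e. every lane had its MSB set); v_check_any: shift the sign bits
// down and test for any non-zero bit.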
#define OPENCV_HAL_IMPL_NEON_CHECK_ALLANY(_Tpvec, suffix, shift) \
inline bool v_check_all(const v_##_Tpvec& a) \
{ \
_Tpvec##_t v0 = vshrq_n_##suffix(vmvnq_##suffix(a.val), shift); \
uint64x2_t v1 = vreinterpretq_u64_##suffix(v0); \
return (vgetq_lane_u64(v1, 0) | vgetq_lane_u64(v1, 1)) == 0; \
} \
inline bool v_check_any(const v_##_Tpvec& a) \
{ \
_Tpvec##_t v0 = vshrq_n_##suffix(a.val, shift); \
uint64x2_t v1 = vreinterpretq_u64_##suffix(v0); \
return (vgetq_lane_u64(v1, 0) | vgetq_lane_u64(v1, 1)) != 0; \
}
OPENCV_HAL_IMPL_NEON_CHECK_ALLANY(uint8x16, u8, 7)
OPENCV_HAL_IMPL_NEON_CHECK_ALLANY(uint16x8, u16, 15)
OPENCV_HAL_IMPL_NEON_CHECK_ALLANY(uint32x4, u32, 31)
inline bool v_check_all(const v_int8x16& a)
{ return v_check_all(v_reinterpret_as_u8(a)); }
inline bool v_check_all(const v_int16x8& a)
{ return v_check_all(v_reinterpret_as_u16(a)); }
inline bool v_check_all(const v_int32x4& a)
{ return v_check_all(v_reinterpret_as_u32(a)); }
inline bool v_check_all(const v_float32x4& a)
{ return v_check_all(v_reinterpret_as_u32(a)); }
inline bool v_check_any(const v_int8x16& a)
{ return v_check_any(v_reinterpret_as_u8(a)); }
inline bool v_check_any(const v_int16x8& a)
{ return v_check_any(v_reinterpret_as_u16(a)); }
inline bool v_check_any(const v_int32x4& a)
{ return v_check_any(v_reinterpret_as_u32(a)); }
inline bool v_check_any(const v_float32x4& a)
{ return v_check_any(v_reinterpret_as_u32(a)); }
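// per-lane select: vbslq takes bits from a where the mask bit is 1 and from b
// where it is 0.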
#define OPENCV_HAL_IMPL_NEON_SELECT(_Tpvec, suffix, usuffix) \
inline _Tpvec v_select(const _Tpvec& mask, const _Tpvec& a, const _Tpvec& b) \
{ \
return _Tpvec(vbslq_##suffix(vreinterpretq_##usuffix##_##suffix(mask.val), a.val, b.val)); \
}
OPENCV_HAL_IMPL_NEON_SELECT(v_uint8x16, u8, u8)
OPENCV_HAL_IMPL_NEON_SELECT(v_int8x16, s8, u8)
OPENCV_HAL_IMPL_NEON_SELECT(v_uint16x8, u16, u16)
OPENCV_HAL_IMPL_NEON_SELECT(v_int16x8, s16, u16)
OPENCV_HAL_IMPL_NEON_SELECT(v_uint32x4, u32, u32)
OPENCV_HAL_IMPL_NEON_SELECT(v_int32x4, s32, u32)
OPENCV_HAL_IMPL_NEON_SELECT(v_float32x4, f32, u32)
#define OPENCV_HAL_IMPL_NEON_EXPAND(_Tpvec, _Tpwvec, _Tp, suffix) \
inline void v_expand(const _Tpvec& a, _Tpwvec& b0, _Tpwvec& b1) \
{ \
b0.val = vmovl_##suffix(vget_low_##suffix(a.val)); \
b1.val = vmovl_##suffix(vget_high_##suffix(a.val)); \
} \
inline _Tpwvec v_load_expand(const _Tp* ptr) \
{ \
return _Tpwvec(vmovl_##suffix(vld1_##suffix(ptr))); \
}
OPENCV_HAL_IMPL_NEON_EXPAND(v_uint8x16, v_uint16x8, uchar, u8)
OPENCV_HAL_IMPL_NEON_EXPAND(v_int8x16, v_int16x8, schar, s8)
OPENCV_HAL_IMPL_NEON_EXPAND(v_uint16x8, v_uint32x4, ushort, u16)
OPENCV_HAL_IMPL_NEON_EXPAND(v_int16x8, v_int32x4, short, s16)
OPENCV_HAL_IMPL_NEON_EXPAND(v_uint32x4, v_uint64x2, uint, u32)
OPENCV_HAL_IMPL_NEON_EXPAND(v_int32x4, v_int64x2, int, s32)
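// v_load_expand_q: load four 8-bit values and widen them to 32-bit lanes in
// two vmovl steps (zero-extended for uchar, sign-extended for schar).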
inline v_uint32x4 v_load_expand_q(const uchar* ptr)
{
uint8x8_t v0 = vcreate_u8(*(unsigned*)ptr);
uint16x4_t v1 = vget_low_u16(vmovl_u8(v0));
return v_uint32x4(vmovl_u16(v1));
}
inline v_int32x4 v_load_expand_q(const schar* ptr)
{
int8x8_t v0 = vcreate_s8(*(unsigned*)ptr);
int16x4_t v1 = vget_low_s16(vmovl_s8(v0));
return v_int32x4(vmovl_s16(v1));
}
#define OPENCV_HAL_IMPL_NEON_UNPACKS(_Tpvec, suffix) \
inline void v_zip(const v_##_Tpvec& a0, const v_##_Tpvec& a1, v_##_Tpvec& b0, v_##_Tpvec& b1) \
{ \
_Tpvec##x2_t p = vzipq_##suffix(a0.val, a1.val); \
b0.val = p.val[0]; \
b1.val = p.val[1]; \
} \
inline v_##_Tpvec v_combine_low(const v_##_Tpvec& a, const v_##_Tpvec& b) \
{ \
return v_##_Tpvec(vcombine_##suffix(vget_low_##suffix(a.val), vget_low_##suffix(b.val))); \
} \
inline v_##_Tpvec v_combine_high(const v_##_Tpvec& a, const v_##_Tpvec& b) \
{ \
return v_##_Tpvec(vcombine_##suffix(vget_high_##suffix(a.val), vget_high_##suffix(b.val))); \
} \
inline void v_recombine(const v_##_Tpvec& a, const v_##_Tpvec& b, v_##_Tpvec& c, v_##_Tpvec& d) \
{ \
c.val = vcombine_##suffix(vget_low_##suffix(a.val), vget_low_##suffix(b.val)); \
d.val = vcombine_##suffix(vget_high_##suffix(a.val), vget_high_##suffix(b.val)); \
}
OPENCV_HAL_IMPL_NEON_UNPACKS(uint8x16, u8)
OPENCV_HAL_IMPL_NEON_UNPACKS(int8x16, s8)
OPENCV_HAL_IMPL_NEON_UNPACKS(uint16x8, u16)
OPENCV_HAL_IMPL_NEON_UNPACKS(int16x8, s16)
OPENCV_HAL_IMPL_NEON_UNPACKS(uint32x4, u32)
OPENCV_HAL_IMPL_NEON_UNPACKS(int32x4, s32)
OPENCV_HAL_IMPL_NEON_UNPACKS(float32x4, f32)
#define OPENCV_HAL_IMPL_NEON_EXTRACT(_Tpvec, suffix) \
template <int s> \
inline v_##_Tpvec v_extract(const v_##_Tpvec& a, const v_##_Tpvec& b) \
{ \
return v_##_Tpvec(vextq_##suffix(a.val, b.val, s)); \
}
OPENCV_HAL_IMPL_NEON_EXTRACT(uint8x16, u8)
OPENCV_HAL_IMPL_NEON_EXTRACT(int8x16, s8)
OPENCV_HAL_IMPL_NEON_EXTRACT(uint16x8, u16)
OPENCV_HAL_IMPL_NEON_EXTRACT(int16x8, s16)
OPENCV_HAL_IMPL_NEON_EXTRACT(uint32x4, u32)
OPENCV_HAL_IMPL_NEON_EXTRACT(int32x4, s32)
OPENCV_HAL_IMPL_NEON_EXTRACT(uint64x2, u64)
OPENCV_HAL_IMPL_NEON_EXTRACT(int64x2, s64)
OPENCV_HAL_IMPL_NEON_EXTRACT(float32x4, f32)
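// v_round: round to nearest with halfway cases away from zero by adding 0.5
// carrying the sign of the input, then using the truncating vcvtq conversion.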
inline v_int32x4 v_round(const v_float32x4& a)
{
static const int32x4_t v_sign = vdupq_n_s32(1 << 31),
v_05 = vreinterpretq_s32_f32(vdupq_n_f32(0.5f));
int32x4_t v_addition = vorrq_s32(v_05, vandq_s32(v_sign, vreinterpretq_s32_f32(a.val)));
return v_int32x4(vcvtq_s32_f32(vaddq_f32(a.val, vreinterpretq_f32_s32(v_addition))));
}
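// v_floor / v_ceil: truncate with vcvtq, then correct by one lane-wise using
// the comparison mask (all-ones reinterpreted as -1).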
inline v_int32x4 v_floor(const v_float32x4& a)
{
int32x4_t a1 = vcvtq_s32_f32(a.val);
uint32x4_t mask = vcgtq_f32(vcvtq_f32_s32(a1), a.val);
return v_int32x4(vaddq_s32(a1, vreinterpretq_s32_u32(mask)));
}
inline v_int32x4 v_ceil(const v_float32x4& a)
{
int32x4_t a1 = vcvtq_s32_f32(a.val);
uint32x4_t mask = vcgtq_f32(a.val, vcvtq_f32_s32(a1));
return v_int32x4(vsubq_s32(a1, vreinterpretq_s32_u32(mask)));
}
inline v_int32x4 v_trunc(const v_float32x4& a)
{ return v_int32x4(vcvtq_s32_f32(a.val)); }
#define OPENCV_HAL_IMPL_NEON_TRANSPOSE4x4(_Tpvec, suffix) \
inline void v_transpose4x4(const v_##_Tpvec& a0, const v_##_Tpvec& a1, \
const v_##_Tpvec& a2, const v_##_Tpvec& a3, \
v_##_Tpvec& b0, v_##_Tpvec& b1, \
v_##_Tpvec& b2, v_##_Tpvec& b3) \
{ \
/* m00 m01 m02 m03 */ \
/* m10 m11 m12 m13 */ \
/* m20 m21 m22 m23 */ \
/* m30 m31 m32 m33 */ \
_Tpvec##x2_t t0 = vtrnq_##suffix(a0.val, a1.val); \
_Tpvec##x2_t t1 = vtrnq_##suffix(a2.val, a3.val); \
/* m00 m10 m02 m12 */ \
/* m01 m11 m03 m13 */ \
/* m20 m30 m22 m32 */ \
/* m21 m31 m23 m33 */ \
b0.val = vcombine_##suffix(vget_low_##suffix(t0.val[0]), vget_low_##suffix(t1.val[0])); \
b1.val = vcombine_##suffix(vget_low_##suffix(t0.val[1]), vget_low_##suffix(t1.val[1])); \
b2.val = vcombine_##suffix(vget_high_##suffix(t0.val[0]), vget_high_##suffix(t1.val[0])); \
b3.val = vcombine_##suffix(vget_high_##suffix(t0.val[1]), vget_high_##suffix(t1.val[1])); \
}
OPENCV_HAL_IMPL_NEON_TRANSPOSE4x4(uint32x4, u32)
OPENCV_HAL_IMPL_NEON_TRANSPOSE4x4(int32x4, s32)
OPENCV_HAL_IMPL_NEON_TRANSPOSE4x4(float32x4, f32)
#define OPENCV_HAL_IMPL_NEON_INTERLEAVED(_Tpvec, _Tp, suffix) \
inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec& a, v_##_Tpvec& b, v_##_Tpvec& c) \
{ \
_Tpvec##x3_t v = vld3q_##suffix(ptr); \
a.val = v.val[0]; \
b.val = v.val[1]; \
c.val = v.val[2]; \
} \
inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec& a, v_##_Tpvec& b, \
v_##_Tpvec& c, v_##_Tpvec& d) \
{ \
_Tpvec##x4_t v = vld4q_##suffix(ptr); \
a.val = v.val[0]; \
b.val = v.val[1]; \
c.val = v.val[2]; \
d.val = v.val[3]; \
} \
inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec& a, const v_##_Tpvec& b, const v_##_Tpvec& c) \
{ \
_Tpvec##x3_t v; \
v.val[0] = a.val; \
v.val[1] = b.val; \
v.val[2] = c.val; \
vst3q_##suffix(ptr, v); \
} \
inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec& a, const v_##_Tpvec& b, \
const v_##_Tpvec& c, const v_##_Tpvec& d) \
{ \
_Tpvec##x4_t v; \
v.val[0] = a.val; \
v.val[1] = b.val; \
v.val[2] = c.val; \
v.val[3] = d.val; \
vst4q_##suffix(ptr, v); \
}
OPENCV_HAL_IMPL_NEON_INTERLEAVED(uint8x16, uchar, u8)
OPENCV_HAL_IMPL_NEON_INTERLEAVED(int8x16, schar, s8)
OPENCV_HAL_IMPL_NEON_INTERLEAVED(uint16x8, ushort, u16)
OPENCV_HAL_IMPL_NEON_INTERLEAVED(int16x8, short, s16)
OPENCV_HAL_IMPL_NEON_INTERLEAVED(uint32x4, unsigned, u32)
OPENCV_HAL_IMPL_NEON_INTERLEAVED(int32x4, int, s32)
OPENCV_HAL_IMPL_NEON_INTERLEAVED(float32x4, float, f32)
inline v_float32x4 v_cvt_f32(const v_int32x4& a)
{
return v_float32x4(vcvtq_f32_s32(a.val));
}
//! @endcond
}
#endif

File diff suppressed because it is too large

View File

@@ -51,6 +51,7 @@
#include "opencv2/core/cvdef.h"
#include "opencv2/core/base.hpp"
#include "opencv2/core/traits.hpp"
#include "opencv2/core/saturate.hpp"
namespace cv
{

View File

@@ -0,0 +1,128 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2015, Itseez Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#ifndef __OPENCV_HAL_NEON_UTILS_HPP__
#define __OPENCV_HAL_NEON_UTILS_HPP__
#include "opencv2/core/cvdef.h"
//! @addtogroup core_utils_neon
//! @{
#if CV_NEON
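// Rounding float->int conversion helpers: add +/-0.5 (sign copied from the
// input for the signed variants, plain +0.5 for the unsigned ones) before the
// truncating vcvt conversion.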
inline int32x2_t cv_vrnd_s32_f32(float32x2_t v)
{
static int32x2_t v_sign = vdup_n_s32(1 << 31),
v_05 = vreinterpret_s32_f32(vdup_n_f32(0.5f));
int32x2_t v_addition = vorr_s32(v_05, vand_s32(v_sign, vreinterpret_s32_f32(v)));
return vcvt_s32_f32(vadd_f32(v, vreinterpret_f32_s32(v_addition)));
}
inline int32x4_t cv_vrndq_s32_f32(float32x4_t v)
{
static int32x4_t v_sign = vdupq_n_s32(1 << 31),
v_05 = vreinterpretq_s32_f32(vdupq_n_f32(0.5f));
int32x4_t v_addition = vorrq_s32(v_05, vandq_s32(v_sign, vreinterpretq_s32_f32(v)));
return vcvtq_s32_f32(vaddq_f32(v, vreinterpretq_f32_s32(v_addition)));
}
inline uint32x2_t cv_vrnd_u32_f32(float32x2_t v)
{
static float32x2_t v_05 = vdup_n_f32(0.5f);
return vcvt_u32_f32(vadd_f32(v, v_05));
}
inline uint32x4_t cv_vrndq_u32_f32(float32x4_t v)
{
static float32x4_t v_05 = vdupq_n_f32(0.5f);
return vcvtq_u32_f32(vaddq_f32(v, v_05));
}
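// Reciprocal and reciprocal-square-root helpers: hardware estimate
// (vrecpe/vrsqrte) refined by two Newton-Raphson steps; cv_vsqrt*_f32 is then
// obtained as 1 / rsqrt(x).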
inline float32x4_t cv_vrecpq_f32(float32x4_t val)
{
float32x4_t reciprocal = vrecpeq_f32(val);
reciprocal = vmulq_f32(vrecpsq_f32(val, reciprocal), reciprocal);
reciprocal = vmulq_f32(vrecpsq_f32(val, reciprocal), reciprocal);
return reciprocal;
}
inline float32x2_t cv_vrecp_f32(float32x2_t val)
{
float32x2_t reciprocal = vrecpe_f32(val);
reciprocal = vmul_f32(vrecps_f32(val, reciprocal), reciprocal);
reciprocal = vmul_f32(vrecps_f32(val, reciprocal), reciprocal);
return reciprocal;
}
inline float32x4_t cv_vrsqrtq_f32(float32x4_t val)
{
float32x4_t e = vrsqrteq_f32(val);
e = vmulq_f32(vrsqrtsq_f32(vmulq_f32(e, e), val), e);
e = vmulq_f32(vrsqrtsq_f32(vmulq_f32(e, e), val), e);
return e;
}
inline float32x2_t cv_vrsqrt_f32(float32x2_t val)
{
float32x2_t e = vrsqrte_f32(val);
e = vmul_f32(vrsqrts_f32(vmul_f32(e, e), val), e);
e = vmul_f32(vrsqrts_f32(vmul_f32(e, e), val), e);
return e;
}
inline float32x4_t cv_vsqrtq_f32(float32x4_t val)
{
return cv_vrecpq_f32(cv_vrsqrtq_f32(val));
}
inline float32x2_t cv_vsqrt_f32(float32x2_t val)
{
return cv_vrecp_f32(cv_vrsqrt_f32(val));
}
#endif
//! @}
#endif // __OPENCV_HAL_NEON_UTILS_HPP__

View File

@@ -0,0 +1,150 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
// Copyright (C) 2014, Itseez Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#ifndef __OPENCV_CORE_SATURATE_HPP__
#define __OPENCV_CORE_SATURATE_HPP__
#include "opencv2/core/cvdef.h"
#include "opencv2/core/fast_math.hpp"
namespace cv
{
//! @addtogroup core_utils
//! @{
/////////////// saturate_cast (used in image & signal processing) ///////////////////
/** @brief Template function for accurate conversion from one primitive type to another.
The functions saturate_cast resemble the standard C++ cast operations, such as static_cast\<T\>()
and others. They perform an efficient and accurate conversion from one primitive type to another
(see the introduction chapter). saturate in the name means that when the input value v is out of the
range of the target type, the result is not formed just by taking low bits of the input, but instead
the value is clipped. For example:
@code
uchar a = saturate_cast<uchar>(-100); // a = 0 (UCHAR_MIN)
short b = saturate_cast<short>(33333.33333); // b = 32767 (SHRT_MAX)
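uchar c = saturate_cast<uchar>(300); // c = 255 (UCHAR_MAX)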
@endcode
Such clipping is done when the target type is unsigned char, signed char, unsigned short or
signed short. For 32-bit integers, no clipping is done.
When the parameter is a floating-point value and the target type is an integer (8-, 16- or 32-bit),
the floating-point value is first rounded to the nearest integer and then clipped if needed (when
the target type is 8- or 16-bit).
This operation is used in the simplest or most complex image processing functions in OpenCV.
@param v Function parameter.
@sa add, subtract, multiply, divide, Mat::convertTo
*/
template<typename _Tp> static inline _Tp saturate_cast(uchar v) { return _Tp(v); }
/** @overload */
template<typename _Tp> static inline _Tp saturate_cast(schar v) { return _Tp(v); }
/** @overload */
template<typename _Tp> static inline _Tp saturate_cast(ushort v) { return _Tp(v); }
/** @overload */
template<typename _Tp> static inline _Tp saturate_cast(short v) { return _Tp(v); }
/** @overload */
template<typename _Tp> static inline _Tp saturate_cast(unsigned v) { return _Tp(v); }
/** @overload */
template<typename _Tp> static inline _Tp saturate_cast(int v) { return _Tp(v); }
/** @overload */
template<typename _Tp> static inline _Tp saturate_cast(float v) { return _Tp(v); }
/** @overload */
template<typename _Tp> static inline _Tp saturate_cast(double v) { return _Tp(v); }
/** @overload */
template<typename _Tp> static inline _Tp saturate_cast(int64 v) { return _Tp(v); }
/** @overload */
template<typename _Tp> static inline _Tp saturate_cast(uint64 v) { return _Tp(v); }
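// Specializations below actually clamp to the target range; the generic
// templates above are plain casts.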
template<> inline uchar saturate_cast<uchar>(schar v) { return (uchar)std::max((int)v, 0); }
template<> inline uchar saturate_cast<uchar>(ushort v) { return (uchar)std::min((unsigned)v, (unsigned)UCHAR_MAX); }
template<> inline uchar saturate_cast<uchar>(int v) { return (uchar)((unsigned)v <= UCHAR_MAX ? v : v > 0 ? UCHAR_MAX : 0); }
template<> inline uchar saturate_cast<uchar>(short v) { return saturate_cast<uchar>((int)v); }
template<> inline uchar saturate_cast<uchar>(unsigned v) { return (uchar)std::min(v, (unsigned)UCHAR_MAX); }
template<> inline uchar saturate_cast<uchar>(float v) { int iv = cvRound(v); return saturate_cast<uchar>(iv); }
template<> inline uchar saturate_cast<uchar>(double v) { int iv = cvRound(v); return saturate_cast<uchar>(iv); }
template<> inline uchar saturate_cast<uchar>(int64 v) { return (uchar)((uint64)v <= (uint64)UCHAR_MAX ? v : v > 0 ? UCHAR_MAX : 0); }
template<> inline uchar saturate_cast<uchar>(uint64 v) { return (uchar)std::min(v, (uint64)UCHAR_MAX); }
template<> inline schar saturate_cast<schar>(uchar v) { return (schar)std::min((int)v, SCHAR_MAX); }
template<> inline schar saturate_cast<schar>(ushort v) { return (schar)std::min((unsigned)v, (unsigned)SCHAR_MAX); }
template<> inline schar saturate_cast<schar>(int v) { return (schar)((unsigned)(v-SCHAR_MIN) <= (unsigned)UCHAR_MAX ? v : v > 0 ? SCHAR_MAX : SCHAR_MIN); }
template<> inline schar saturate_cast<schar>(short v) { return saturate_cast<schar>((int)v); }
template<> inline schar saturate_cast<schar>(unsigned v) { return (schar)std::min(v, (unsigned)SCHAR_MAX); }
template<> inline schar saturate_cast<schar>(float v) { int iv = cvRound(v); return saturate_cast<schar>(iv); }
template<> inline schar saturate_cast<schar>(double v) { int iv = cvRound(v); return saturate_cast<schar>(iv); }
template<> inline schar saturate_cast<schar>(int64 v) { return (schar)((uint64)((int64)v-SCHAR_MIN) <= (uint64)UCHAR_MAX ? v : v > 0 ? SCHAR_MAX : SCHAR_MIN); }
template<> inline schar saturate_cast<schar>(uint64 v) { return (schar)std::min(v, (uint64)SCHAR_MAX); }
template<> inline ushort saturate_cast<ushort>(schar v) { return (ushort)std::max((int)v, 0); }
template<> inline ushort saturate_cast<ushort>(short v) { return (ushort)std::max((int)v, 0); }
template<> inline ushort saturate_cast<ushort>(int v) { return (ushort)((unsigned)v <= (unsigned)USHRT_MAX ? v : v > 0 ? USHRT_MAX : 0); }
template<> inline ushort saturate_cast<ushort>(unsigned v) { return (ushort)std::min(v, (unsigned)USHRT_MAX); }
template<> inline ushort saturate_cast<ushort>(float v) { int iv = cvRound(v); return saturate_cast<ushort>(iv); }
template<> inline ushort saturate_cast<ushort>(double v) { int iv = cvRound(v); return saturate_cast<ushort>(iv); }
template<> inline ushort saturate_cast<ushort>(int64 v) { return (ushort)((uint64)v <= (uint64)USHRT_MAX ? v : v > 0 ? USHRT_MAX : 0); }
template<> inline ushort saturate_cast<ushort>(uint64 v) { return (ushort)std::min(v, (uint64)USHRT_MAX); }
template<> inline short saturate_cast<short>(ushort v) { return (short)std::min((int)v, SHRT_MAX); }
template<> inline short saturate_cast<short>(int v) { return (short)((unsigned)(v - SHRT_MIN) <= (unsigned)USHRT_MAX ? v : v > 0 ? SHRT_MAX : SHRT_MIN); }
template<> inline short saturate_cast<short>(unsigned v) { return (short)std::min(v, (unsigned)SHRT_MAX); }
template<> inline short saturate_cast<short>(float v) { int iv = cvRound(v); return saturate_cast<short>(iv); }
template<> inline short saturate_cast<short>(double v) { int iv = cvRound(v); return saturate_cast<short>(iv); }
template<> inline short saturate_cast<short>(int64 v) { return (short)((uint64)((int64)v - SHRT_MIN) <= (uint64)USHRT_MAX ? v : v > 0 ? SHRT_MAX : SHRT_MIN); }
template<> inline short saturate_cast<short>(uint64 v) { return (short)std::min(v, (uint64)SHRT_MAX); }
template<> inline int saturate_cast<int>(float v) { return cvRound(v); }
template<> inline int saturate_cast<int>(double v) { return cvRound(v); }
// we intentionally do not clip negative numbers, to make -1 become 0xffffffff etc.
template<> inline unsigned saturate_cast<unsigned>(float v) { return cvRound(v); }
template<> inline unsigned saturate_cast<unsigned>(double v) { return cvRound(v); }
//! @}
} // cv
#endif // __OPENCV_CORE_SATURATE_HPP__

View File

@@ -0,0 +1,652 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2015, Itseez Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#ifndef __OPENCV_CORE_SSE_UTILS_HPP__
#define __OPENCV_CORE_SSE_UTILS_HPP__
#ifndef __cplusplus
# error sse_utils.hpp header must be compiled as C++
#endif
#include "opencv2/core/cvdef.h"
//! @addtogroup core_utils_sse
//! @{
#if CV_SSE2
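// The _mm_deinterleave_* helpers below split interleaved 2/3/4-channel data
// into planar form using repeated unpacklo/unpackhi passes; the matching
// _mm_interleave_* helpers invert the transform with mask/shift + packus passes.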
inline void _mm_deinterleave_epi8(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0, __m128i & v_g1)
{
__m128i layer1_chunk0 = _mm_unpacklo_epi8(v_r0, v_g0);
__m128i layer1_chunk1 = _mm_unpackhi_epi8(v_r0, v_g0);
__m128i layer1_chunk2 = _mm_unpacklo_epi8(v_r1, v_g1);
__m128i layer1_chunk3 = _mm_unpackhi_epi8(v_r1, v_g1);
__m128i layer2_chunk0 = _mm_unpacklo_epi8(layer1_chunk0, layer1_chunk2);
__m128i layer2_chunk1 = _mm_unpackhi_epi8(layer1_chunk0, layer1_chunk2);
__m128i layer2_chunk2 = _mm_unpacklo_epi8(layer1_chunk1, layer1_chunk3);
__m128i layer2_chunk3 = _mm_unpackhi_epi8(layer1_chunk1, layer1_chunk3);
__m128i layer3_chunk0 = _mm_unpacklo_epi8(layer2_chunk0, layer2_chunk2);
__m128i layer3_chunk1 = _mm_unpackhi_epi8(layer2_chunk0, layer2_chunk2);
__m128i layer3_chunk2 = _mm_unpacklo_epi8(layer2_chunk1, layer2_chunk3);
__m128i layer3_chunk3 = _mm_unpackhi_epi8(layer2_chunk1, layer2_chunk3);
__m128i layer4_chunk0 = _mm_unpacklo_epi8(layer3_chunk0, layer3_chunk2);
__m128i layer4_chunk1 = _mm_unpackhi_epi8(layer3_chunk0, layer3_chunk2);
__m128i layer4_chunk2 = _mm_unpacklo_epi8(layer3_chunk1, layer3_chunk3);
__m128i layer4_chunk3 = _mm_unpackhi_epi8(layer3_chunk1, layer3_chunk3);
v_r0 = _mm_unpacklo_epi8(layer4_chunk0, layer4_chunk2);
v_r1 = _mm_unpackhi_epi8(layer4_chunk0, layer4_chunk2);
v_g0 = _mm_unpacklo_epi8(layer4_chunk1, layer4_chunk3);
v_g1 = _mm_unpackhi_epi8(layer4_chunk1, layer4_chunk3);
}
inline void _mm_deinterleave_epi8(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0,
__m128i & v_g1, __m128i & v_b0, __m128i & v_b1)
{
__m128i layer1_chunk0 = _mm_unpacklo_epi8(v_r0, v_g1);
__m128i layer1_chunk1 = _mm_unpackhi_epi8(v_r0, v_g1);
__m128i layer1_chunk2 = _mm_unpacklo_epi8(v_r1, v_b0);
__m128i layer1_chunk3 = _mm_unpackhi_epi8(v_r1, v_b0);
__m128i layer1_chunk4 = _mm_unpacklo_epi8(v_g0, v_b1);
__m128i layer1_chunk5 = _mm_unpackhi_epi8(v_g0, v_b1);
__m128i layer2_chunk0 = _mm_unpacklo_epi8(layer1_chunk0, layer1_chunk3);
__m128i layer2_chunk1 = _mm_unpackhi_epi8(layer1_chunk0, layer1_chunk3);
__m128i layer2_chunk2 = _mm_unpacklo_epi8(layer1_chunk1, layer1_chunk4);
__m128i layer2_chunk3 = _mm_unpackhi_epi8(layer1_chunk1, layer1_chunk4);
__m128i layer2_chunk4 = _mm_unpacklo_epi8(layer1_chunk2, layer1_chunk5);
__m128i layer2_chunk5 = _mm_unpackhi_epi8(layer1_chunk2, layer1_chunk5);
__m128i layer3_chunk0 = _mm_unpacklo_epi8(layer2_chunk0, layer2_chunk3);
__m128i layer3_chunk1 = _mm_unpackhi_epi8(layer2_chunk0, layer2_chunk3);
__m128i layer3_chunk2 = _mm_unpacklo_epi8(layer2_chunk1, layer2_chunk4);
__m128i layer3_chunk3 = _mm_unpackhi_epi8(layer2_chunk1, layer2_chunk4);
__m128i layer3_chunk4 = _mm_unpacklo_epi8(layer2_chunk2, layer2_chunk5);
__m128i layer3_chunk5 = _mm_unpackhi_epi8(layer2_chunk2, layer2_chunk5);
__m128i layer4_chunk0 = _mm_unpacklo_epi8(layer3_chunk0, layer3_chunk3);
__m128i layer4_chunk1 = _mm_unpackhi_epi8(layer3_chunk0, layer3_chunk3);
__m128i layer4_chunk2 = _mm_unpacklo_epi8(layer3_chunk1, layer3_chunk4);
__m128i layer4_chunk3 = _mm_unpackhi_epi8(layer3_chunk1, layer3_chunk4);
__m128i layer4_chunk4 = _mm_unpacklo_epi8(layer3_chunk2, layer3_chunk5);
__m128i layer4_chunk5 = _mm_unpackhi_epi8(layer3_chunk2, layer3_chunk5);
v_r0 = _mm_unpacklo_epi8(layer4_chunk0, layer4_chunk3);
v_r1 = _mm_unpackhi_epi8(layer4_chunk0, layer4_chunk3);
v_g0 = _mm_unpacklo_epi8(layer4_chunk1, layer4_chunk4);
v_g1 = _mm_unpackhi_epi8(layer4_chunk1, layer4_chunk4);
v_b0 = _mm_unpacklo_epi8(layer4_chunk2, layer4_chunk5);
v_b1 = _mm_unpackhi_epi8(layer4_chunk2, layer4_chunk5);
}
inline void _mm_deinterleave_epi8(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0, __m128i & v_g1,
__m128i & v_b0, __m128i & v_b1, __m128i & v_a0, __m128i & v_a1)
{
__m128i layer1_chunk0 = _mm_unpacklo_epi8(v_r0, v_b0);
__m128i layer1_chunk1 = _mm_unpackhi_epi8(v_r0, v_b0);
__m128i layer1_chunk2 = _mm_unpacklo_epi8(v_r1, v_b1);
__m128i layer1_chunk3 = _mm_unpackhi_epi8(v_r1, v_b1);
__m128i layer1_chunk4 = _mm_unpacklo_epi8(v_g0, v_a0);
__m128i layer1_chunk5 = _mm_unpackhi_epi8(v_g0, v_a0);
__m128i layer1_chunk6 = _mm_unpacklo_epi8(v_g1, v_a1);
__m128i layer1_chunk7 = _mm_unpackhi_epi8(v_g1, v_a1);
__m128i layer2_chunk0 = _mm_unpacklo_epi8(layer1_chunk0, layer1_chunk4);
__m128i layer2_chunk1 = _mm_unpackhi_epi8(layer1_chunk0, layer1_chunk4);
__m128i layer2_chunk2 = _mm_unpacklo_epi8(layer1_chunk1, layer1_chunk5);
__m128i layer2_chunk3 = _mm_unpackhi_epi8(layer1_chunk1, layer1_chunk5);
__m128i layer2_chunk4 = _mm_unpacklo_epi8(layer1_chunk2, layer1_chunk6);
__m128i layer2_chunk5 = _mm_unpackhi_epi8(layer1_chunk2, layer1_chunk6);
__m128i layer2_chunk6 = _mm_unpacklo_epi8(layer1_chunk3, layer1_chunk7);
__m128i layer2_chunk7 = _mm_unpackhi_epi8(layer1_chunk3, layer1_chunk7);
__m128i layer3_chunk0 = _mm_unpacklo_epi8(layer2_chunk0, layer2_chunk4);
__m128i layer3_chunk1 = _mm_unpackhi_epi8(layer2_chunk0, layer2_chunk4);
__m128i layer3_chunk2 = _mm_unpacklo_epi8(layer2_chunk1, layer2_chunk5);
__m128i layer3_chunk3 = _mm_unpackhi_epi8(layer2_chunk1, layer2_chunk5);
__m128i layer3_chunk4 = _mm_unpacklo_epi8(layer2_chunk2, layer2_chunk6);
__m128i layer3_chunk5 = _mm_unpackhi_epi8(layer2_chunk2, layer2_chunk6);
__m128i layer3_chunk6 = _mm_unpacklo_epi8(layer2_chunk3, layer2_chunk7);
__m128i layer3_chunk7 = _mm_unpackhi_epi8(layer2_chunk3, layer2_chunk7);
__m128i layer4_chunk0 = _mm_unpacklo_epi8(layer3_chunk0, layer3_chunk4);
__m128i layer4_chunk1 = _mm_unpackhi_epi8(layer3_chunk0, layer3_chunk4);
__m128i layer4_chunk2 = _mm_unpacklo_epi8(layer3_chunk1, layer3_chunk5);
__m128i layer4_chunk3 = _mm_unpackhi_epi8(layer3_chunk1, layer3_chunk5);
__m128i layer4_chunk4 = _mm_unpacklo_epi8(layer3_chunk2, layer3_chunk6);
__m128i layer4_chunk5 = _mm_unpackhi_epi8(layer3_chunk2, layer3_chunk6);
__m128i layer4_chunk6 = _mm_unpacklo_epi8(layer3_chunk3, layer3_chunk7);
__m128i layer4_chunk7 = _mm_unpackhi_epi8(layer3_chunk3, layer3_chunk7);
v_r0 = _mm_unpacklo_epi8(layer4_chunk0, layer4_chunk4);
v_r1 = _mm_unpackhi_epi8(layer4_chunk0, layer4_chunk4);
v_g0 = _mm_unpacklo_epi8(layer4_chunk1, layer4_chunk5);
v_g1 = _mm_unpackhi_epi8(layer4_chunk1, layer4_chunk5);
v_b0 = _mm_unpacklo_epi8(layer4_chunk2, layer4_chunk6);
v_b1 = _mm_unpackhi_epi8(layer4_chunk2, layer4_chunk6);
v_a0 = _mm_unpacklo_epi8(layer4_chunk3, layer4_chunk7);
v_a1 = _mm_unpackhi_epi8(layer4_chunk3, layer4_chunk7);
}
inline void _mm_interleave_epi8(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0, __m128i & v_g1)
{
__m128i v_mask = _mm_set1_epi16(0x00ff);
__m128i layer4_chunk0 = _mm_packus_epi16(_mm_and_si128(v_r0, v_mask), _mm_and_si128(v_r1, v_mask));
__m128i layer4_chunk2 = _mm_packus_epi16(_mm_srli_epi16(v_r0, 8), _mm_srli_epi16(v_r1, 8));
__m128i layer4_chunk1 = _mm_packus_epi16(_mm_and_si128(v_g0, v_mask), _mm_and_si128(v_g1, v_mask));
__m128i layer4_chunk3 = _mm_packus_epi16(_mm_srli_epi16(v_g0, 8), _mm_srli_epi16(v_g1, 8));
__m128i layer3_chunk0 = _mm_packus_epi16(_mm_and_si128(layer4_chunk0, v_mask), _mm_and_si128(layer4_chunk1, v_mask));
__m128i layer3_chunk2 = _mm_packus_epi16(_mm_srli_epi16(layer4_chunk0, 8), _mm_srli_epi16(layer4_chunk1, 8));
__m128i layer3_chunk1 = _mm_packus_epi16(_mm_and_si128(layer4_chunk2, v_mask), _mm_and_si128(layer4_chunk3, v_mask));
__m128i layer3_chunk3 = _mm_packus_epi16(_mm_srli_epi16(layer4_chunk2, 8), _mm_srli_epi16(layer4_chunk3, 8));
__m128i layer2_chunk0 = _mm_packus_epi16(_mm_and_si128(layer3_chunk0, v_mask), _mm_and_si128(layer3_chunk1, v_mask));
__m128i layer2_chunk2 = _mm_packus_epi16(_mm_srli_epi16(layer3_chunk0, 8), _mm_srli_epi16(layer3_chunk1, 8));
__m128i layer2_chunk1 = _mm_packus_epi16(_mm_and_si128(layer3_chunk2, v_mask), _mm_and_si128(layer3_chunk3, v_mask));
__m128i layer2_chunk3 = _mm_packus_epi16(_mm_srli_epi16(layer3_chunk2, 8), _mm_srli_epi16(layer3_chunk3, 8));
__m128i layer1_chunk0 = _mm_packus_epi16(_mm_and_si128(layer2_chunk0, v_mask), _mm_and_si128(layer2_chunk1, v_mask));
__m128i layer1_chunk2 = _mm_packus_epi16(_mm_srli_epi16(layer2_chunk0, 8), _mm_srli_epi16(layer2_chunk1, 8));
__m128i layer1_chunk1 = _mm_packus_epi16(_mm_and_si128(layer2_chunk2, v_mask), _mm_and_si128(layer2_chunk3, v_mask));
__m128i layer1_chunk3 = _mm_packus_epi16(_mm_srli_epi16(layer2_chunk2, 8), _mm_srli_epi16(layer2_chunk3, 8));
v_r0 = _mm_packus_epi16(_mm_and_si128(layer1_chunk0, v_mask), _mm_and_si128(layer1_chunk1, v_mask));
v_g0 = _mm_packus_epi16(_mm_srli_epi16(layer1_chunk0, 8), _mm_srli_epi16(layer1_chunk1, 8));
v_r1 = _mm_packus_epi16(_mm_and_si128(layer1_chunk2, v_mask), _mm_and_si128(layer1_chunk3, v_mask));
v_g1 = _mm_packus_epi16(_mm_srli_epi16(layer1_chunk2, 8), _mm_srli_epi16(layer1_chunk3, 8));
}
inline void _mm_interleave_epi8(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0,
__m128i & v_g1, __m128i & v_b0, __m128i & v_b1)
{
__m128i v_mask = _mm_set1_epi16(0x00ff);
__m128i layer4_chunk0 = _mm_packus_epi16(_mm_and_si128(v_r0, v_mask), _mm_and_si128(v_r1, v_mask));
__m128i layer4_chunk3 = _mm_packus_epi16(_mm_srli_epi16(v_r0, 8), _mm_srli_epi16(v_r1, 8));
__m128i layer4_chunk1 = _mm_packus_epi16(_mm_and_si128(v_g0, v_mask), _mm_and_si128(v_g1, v_mask));
__m128i layer4_chunk4 = _mm_packus_epi16(_mm_srli_epi16(v_g0, 8), _mm_srli_epi16(v_g1, 8));
__m128i layer4_chunk2 = _mm_packus_epi16(_mm_and_si128(v_b0, v_mask), _mm_and_si128(v_b1, v_mask));
__m128i layer4_chunk5 = _mm_packus_epi16(_mm_srli_epi16(v_b0, 8), _mm_srli_epi16(v_b1, 8));
__m128i layer3_chunk0 = _mm_packus_epi16(_mm_and_si128(layer4_chunk0, v_mask), _mm_and_si128(layer4_chunk1, v_mask));
__m128i layer3_chunk3 = _mm_packus_epi16(_mm_srli_epi16(layer4_chunk0, 8), _mm_srli_epi16(layer4_chunk1, 8));
__m128i layer3_chunk1 = _mm_packus_epi16(_mm_and_si128(layer4_chunk2, v_mask), _mm_and_si128(layer4_chunk3, v_mask));
__m128i layer3_chunk4 = _mm_packus_epi16(_mm_srli_epi16(layer4_chunk2, 8), _mm_srli_epi16(layer4_chunk3, 8));
__m128i layer3_chunk2 = _mm_packus_epi16(_mm_and_si128(layer4_chunk4, v_mask), _mm_and_si128(layer4_chunk5, v_mask));
__m128i layer3_chunk5 = _mm_packus_epi16(_mm_srli_epi16(layer4_chunk4, 8), _mm_srli_epi16(layer4_chunk5, 8));
__m128i layer2_chunk0 = _mm_packus_epi16(_mm_and_si128(layer3_chunk0, v_mask), _mm_and_si128(layer3_chunk1, v_mask));
__m128i layer2_chunk3 = _mm_packus_epi16(_mm_srli_epi16(layer3_chunk0, 8), _mm_srli_epi16(layer3_chunk1, 8));
__m128i layer2_chunk1 = _mm_packus_epi16(_mm_and_si128(layer3_chunk2, v_mask), _mm_and_si128(layer3_chunk3, v_mask));
__m128i layer2_chunk4 = _mm_packus_epi16(_mm_srli_epi16(layer3_chunk2, 8), _mm_srli_epi16(layer3_chunk3, 8));
__m128i layer2_chunk2 = _mm_packus_epi16(_mm_and_si128(layer3_chunk4, v_mask), _mm_and_si128(layer3_chunk5, v_mask));
__m128i layer2_chunk5 = _mm_packus_epi16(_mm_srli_epi16(layer3_chunk4, 8), _mm_srli_epi16(layer3_chunk5, 8));
__m128i layer1_chunk0 = _mm_packus_epi16(_mm_and_si128(layer2_chunk0, v_mask), _mm_and_si128(layer2_chunk1, v_mask));
__m128i layer1_chunk3 = _mm_packus_epi16(_mm_srli_epi16(layer2_chunk0, 8), _mm_srli_epi16(layer2_chunk1, 8));
__m128i layer1_chunk1 = _mm_packus_epi16(_mm_and_si128(layer2_chunk2, v_mask), _mm_and_si128(layer2_chunk3, v_mask));
__m128i layer1_chunk4 = _mm_packus_epi16(_mm_srli_epi16(layer2_chunk2, 8), _mm_srli_epi16(layer2_chunk3, 8));
__m128i layer1_chunk2 = _mm_packus_epi16(_mm_and_si128(layer2_chunk4, v_mask), _mm_and_si128(layer2_chunk5, v_mask));
__m128i layer1_chunk5 = _mm_packus_epi16(_mm_srli_epi16(layer2_chunk4, 8), _mm_srli_epi16(layer2_chunk5, 8));
v_r0 = _mm_packus_epi16(_mm_and_si128(layer1_chunk0, v_mask), _mm_and_si128(layer1_chunk1, v_mask));
v_g1 = _mm_packus_epi16(_mm_srli_epi16(layer1_chunk0, 8), _mm_srli_epi16(layer1_chunk1, 8));
v_r1 = _mm_packus_epi16(_mm_and_si128(layer1_chunk2, v_mask), _mm_and_si128(layer1_chunk3, v_mask));
v_b0 = _mm_packus_epi16(_mm_srli_epi16(layer1_chunk2, 8), _mm_srli_epi16(layer1_chunk3, 8));
v_g0 = _mm_packus_epi16(_mm_and_si128(layer1_chunk4, v_mask), _mm_and_si128(layer1_chunk5, v_mask));
v_b1 = _mm_packus_epi16(_mm_srli_epi16(layer1_chunk4, 8), _mm_srli_epi16(layer1_chunk5, 8));
}
inline void _mm_interleave_epi8(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0, __m128i & v_g1,
__m128i & v_b0, __m128i & v_b1, __m128i & v_a0, __m128i & v_a1)
{
__m128i v_mask = _mm_set1_epi16(0x00ff);
__m128i layer4_chunk0 = _mm_packus_epi16(_mm_and_si128(v_r0, v_mask), _mm_and_si128(v_r1, v_mask));
__m128i layer4_chunk4 = _mm_packus_epi16(_mm_srli_epi16(v_r0, 8), _mm_srli_epi16(v_r1, 8));
__m128i layer4_chunk1 = _mm_packus_epi16(_mm_and_si128(v_g0, v_mask), _mm_and_si128(v_g1, v_mask));
__m128i layer4_chunk5 = _mm_packus_epi16(_mm_srli_epi16(v_g0, 8), _mm_srli_epi16(v_g1, 8));
__m128i layer4_chunk2 = _mm_packus_epi16(_mm_and_si128(v_b0, v_mask), _mm_and_si128(v_b1, v_mask));
__m128i layer4_chunk6 = _mm_packus_epi16(_mm_srli_epi16(v_b0, 8), _mm_srli_epi16(v_b1, 8));
__m128i layer4_chunk3 = _mm_packus_epi16(_mm_and_si128(v_a0, v_mask), _mm_and_si128(v_a1, v_mask));
__m128i layer4_chunk7 = _mm_packus_epi16(_mm_srli_epi16(v_a0, 8), _mm_srli_epi16(v_a1, 8));
__m128i layer3_chunk0 = _mm_packus_epi16(_mm_and_si128(layer4_chunk0, v_mask), _mm_and_si128(layer4_chunk1, v_mask));
__m128i layer3_chunk4 = _mm_packus_epi16(_mm_srli_epi16(layer4_chunk0, 8), _mm_srli_epi16(layer4_chunk1, 8));
__m128i layer3_chunk1 = _mm_packus_epi16(_mm_and_si128(layer4_chunk2, v_mask), _mm_and_si128(layer4_chunk3, v_mask));
__m128i layer3_chunk5 = _mm_packus_epi16(_mm_srli_epi16(layer4_chunk2, 8), _mm_srli_epi16(layer4_chunk3, 8));
__m128i layer3_chunk2 = _mm_packus_epi16(_mm_and_si128(layer4_chunk4, v_mask), _mm_and_si128(layer4_chunk5, v_mask));
__m128i layer3_chunk6 = _mm_packus_epi16(_mm_srli_epi16(layer4_chunk4, 8), _mm_srli_epi16(layer4_chunk5, 8));
__m128i layer3_chunk3 = _mm_packus_epi16(_mm_and_si128(layer4_chunk6, v_mask), _mm_and_si128(layer4_chunk7, v_mask));
__m128i layer3_chunk7 = _mm_packus_epi16(_mm_srli_epi16(layer4_chunk6, 8), _mm_srli_epi16(layer4_chunk7, 8));
__m128i layer2_chunk0 = _mm_packus_epi16(_mm_and_si128(layer3_chunk0, v_mask), _mm_and_si128(layer3_chunk1, v_mask));
__m128i layer2_chunk4 = _mm_packus_epi16(_mm_srli_epi16(layer3_chunk0, 8), _mm_srli_epi16(layer3_chunk1, 8));
__m128i layer2_chunk1 = _mm_packus_epi16(_mm_and_si128(layer3_chunk2, v_mask), _mm_and_si128(layer3_chunk3, v_mask));
__m128i layer2_chunk5 = _mm_packus_epi16(_mm_srli_epi16(layer3_chunk2, 8), _mm_srli_epi16(layer3_chunk3, 8));
__m128i layer2_chunk2 = _mm_packus_epi16(_mm_and_si128(layer3_chunk4, v_mask), _mm_and_si128(layer3_chunk5, v_mask));
__m128i layer2_chunk6 = _mm_packus_epi16(_mm_srli_epi16(layer3_chunk4, 8), _mm_srli_epi16(layer3_chunk5, 8));
__m128i layer2_chunk3 = _mm_packus_epi16(_mm_and_si128(layer3_chunk6, v_mask), _mm_and_si128(layer3_chunk7, v_mask));
__m128i layer2_chunk7 = _mm_packus_epi16(_mm_srli_epi16(layer3_chunk6, 8), _mm_srli_epi16(layer3_chunk7, 8));
__m128i layer1_chunk0 = _mm_packus_epi16(_mm_and_si128(layer2_chunk0, v_mask), _mm_and_si128(layer2_chunk1, v_mask));
__m128i layer1_chunk4 = _mm_packus_epi16(_mm_srli_epi16(layer2_chunk0, 8), _mm_srli_epi16(layer2_chunk1, 8));
__m128i layer1_chunk1 = _mm_packus_epi16(_mm_and_si128(layer2_chunk2, v_mask), _mm_and_si128(layer2_chunk3, v_mask));
__m128i layer1_chunk5 = _mm_packus_epi16(_mm_srli_epi16(layer2_chunk2, 8), _mm_srli_epi16(layer2_chunk3, 8));
__m128i layer1_chunk2 = _mm_packus_epi16(_mm_and_si128(layer2_chunk4, v_mask), _mm_and_si128(layer2_chunk5, v_mask));
__m128i layer1_chunk6 = _mm_packus_epi16(_mm_srli_epi16(layer2_chunk4, 8), _mm_srli_epi16(layer2_chunk5, 8));
__m128i layer1_chunk3 = _mm_packus_epi16(_mm_and_si128(layer2_chunk6, v_mask), _mm_and_si128(layer2_chunk7, v_mask));
__m128i layer1_chunk7 = _mm_packus_epi16(_mm_srli_epi16(layer2_chunk6, 8), _mm_srli_epi16(layer2_chunk7, 8));
v_r0 = _mm_packus_epi16(_mm_and_si128(layer1_chunk0, v_mask), _mm_and_si128(layer1_chunk1, v_mask));
v_b0 = _mm_packus_epi16(_mm_srli_epi16(layer1_chunk0, 8), _mm_srli_epi16(layer1_chunk1, 8));
v_r1 = _mm_packus_epi16(_mm_and_si128(layer1_chunk2, v_mask), _mm_and_si128(layer1_chunk3, v_mask));
v_b1 = _mm_packus_epi16(_mm_srli_epi16(layer1_chunk2, 8), _mm_srli_epi16(layer1_chunk3, 8));
v_g0 = _mm_packus_epi16(_mm_and_si128(layer1_chunk4, v_mask), _mm_and_si128(layer1_chunk5, v_mask));
v_a0 = _mm_packus_epi16(_mm_srli_epi16(layer1_chunk4, 8), _mm_srli_epi16(layer1_chunk5, 8));
v_g1 = _mm_packus_epi16(_mm_and_si128(layer1_chunk6, v_mask), _mm_and_si128(layer1_chunk7, v_mask));
v_a1 = _mm_packus_epi16(_mm_srli_epi16(layer1_chunk6, 8), _mm_srli_epi16(layer1_chunk7, 8));
}
inline void _mm_deinterleave_epi16(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0, __m128i & v_g1)
{
__m128i layer1_chunk0 = _mm_unpacklo_epi16(v_r0, v_g0);
__m128i layer1_chunk1 = _mm_unpackhi_epi16(v_r0, v_g0);
__m128i layer1_chunk2 = _mm_unpacklo_epi16(v_r1, v_g1);
__m128i layer1_chunk3 = _mm_unpackhi_epi16(v_r1, v_g1);
__m128i layer2_chunk0 = _mm_unpacklo_epi16(layer1_chunk0, layer1_chunk2);
__m128i layer2_chunk1 = _mm_unpackhi_epi16(layer1_chunk0, layer1_chunk2);
__m128i layer2_chunk2 = _mm_unpacklo_epi16(layer1_chunk1, layer1_chunk3);
__m128i layer2_chunk3 = _mm_unpackhi_epi16(layer1_chunk1, layer1_chunk3);
__m128i layer3_chunk0 = _mm_unpacklo_epi16(layer2_chunk0, layer2_chunk2);
__m128i layer3_chunk1 = _mm_unpackhi_epi16(layer2_chunk0, layer2_chunk2);
__m128i layer3_chunk2 = _mm_unpacklo_epi16(layer2_chunk1, layer2_chunk3);
__m128i layer3_chunk3 = _mm_unpackhi_epi16(layer2_chunk1, layer2_chunk3);
v_r0 = _mm_unpacklo_epi16(layer3_chunk0, layer3_chunk2);
v_r1 = _mm_unpackhi_epi16(layer3_chunk0, layer3_chunk2);
v_g0 = _mm_unpacklo_epi16(layer3_chunk1, layer3_chunk3);
v_g1 = _mm_unpackhi_epi16(layer3_chunk1, layer3_chunk3);
}
inline void _mm_deinterleave_epi16(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0,
__m128i & v_g1, __m128i & v_b0, __m128i & v_b1)
{
__m128i layer1_chunk0 = _mm_unpacklo_epi16(v_r0, v_g1);
__m128i layer1_chunk1 = _mm_unpackhi_epi16(v_r0, v_g1);
__m128i layer1_chunk2 = _mm_unpacklo_epi16(v_r1, v_b0);
__m128i layer1_chunk3 = _mm_unpackhi_epi16(v_r1, v_b0);
__m128i layer1_chunk4 = _mm_unpacklo_epi16(v_g0, v_b1);
__m128i layer1_chunk5 = _mm_unpackhi_epi16(v_g0, v_b1);
__m128i layer2_chunk0 = _mm_unpacklo_epi16(layer1_chunk0, layer1_chunk3);
__m128i layer2_chunk1 = _mm_unpackhi_epi16(layer1_chunk0, layer1_chunk3);
__m128i layer2_chunk2 = _mm_unpacklo_epi16(layer1_chunk1, layer1_chunk4);
__m128i layer2_chunk3 = _mm_unpackhi_epi16(layer1_chunk1, layer1_chunk4);
__m128i layer2_chunk4 = _mm_unpacklo_epi16(layer1_chunk2, layer1_chunk5);
__m128i layer2_chunk5 = _mm_unpackhi_epi16(layer1_chunk2, layer1_chunk5);
__m128i layer3_chunk0 = _mm_unpacklo_epi16(layer2_chunk0, layer2_chunk3);
__m128i layer3_chunk1 = _mm_unpackhi_epi16(layer2_chunk0, layer2_chunk3);
__m128i layer3_chunk2 = _mm_unpacklo_epi16(layer2_chunk1, layer2_chunk4);
__m128i layer3_chunk3 = _mm_unpackhi_epi16(layer2_chunk1, layer2_chunk4);
__m128i layer3_chunk4 = _mm_unpacklo_epi16(layer2_chunk2, layer2_chunk5);
__m128i layer3_chunk5 = _mm_unpackhi_epi16(layer2_chunk2, layer2_chunk5);
v_r0 = _mm_unpacklo_epi16(layer3_chunk0, layer3_chunk3);
v_r1 = _mm_unpackhi_epi16(layer3_chunk0, layer3_chunk3);
v_g0 = _mm_unpacklo_epi16(layer3_chunk1, layer3_chunk4);
v_g1 = _mm_unpackhi_epi16(layer3_chunk1, layer3_chunk4);
v_b0 = _mm_unpacklo_epi16(layer3_chunk2, layer3_chunk5);
v_b1 = _mm_unpackhi_epi16(layer3_chunk2, layer3_chunk5);
}
inline void _mm_deinterleave_epi16(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0, __m128i & v_g1,
__m128i & v_b0, __m128i & v_b1, __m128i & v_a0, __m128i & v_a1)
{
__m128i layer1_chunk0 = _mm_unpacklo_epi16(v_r0, v_b0);
__m128i layer1_chunk1 = _mm_unpackhi_epi16(v_r0, v_b0);
__m128i layer1_chunk2 = _mm_unpacklo_epi16(v_r1, v_b1);
__m128i layer1_chunk3 = _mm_unpackhi_epi16(v_r1, v_b1);
__m128i layer1_chunk4 = _mm_unpacklo_epi16(v_g0, v_a0);
__m128i layer1_chunk5 = _mm_unpackhi_epi16(v_g0, v_a0);
__m128i layer1_chunk6 = _mm_unpacklo_epi16(v_g1, v_a1);
__m128i layer1_chunk7 = _mm_unpackhi_epi16(v_g1, v_a1);
__m128i layer2_chunk0 = _mm_unpacklo_epi16(layer1_chunk0, layer1_chunk4);
__m128i layer2_chunk1 = _mm_unpackhi_epi16(layer1_chunk0, layer1_chunk4);
__m128i layer2_chunk2 = _mm_unpacklo_epi16(layer1_chunk1, layer1_chunk5);
__m128i layer2_chunk3 = _mm_unpackhi_epi16(layer1_chunk1, layer1_chunk5);
__m128i layer2_chunk4 = _mm_unpacklo_epi16(layer1_chunk2, layer1_chunk6);
__m128i layer2_chunk5 = _mm_unpackhi_epi16(layer1_chunk2, layer1_chunk6);
__m128i layer2_chunk6 = _mm_unpacklo_epi16(layer1_chunk3, layer1_chunk7);
__m128i layer2_chunk7 = _mm_unpackhi_epi16(layer1_chunk3, layer1_chunk7);
__m128i layer3_chunk0 = _mm_unpacklo_epi16(layer2_chunk0, layer2_chunk4);
__m128i layer3_chunk1 = _mm_unpackhi_epi16(layer2_chunk0, layer2_chunk4);
__m128i layer3_chunk2 = _mm_unpacklo_epi16(layer2_chunk1, layer2_chunk5);
__m128i layer3_chunk3 = _mm_unpackhi_epi16(layer2_chunk1, layer2_chunk5);
__m128i layer3_chunk4 = _mm_unpacklo_epi16(layer2_chunk2, layer2_chunk6);
__m128i layer3_chunk5 = _mm_unpackhi_epi16(layer2_chunk2, layer2_chunk6);
__m128i layer3_chunk6 = _mm_unpacklo_epi16(layer2_chunk3, layer2_chunk7);
__m128i layer3_chunk7 = _mm_unpackhi_epi16(layer2_chunk3, layer2_chunk7);
v_r0 = _mm_unpacklo_epi16(layer3_chunk0, layer3_chunk4);
v_r1 = _mm_unpackhi_epi16(layer3_chunk0, layer3_chunk4);
v_g0 = _mm_unpacklo_epi16(layer3_chunk1, layer3_chunk5);
v_g1 = _mm_unpackhi_epi16(layer3_chunk1, layer3_chunk5);
v_b0 = _mm_unpacklo_epi16(layer3_chunk2, layer3_chunk6);
v_b1 = _mm_unpackhi_epi16(layer3_chunk2, layer3_chunk6);
v_a0 = _mm_unpacklo_epi16(layer3_chunk3, layer3_chunk7);
v_a1 = _mm_unpackhi_epi16(layer3_chunk3, layer3_chunk7);
}
#if CV_SSE4_1
inline void _mm_interleave_epi16(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0, __m128i & v_g1)
{
__m128i v_mask = _mm_set1_epi32(0x0000ffff);
__m128i layer3_chunk0 = _mm_packus_epi32(_mm_and_si128(v_r0, v_mask), _mm_and_si128(v_r1, v_mask));
__m128i layer3_chunk2 = _mm_packus_epi32(_mm_srli_epi32(v_r0, 16), _mm_srli_epi32(v_r1, 16));
__m128i layer3_chunk1 = _mm_packus_epi32(_mm_and_si128(v_g0, v_mask), _mm_and_si128(v_g1, v_mask));
__m128i layer3_chunk3 = _mm_packus_epi32(_mm_srli_epi32(v_g0, 16), _mm_srli_epi32(v_g1, 16));
__m128i layer2_chunk0 = _mm_packus_epi32(_mm_and_si128(layer3_chunk0, v_mask), _mm_and_si128(layer3_chunk1, v_mask));
__m128i layer2_chunk2 = _mm_packus_epi32(_mm_srli_epi32(layer3_chunk0, 16), _mm_srli_epi32(layer3_chunk1, 16));
__m128i layer2_chunk1 = _mm_packus_epi32(_mm_and_si128(layer3_chunk2, v_mask), _mm_and_si128(layer3_chunk3, v_mask));
__m128i layer2_chunk3 = _mm_packus_epi32(_mm_srli_epi32(layer3_chunk2, 16), _mm_srli_epi32(layer3_chunk3, 16));
__m128i layer1_chunk0 = _mm_packus_epi32(_mm_and_si128(layer2_chunk0, v_mask), _mm_and_si128(layer2_chunk1, v_mask));
__m128i layer1_chunk2 = _mm_packus_epi32(_mm_srli_epi32(layer2_chunk0, 16), _mm_srli_epi32(layer2_chunk1, 16));
__m128i layer1_chunk1 = _mm_packus_epi32(_mm_and_si128(layer2_chunk2, v_mask), _mm_and_si128(layer2_chunk3, v_mask));
__m128i layer1_chunk3 = _mm_packus_epi32(_mm_srli_epi32(layer2_chunk2, 16), _mm_srli_epi32(layer2_chunk3, 16));
v_r0 = _mm_packus_epi32(_mm_and_si128(layer1_chunk0, v_mask), _mm_and_si128(layer1_chunk1, v_mask));
v_g0 = _mm_packus_epi32(_mm_srli_epi32(layer1_chunk0, 16), _mm_srli_epi32(layer1_chunk1, 16));
v_r1 = _mm_packus_epi32(_mm_and_si128(layer1_chunk2, v_mask), _mm_and_si128(layer1_chunk3, v_mask));
v_g1 = _mm_packus_epi32(_mm_srli_epi32(layer1_chunk2, 16), _mm_srli_epi32(layer1_chunk3, 16));
}
inline void _mm_interleave_epi16(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0,
__m128i & v_g1, __m128i & v_b0, __m128i & v_b1)
{
__m128i v_mask = _mm_set1_epi32(0x0000ffff);
__m128i layer3_chunk0 = _mm_packus_epi32(_mm_and_si128(v_r0, v_mask), _mm_and_si128(v_r1, v_mask));
__m128i layer3_chunk3 = _mm_packus_epi32(_mm_srli_epi32(v_r0, 16), _mm_srli_epi32(v_r1, 16));
__m128i layer3_chunk1 = _mm_packus_epi32(_mm_and_si128(v_g0, v_mask), _mm_and_si128(v_g1, v_mask));
__m128i layer3_chunk4 = _mm_packus_epi32(_mm_srli_epi32(v_g0, 16), _mm_srli_epi32(v_g1, 16));
__m128i layer3_chunk2 = _mm_packus_epi32(_mm_and_si128(v_b0, v_mask), _mm_and_si128(v_b1, v_mask));
__m128i layer3_chunk5 = _mm_packus_epi32(_mm_srli_epi32(v_b0, 16), _mm_srli_epi32(v_b1, 16));
__m128i layer2_chunk0 = _mm_packus_epi32(_mm_and_si128(layer3_chunk0, v_mask), _mm_and_si128(layer3_chunk1, v_mask));
__m128i layer2_chunk3 = _mm_packus_epi32(_mm_srli_epi32(layer3_chunk0, 16), _mm_srli_epi32(layer3_chunk1, 16));
__m128i layer2_chunk1 = _mm_packus_epi32(_mm_and_si128(layer3_chunk2, v_mask), _mm_and_si128(layer3_chunk3, v_mask));
__m128i layer2_chunk4 = _mm_packus_epi32(_mm_srli_epi32(layer3_chunk2, 16), _mm_srli_epi32(layer3_chunk3, 16));
__m128i layer2_chunk2 = _mm_packus_epi32(_mm_and_si128(layer3_chunk4, v_mask), _mm_and_si128(layer3_chunk5, v_mask));
__m128i layer2_chunk5 = _mm_packus_epi32(_mm_srli_epi32(layer3_chunk4, 16), _mm_srli_epi32(layer3_chunk5, 16));
__m128i layer1_chunk0 = _mm_packus_epi32(_mm_and_si128(layer2_chunk0, v_mask), _mm_and_si128(layer2_chunk1, v_mask));
__m128i layer1_chunk3 = _mm_packus_epi32(_mm_srli_epi32(layer2_chunk0, 16), _mm_srli_epi32(layer2_chunk1, 16));
__m128i layer1_chunk1 = _mm_packus_epi32(_mm_and_si128(layer2_chunk2, v_mask), _mm_and_si128(layer2_chunk3, v_mask));
__m128i layer1_chunk4 = _mm_packus_epi32(_mm_srli_epi32(layer2_chunk2, 16), _mm_srli_epi32(layer2_chunk3, 16));
__m128i layer1_chunk2 = _mm_packus_epi32(_mm_and_si128(layer2_chunk4, v_mask), _mm_and_si128(layer2_chunk5, v_mask));
__m128i layer1_chunk5 = _mm_packus_epi32(_mm_srli_epi32(layer2_chunk4, 16), _mm_srli_epi32(layer2_chunk5, 16));
v_r0 = _mm_packus_epi32(_mm_and_si128(layer1_chunk0, v_mask), _mm_and_si128(layer1_chunk1, v_mask));
v_g1 = _mm_packus_epi32(_mm_srli_epi32(layer1_chunk0, 16), _mm_srli_epi32(layer1_chunk1, 16));
v_r1 = _mm_packus_epi32(_mm_and_si128(layer1_chunk2, v_mask), _mm_and_si128(layer1_chunk3, v_mask));
v_b0 = _mm_packus_epi32(_mm_srli_epi32(layer1_chunk2, 16), _mm_srli_epi32(layer1_chunk3, 16));
v_g0 = _mm_packus_epi32(_mm_and_si128(layer1_chunk4, v_mask), _mm_and_si128(layer1_chunk5, v_mask));
v_b1 = _mm_packus_epi32(_mm_srli_epi32(layer1_chunk4, 16), _mm_srli_epi32(layer1_chunk5, 16));
}
inline void _mm_interleave_epi16(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0, __m128i & v_g1,
__m128i & v_b0, __m128i & v_b1, __m128i & v_a0, __m128i & v_a1)
{
__m128i v_mask = _mm_set1_epi32(0x0000ffff);
__m128i layer3_chunk0 = _mm_packus_epi32(_mm_and_si128(v_r0, v_mask), _mm_and_si128(v_r1, v_mask));
__m128i layer3_chunk4 = _mm_packus_epi32(_mm_srli_epi32(v_r0, 16), _mm_srli_epi32(v_r1, 16));
__m128i layer3_chunk1 = _mm_packus_epi32(_mm_and_si128(v_g0, v_mask), _mm_and_si128(v_g1, v_mask));
__m128i layer3_chunk5 = _mm_packus_epi32(_mm_srli_epi32(v_g0, 16), _mm_srli_epi32(v_g1, 16));
__m128i layer3_chunk2 = _mm_packus_epi32(_mm_and_si128(v_b0, v_mask), _mm_and_si128(v_b1, v_mask));
__m128i layer3_chunk6 = _mm_packus_epi32(_mm_srli_epi32(v_b0, 16), _mm_srli_epi32(v_b1, 16));
__m128i layer3_chunk3 = _mm_packus_epi32(_mm_and_si128(v_a0, v_mask), _mm_and_si128(v_a1, v_mask));
__m128i layer3_chunk7 = _mm_packus_epi32(_mm_srli_epi32(v_a0, 16), _mm_srli_epi32(v_a1, 16));
__m128i layer2_chunk0 = _mm_packus_epi32(_mm_and_si128(layer3_chunk0, v_mask), _mm_and_si128(layer3_chunk1, v_mask));
__m128i layer2_chunk4 = _mm_packus_epi32(_mm_srli_epi32(layer3_chunk0, 16), _mm_srli_epi32(layer3_chunk1, 16));
__m128i layer2_chunk1 = _mm_packus_epi32(_mm_and_si128(layer3_chunk2, v_mask), _mm_and_si128(layer3_chunk3, v_mask));
__m128i layer2_chunk5 = _mm_packus_epi32(_mm_srli_epi32(layer3_chunk2, 16), _mm_srli_epi32(layer3_chunk3, 16));
__m128i layer2_chunk2 = _mm_packus_epi32(_mm_and_si128(layer3_chunk4, v_mask), _mm_and_si128(layer3_chunk5, v_mask));
__m128i layer2_chunk6 = _mm_packus_epi32(_mm_srli_epi32(layer3_chunk4, 16), _mm_srli_epi32(layer3_chunk5, 16));
__m128i layer2_chunk3 = _mm_packus_epi32(_mm_and_si128(layer3_chunk6, v_mask), _mm_and_si128(layer3_chunk7, v_mask));
__m128i layer2_chunk7 = _mm_packus_epi32(_mm_srli_epi32(layer3_chunk6, 16), _mm_srli_epi32(layer3_chunk7, 16));
__m128i layer1_chunk0 = _mm_packus_epi32(_mm_and_si128(layer2_chunk0, v_mask), _mm_and_si128(layer2_chunk1, v_mask));
__m128i layer1_chunk4 = _mm_packus_epi32(_mm_srli_epi32(layer2_chunk0, 16), _mm_srli_epi32(layer2_chunk1, 16));
__m128i layer1_chunk1 = _mm_packus_epi32(_mm_and_si128(layer2_chunk2, v_mask), _mm_and_si128(layer2_chunk3, v_mask));
__m128i layer1_chunk5 = _mm_packus_epi32(_mm_srli_epi32(layer2_chunk2, 16), _mm_srli_epi32(layer2_chunk3, 16));
__m128i layer1_chunk2 = _mm_packus_epi32(_mm_and_si128(layer2_chunk4, v_mask), _mm_and_si128(layer2_chunk5, v_mask));
__m128i layer1_chunk6 = _mm_packus_epi32(_mm_srli_epi32(layer2_chunk4, 16), _mm_srli_epi32(layer2_chunk5, 16));
__m128i layer1_chunk3 = _mm_packus_epi32(_mm_and_si128(layer2_chunk6, v_mask), _mm_and_si128(layer2_chunk7, v_mask));
__m128i layer1_chunk7 = _mm_packus_epi32(_mm_srli_epi32(layer2_chunk6, 16), _mm_srli_epi32(layer2_chunk7, 16));
v_r0 = _mm_packus_epi32(_mm_and_si128(layer1_chunk0, v_mask), _mm_and_si128(layer1_chunk1, v_mask));
v_b0 = _mm_packus_epi32(_mm_srli_epi32(layer1_chunk0, 16), _mm_srli_epi32(layer1_chunk1, 16));
v_r1 = _mm_packus_epi32(_mm_and_si128(layer1_chunk2, v_mask), _mm_and_si128(layer1_chunk3, v_mask));
v_b1 = _mm_packus_epi32(_mm_srli_epi32(layer1_chunk2, 16), _mm_srli_epi32(layer1_chunk3, 16));
v_g0 = _mm_packus_epi32(_mm_and_si128(layer1_chunk4, v_mask), _mm_and_si128(layer1_chunk5, v_mask));
v_a0 = _mm_packus_epi32(_mm_srli_epi32(layer1_chunk4, 16), _mm_srli_epi32(layer1_chunk5, 16));
v_g1 = _mm_packus_epi32(_mm_and_si128(layer1_chunk6, v_mask), _mm_and_si128(layer1_chunk7, v_mask));
v_a1 = _mm_packus_epi32(_mm_srli_epi32(layer1_chunk6, 16), _mm_srli_epi32(layer1_chunk7, 16));
}
#endif // CV_SSE4_1
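// The _mm_interleave_epi16 overloads above perform the inverse transform (planar
// registers back to interleaved order). They are guarded by CV_SSE4_1 because they
// rely on _mm_packus_epi32, which is only available starting with SSE4.1.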
inline void _mm_deinterleave_ps(__m128 & v_r0, __m128 & v_r1, __m128 & v_g0, __m128 & v_g1)
{
__m128 layer1_chunk0 = _mm_unpacklo_ps(v_r0, v_g0);
__m128 layer1_chunk1 = _mm_unpackhi_ps(v_r0, v_g0);
__m128 layer1_chunk2 = _mm_unpacklo_ps(v_r1, v_g1);
__m128 layer1_chunk3 = _mm_unpackhi_ps(v_r1, v_g1);
__m128 layer2_chunk0 = _mm_unpacklo_ps(layer1_chunk0, layer1_chunk2);
__m128 layer2_chunk1 = _mm_unpackhi_ps(layer1_chunk0, layer1_chunk2);
__m128 layer2_chunk2 = _mm_unpacklo_ps(layer1_chunk1, layer1_chunk3);
__m128 layer2_chunk3 = _mm_unpackhi_ps(layer1_chunk1, layer1_chunk3);
v_r0 = _mm_unpacklo_ps(layer2_chunk0, layer2_chunk2);
v_r1 = _mm_unpackhi_ps(layer2_chunk0, layer2_chunk2);
v_g0 = _mm_unpacklo_ps(layer2_chunk1, layer2_chunk3);
v_g1 = _mm_unpackhi_ps(layer2_chunk1, layer2_chunk3);
}
inline void _mm_deinterleave_ps(__m128 & v_r0, __m128 & v_r1, __m128 & v_g0,
__m128 & v_g1, __m128 & v_b0, __m128 & v_b1)
{
__m128 layer1_chunk0 = _mm_unpacklo_ps(v_r0, v_g1);
__m128 layer1_chunk1 = _mm_unpackhi_ps(v_r0, v_g1);
__m128 layer1_chunk2 = _mm_unpacklo_ps(v_r1, v_b0);
__m128 layer1_chunk3 = _mm_unpackhi_ps(v_r1, v_b0);
__m128 layer1_chunk4 = _mm_unpacklo_ps(v_g0, v_b1);
__m128 layer1_chunk5 = _mm_unpackhi_ps(v_g0, v_b1);
__m128 layer2_chunk0 = _mm_unpacklo_ps(layer1_chunk0, layer1_chunk3);
__m128 layer2_chunk1 = _mm_unpackhi_ps(layer1_chunk0, layer1_chunk3);
__m128 layer2_chunk2 = _mm_unpacklo_ps(layer1_chunk1, layer1_chunk4);
__m128 layer2_chunk3 = _mm_unpackhi_ps(layer1_chunk1, layer1_chunk4);
__m128 layer2_chunk4 = _mm_unpacklo_ps(layer1_chunk2, layer1_chunk5);
__m128 layer2_chunk5 = _mm_unpackhi_ps(layer1_chunk2, layer1_chunk5);
v_r0 = _mm_unpacklo_ps(layer2_chunk0, layer2_chunk3);
v_r1 = _mm_unpackhi_ps(layer2_chunk0, layer2_chunk3);
v_g0 = _mm_unpacklo_ps(layer2_chunk1, layer2_chunk4);
v_g1 = _mm_unpackhi_ps(layer2_chunk1, layer2_chunk4);
v_b0 = _mm_unpacklo_ps(layer2_chunk2, layer2_chunk5);
v_b1 = _mm_unpackhi_ps(layer2_chunk2, layer2_chunk5);
}
inline void _mm_deinterleave_ps(__m128 & v_r0, __m128 & v_r1, __m128 & v_g0, __m128 & v_g1,
__m128 & v_b0, __m128 & v_b1, __m128 & v_a0, __m128 & v_a1)
{
__m128 layer1_chunk0 = _mm_unpacklo_ps(v_r0, v_b0);
__m128 layer1_chunk1 = _mm_unpackhi_ps(v_r0, v_b0);
__m128 layer1_chunk2 = _mm_unpacklo_ps(v_r1, v_b1);
__m128 layer1_chunk3 = _mm_unpackhi_ps(v_r1, v_b1);
__m128 layer1_chunk4 = _mm_unpacklo_ps(v_g0, v_a0);
__m128 layer1_chunk5 = _mm_unpackhi_ps(v_g0, v_a0);
__m128 layer1_chunk6 = _mm_unpacklo_ps(v_g1, v_a1);
__m128 layer1_chunk7 = _mm_unpackhi_ps(v_g1, v_a1);
__m128 layer2_chunk0 = _mm_unpacklo_ps(layer1_chunk0, layer1_chunk4);
__m128 layer2_chunk1 = _mm_unpackhi_ps(layer1_chunk0, layer1_chunk4);
__m128 layer2_chunk2 = _mm_unpacklo_ps(layer1_chunk1, layer1_chunk5);
__m128 layer2_chunk3 = _mm_unpackhi_ps(layer1_chunk1, layer1_chunk5);
__m128 layer2_chunk4 = _mm_unpacklo_ps(layer1_chunk2, layer1_chunk6);
__m128 layer2_chunk5 = _mm_unpackhi_ps(layer1_chunk2, layer1_chunk6);
__m128 layer2_chunk6 = _mm_unpacklo_ps(layer1_chunk3, layer1_chunk7);
__m128 layer2_chunk7 = _mm_unpackhi_ps(layer1_chunk3, layer1_chunk7);
v_r0 = _mm_unpacklo_ps(layer2_chunk0, layer2_chunk4);
v_r1 = _mm_unpackhi_ps(layer2_chunk0, layer2_chunk4);
v_g0 = _mm_unpacklo_ps(layer2_chunk1, layer2_chunk5);
v_g1 = _mm_unpackhi_ps(layer2_chunk1, layer2_chunk5);
v_b0 = _mm_unpacklo_ps(layer2_chunk2, layer2_chunk6);
v_b1 = _mm_unpackhi_ps(layer2_chunk2, layer2_chunk6);
v_a0 = _mm_unpacklo_ps(layer2_chunk3, layer2_chunk7);
v_a1 = _mm_unpackhi_ps(layer2_chunk3, layer2_chunk7);
}
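// The _mm_deinterleave_ps overloads above are the single-precision counterparts of
// the 16-bit helpers: interleaved float data held in __m128 registers becomes planar
// channel registers. A minimal 2-channel sketch (the array name `xy` is illustrative):
//
//   float xy[16];                     // 8 interleaved (x, y) pairs
//   __m128 v_x0 = _mm_loadu_ps(xy +  0), v_x1 = _mm_loadu_ps(xy +  4);
//   __m128 v_y0 = _mm_loadu_ps(xy +  8), v_y1 = _mm_loadu_ps(xy + 12);
//   _mm_deinterleave_ps(v_x0, v_x1, v_y0, v_y1);
//   // v_x0/v_x1 now hold x0..x7 and v_y0/v_y1 hold y0..y7.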
inline void _mm_interleave_ps(__m128 & v_r0, __m128 & v_r1, __m128 & v_g0, __m128 & v_g1)
{
const int mask_lo = _MM_SHUFFLE(2, 0, 2, 0), mask_hi = _MM_SHUFFLE(3, 1, 3, 1);
__m128 layer2_chunk0 = _mm_shuffle_ps(v_r0, v_r1, mask_lo);
__m128 layer2_chunk2 = _mm_shuffle_ps(v_r0, v_r1, mask_hi);
__m128 layer2_chunk1 = _mm_shuffle_ps(v_g0, v_g1, mask_lo);
__m128 layer2_chunk3 = _mm_shuffle_ps(v_g0, v_g1, mask_hi);
__m128 layer1_chunk0 = _mm_shuffle_ps(layer2_chunk0, layer2_chunk1, mask_lo);
__m128 layer1_chunk2 = _mm_shuffle_ps(layer2_chunk0, layer2_chunk1, mask_hi);
__m128 layer1_chunk1 = _mm_shuffle_ps(layer2_chunk2, layer2_chunk3, mask_lo);
__m128 layer1_chunk3 = _mm_shuffle_ps(layer2_chunk2, layer2_chunk3, mask_hi);
v_r0 = _mm_shuffle_ps(layer1_chunk0, layer1_chunk1, mask_lo);
v_g0 = _mm_shuffle_ps(layer1_chunk0, layer1_chunk1, mask_hi);
v_r1 = _mm_shuffle_ps(layer1_chunk2, layer1_chunk3, mask_lo);
v_g1 = _mm_shuffle_ps(layer1_chunk2, layer1_chunk3, mask_hi);
}
inline void _mm_interleave_ps(__m128 & v_r0, __m128 & v_r1, __m128 & v_g0,
__m128 & v_g1, __m128 & v_b0, __m128 & v_b1)
{
const int mask_lo = _MM_SHUFFLE(2, 0, 2, 0), mask_hi = _MM_SHUFFLE(3, 1, 3, 1);
__m128 layer2_chunk0 = _mm_shuffle_ps(v_r0, v_r1, mask_lo);
__m128 layer2_chunk3 = _mm_shuffle_ps(v_r0, v_r1, mask_hi);
__m128 layer2_chunk1 = _mm_shuffle_ps(v_g0, v_g1, mask_lo);
__m128 layer2_chunk4 = _mm_shuffle_ps(v_g0, v_g1, mask_hi);
__m128 layer2_chunk2 = _mm_shuffle_ps(v_b0, v_b1, mask_lo);
__m128 layer2_chunk5 = _mm_shuffle_ps(v_b0, v_b1, mask_hi);
__m128 layer1_chunk0 = _mm_shuffle_ps(layer2_chunk0, layer2_chunk1, mask_lo);
__m128 layer1_chunk3 = _mm_shuffle_ps(layer2_chunk0, layer2_chunk1, mask_hi);
__m128 layer1_chunk1 = _mm_shuffle_ps(layer2_chunk2, layer2_chunk3, mask_lo);
__m128 layer1_chunk4 = _mm_shuffle_ps(layer2_chunk2, layer2_chunk3, mask_hi);
__m128 layer1_chunk2 = _mm_shuffle_ps(layer2_chunk4, layer2_chunk5, mask_lo);
__m128 layer1_chunk5 = _mm_shuffle_ps(layer2_chunk4, layer2_chunk5, mask_hi);
v_r0 = _mm_shuffle_ps(layer1_chunk0, layer1_chunk1, mask_lo);
v_g1 = _mm_shuffle_ps(layer1_chunk0, layer1_chunk1, mask_hi);
v_r1 = _mm_shuffle_ps(layer1_chunk2, layer1_chunk3, mask_lo);
v_b0 = _mm_shuffle_ps(layer1_chunk2, layer1_chunk3, mask_hi);
v_g0 = _mm_shuffle_ps(layer1_chunk4, layer1_chunk5, mask_lo);
v_b1 = _mm_shuffle_ps(layer1_chunk4, layer1_chunk5, mask_hi);
}
inline void _mm_interleave_ps(__m128 & v_r0, __m128 & v_r1, __m128 & v_g0, __m128 & v_g1,
__m128 & v_b0, __m128 & v_b1, __m128 & v_a0, __m128 & v_a1)
{
const int mask_lo = _MM_SHUFFLE(2, 0, 2, 0), mask_hi = _MM_SHUFFLE(3, 1, 3, 1);
__m128 layer2_chunk0 = _mm_shuffle_ps(v_r0, v_r1, mask_lo);
__m128 layer2_chunk4 = _mm_shuffle_ps(v_r0, v_r1, mask_hi);
__m128 layer2_chunk1 = _mm_shuffle_ps(v_g0, v_g1, mask_lo);
__m128 layer2_chunk5 = _mm_shuffle_ps(v_g0, v_g1, mask_hi);
__m128 layer2_chunk2 = _mm_shuffle_ps(v_b0, v_b1, mask_lo);
__m128 layer2_chunk6 = _mm_shuffle_ps(v_b0, v_b1, mask_hi);
__m128 layer2_chunk3 = _mm_shuffle_ps(v_a0, v_a1, mask_lo);
__m128 layer2_chunk7 = _mm_shuffle_ps(v_a0, v_a1, mask_hi);
__m128 layer1_chunk0 = _mm_shuffle_ps(layer2_chunk0, layer2_chunk1, mask_lo);
__m128 layer1_chunk4 = _mm_shuffle_ps(layer2_chunk0, layer2_chunk1, mask_hi);
__m128 layer1_chunk1 = _mm_shuffle_ps(layer2_chunk2, layer2_chunk3, mask_lo);
__m128 layer1_chunk5 = _mm_shuffle_ps(layer2_chunk2, layer2_chunk3, mask_hi);
__m128 layer1_chunk2 = _mm_shuffle_ps(layer2_chunk4, layer2_chunk5, mask_lo);
__m128 layer1_chunk6 = _mm_shuffle_ps(layer2_chunk4, layer2_chunk5, mask_hi);
__m128 layer1_chunk3 = _mm_shuffle_ps(layer2_chunk6, layer2_chunk7, mask_lo);
__m128 layer1_chunk7 = _mm_shuffle_ps(layer2_chunk6, layer2_chunk7, mask_hi);
v_r0 = _mm_shuffle_ps(layer1_chunk0, layer1_chunk1, mask_lo);
v_b0 = _mm_shuffle_ps(layer1_chunk0, layer1_chunk1, mask_hi);
v_r1 = _mm_shuffle_ps(layer1_chunk2, layer1_chunk3, mask_lo);
v_b1 = _mm_shuffle_ps(layer1_chunk2, layer1_chunk3, mask_hi);
v_g0 = _mm_shuffle_ps(layer1_chunk4, layer1_chunk5, mask_lo);
v_a0 = _mm_shuffle_ps(layer1_chunk4, layer1_chunk5, mask_hi);
v_g1 = _mm_shuffle_ps(layer1_chunk6, layer1_chunk7, mask_lo);
v_a1 = _mm_shuffle_ps(layer1_chunk6, layer1_chunk7, mask_hi);
}
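// In the float interleave helpers above, mask_lo = _MM_SHUFFLE(2, 0, 2, 0) makes
// _mm_shuffle_ps(a, b, mask_lo) return { a[0], a[2], b[0], b[2] } (the even lanes)
// and mask_hi = _MM_SHUFFLE(3, 1, 3, 1) returns { a[1], a[3], b[1], b[3] } (the odd
// lanes); three such even/odd passes produce the interleaved ordering.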
#endif // CV_SSE2
//! @}
#endif //__OPENCV_CORE_SSE_UTILS_HPP__

File diff suppressed because it is too large


@@ -0,0 +1,607 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
// Copyright (C) 2015, Itseez Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#ifndef __OPENCV_ARITHM_CORE_HPP__
#define __OPENCV_ARITHM_CORE_HPP__
#include "arithm_simd.hpp"
namespace cv {
template<typename T1, typename T2=T1, typename T3=T1> struct OpAdd
{
typedef T1 type1;
typedef T2 type2;
typedef T3 rtype;
T3 operator ()(const T1 a, const T2 b) const { return saturate_cast<T3>(a + b); }
};
template<typename T1, typename T2=T1, typename T3=T1> struct OpSub
{
typedef T1 type1;
typedef T2 type2;
typedef T3 rtype;
T3 operator ()(const T1 a, const T2 b) const { return saturate_cast<T3>(a - b); }
};
template<typename T1, typename T2=T1, typename T3=T1> struct OpRSub
{
typedef T1 type1;
typedef T2 type2;
typedef T3 rtype;
T3 operator ()(const T1 a, const T2 b) const { return saturate_cast<T3>(b - a); }
};
template<typename T> struct OpMin
{
typedef T type1;
typedef T type2;
typedef T rtype;
T operator ()(const T a, const T b) const { return std::min(a, b); }
};
template<typename T> struct OpMax
{
typedef T type1;
typedef T type2;
typedef T rtype;
T operator ()(const T a, const T b) const { return std::max(a, b); }
};
template<typename T> struct OpAbsDiff
{
typedef T type1;
typedef T type2;
typedef T rtype;
T operator()(T a, T b) const { return a > b ? a - b : b - a; }
};
template<typename T> struct OpAnd
{
typedef T type1;
typedef T type2;
typedef T rtype;
T operator()( T a, T b ) const { return a & b; }
};
template<typename T> struct OpOr
{
typedef T type1;
typedef T type2;
typedef T rtype;
T operator()( T a, T b ) const { return a | b; }
};
template<typename T> struct OpXor
{
typedef T type1;
typedef T type2;
typedef T rtype;
T operator()( T a, T b ) const { return a ^ b; }
};
template<typename T> struct OpNot
{
typedef T type1;
typedef T type2;
typedef T rtype;
T operator()( T a, T ) const { return ~a; }
};
//=============================================================================
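// vBinOp applies an element-wise binary operation to two 2-D arrays given by pointer
// and row step in bytes. Op supplies the scalar implementation and VOp the vector one
// (the V* functors are expected to come from arithm_simd.hpp); the widest available
// vector path runs first (256-bit with AVX2, otherwise 128-bit with SSE2/NEON plus a
// 64-bit SSE2 pass) and a scalar loop finishes the tail of every row.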
template<typename T, class Op, class VOp>
void vBinOp(const T* src1, size_t step1, const T* src2, size_t step2, T* dst, size_t step, int width, int height)
{
#if CV_SSE2 || CV_NEON
VOp vop;
#endif
Op op;
for( ; height--; src1 = (const T *)((const uchar *)src1 + step1),
src2 = (const T *)((const uchar *)src2 + step2),
dst = (T *)((uchar *)dst + step) )
{
int x = 0;
#if CV_NEON || CV_SSE2
#if CV_AVX2
if( USE_AVX2 )
{
for( ; x <= width - 32/(int)sizeof(T); x += 32/sizeof(T) )
{
typename VLoadStore256<T>::reg_type r0 = VLoadStore256<T>::load(src1 + x);
r0 = vop(r0, VLoadStore256<T>::load(src2 + x));
VLoadStore256<T>::store(dst + x, r0);
}
}
#else
#if CV_SSE2
if( USE_SSE2 )
{
#endif // CV_SSE2
for( ; x <= width - 32/(int)sizeof(T); x += 32/sizeof(T) )
{
typename VLoadStore128<T>::reg_type r0 = VLoadStore128<T>::load(src1 + x );
typename VLoadStore128<T>::reg_type r1 = VLoadStore128<T>::load(src1 + x + 16/sizeof(T));
r0 = vop(r0, VLoadStore128<T>::load(src2 + x ));
r1 = vop(r1, VLoadStore128<T>::load(src2 + x + 16/sizeof(T)));
VLoadStore128<T>::store(dst + x , r0);
VLoadStore128<T>::store(dst + x + 16/sizeof(T), r1);
}
#if CV_SSE2
}
#endif // CV_SSE2
#endif // CV_AVX2
#endif // CV_NEON || CV_SSE2
#if CV_AVX2
// nothing
#elif CV_SSE2
if( USE_SSE2 )
{
for( ; x <= width - 8/(int)sizeof(T); x += 8/sizeof(T) )
{
typename VLoadStore64<T>::reg_type r = VLoadStore64<T>::load(src1 + x);
r = vop(r, VLoadStore64<T>::load(src2 + x));
VLoadStore64<T>::store(dst + x, r);
}
}
#endif
#if CV_ENABLE_UNROLLED
for( ; x <= width - 4; x += 4 )
{
T v0 = op(src1[x], src2[x]);
T v1 = op(src1[x+1], src2[x+1]);
dst[x] = v0; dst[x+1] = v1;
v0 = op(src1[x+2], src2[x+2]);
v1 = op(src1[x+3], src2[x+3]);
dst[x+2] = v0; dst[x+3] = v1;
}
#endif
for( ; x < width; x++ )
dst[x] = op(src1[x], src2[x]);
}
}
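// vBinOp32 is the variant for 32-bit element types (int, float). In addition to the
// generic vector pass it has an aligned fast path (VLoadStore256Aligned /
// VLoadStore128Aligned) that is taken only when src1, src2 and dst are all 32- resp.
// 16-byte aligned; vBinOp64 below follows the same scheme for 64-bit elements and
// uses only the aligned vector path.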
template<typename T, class Op, class Op32>
void vBinOp32(const T* src1, size_t step1, const T* src2, size_t step2,
T* dst, size_t step, int width, int height)
{
#if CV_SSE2 || CV_NEON
Op32 op32;
#endif
Op op;
for( ; height--; src1 = (const T *)((const uchar *)src1 + step1),
src2 = (const T *)((const uchar *)src2 + step2),
dst = (T *)((uchar *)dst + step) )
{
int x = 0;
#if CV_AVX2
if( USE_AVX2 )
{
if( (((size_t)src1|(size_t)src2|(size_t)dst)&31) == 0 )
{
for( ; x <= width - 8; x += 8 )
{
typename VLoadStore256Aligned<T>::reg_type r0 = VLoadStore256Aligned<T>::load(src1 + x);
r0 = op32(r0, VLoadStore256Aligned<T>::load(src2 + x));
VLoadStore256Aligned<T>::store(dst + x, r0);
}
}
}
#elif CV_SSE2
if( USE_SSE2 )
{
if( (((size_t)src1|(size_t)src2|(size_t)dst)&15) == 0 )
{
for( ; x <= width - 8; x += 8 )
{
typename VLoadStore128Aligned<T>::reg_type r0 = VLoadStore128Aligned<T>::load(src1 + x );
typename VLoadStore128Aligned<T>::reg_type r1 = VLoadStore128Aligned<T>::load(src1 + x + 4);
r0 = op32(r0, VLoadStore128Aligned<T>::load(src2 + x ));
r1 = op32(r1, VLoadStore128Aligned<T>::load(src2 + x + 4));
VLoadStore128Aligned<T>::store(dst + x , r0);
VLoadStore128Aligned<T>::store(dst + x + 4, r1);
}
}
}
#endif // CV_AVX2
#if CV_NEON || CV_SSE2
#if CV_AVX2
if( USE_AVX2 )
{
for( ; x <= width - 8; x += 8 )
{
typename VLoadStore256<T>::reg_type r0 = VLoadStore256<T>::load(src1 + x);
r0 = op32(r0, VLoadStore256<T>::load(src2 + x));
VLoadStore256<T>::store(dst + x, r0);
}
}
#else
#if CV_SSE2
if( USE_SSE2 )
{
#endif // CV_SSE2
for( ; x <= width - 8; x += 8 )
{
typename VLoadStore128<T>::reg_type r0 = VLoadStore128<T>::load(src1 + x );
typename VLoadStore128<T>::reg_type r1 = VLoadStore128<T>::load(src1 + x + 4);
r0 = op32(r0, VLoadStore128<T>::load(src2 + x ));
r1 = op32(r1, VLoadStore128<T>::load(src2 + x + 4));
VLoadStore128<T>::store(dst + x , r0);
VLoadStore128<T>::store(dst + x + 4, r1);
}
#if CV_SSE2
}
#endif // CV_SSE2
#endif // CV_AVX2
#endif // CV_NEON || CV_SSE2
#if CV_ENABLE_UNROLLED
for( ; x <= width - 4; x += 4 )
{
T v0 = op(src1[x], src2[x]);
T v1 = op(src1[x+1], src2[x+1]);
dst[x] = v0; dst[x+1] = v1;
v0 = op(src1[x+2], src2[x+2]);
v1 = op(src1[x+3], src2[x+3]);
dst[x+2] = v0; dst[x+3] = v1;
}
#endif
for( ; x < width; x++ )
dst[x] = op(src1[x], src2[x]);
}
}
template<typename T, class Op, class Op64>
void vBinOp64(const T* src1, size_t step1, const T* src2, size_t step2,
T* dst, size_t step, int width, int height)
{
#if CV_SSE2
Op64 op64;
#endif
Op op;
for( ; height--; src1 = (const T *)((const uchar *)src1 + step1),
src2 = (const T *)((const uchar *)src2 + step2),
dst = (T *)((uchar *)dst + step) )
{
int x = 0;
#if CV_AVX2
if( USE_AVX2 )
{
if( (((size_t)src1|(size_t)src2|(size_t)dst)&31) == 0 )
{
for( ; x <= width - 4; x += 4 )
{
typename VLoadStore256Aligned<T>::reg_type r0 = VLoadStore256Aligned<T>::load(src1 + x);
r0 = op64(r0, VLoadStore256Aligned<T>::load(src2 + x));
VLoadStore256Aligned<T>::store(dst + x, r0);
}
}
}
#elif CV_SSE2
if( USE_SSE2 )
{
if( (((size_t)src1|(size_t)src2|(size_t)dst)&15) == 0 )
{
for( ; x <= width - 4; x += 4 )
{
typename VLoadStore128Aligned<T>::reg_type r0 = VLoadStore128Aligned<T>::load(src1 + x );
typename VLoadStore128Aligned<T>::reg_type r1 = VLoadStore128Aligned<T>::load(src1 + x + 2);
r0 = op64(r0, VLoadStore128Aligned<T>::load(src2 + x ));
r1 = op64(r1, VLoadStore128Aligned<T>::load(src2 + x + 2));
VLoadStore128Aligned<T>::store(dst + x , r0);
VLoadStore128Aligned<T>::store(dst + x + 2, r1);
}
}
}
#endif
for( ; x <= width - 4; x += 4 )
{
T v0 = op(src1[x], src2[x]);
T v1 = op(src1[x+1], src2[x+1]);
dst[x] = v0; dst[x+1] = v1;
v0 = op(src1[x+2], src2[x+2]);
v1 = op(src1[x+3], src2[x+3]);
dst[x+2] = v0; dst[x+3] = v1;
}
for( ; x < width; x++ )
dst[x] = op(src1[x], src2[x]);
}
}
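// cmp_ writes a 0/255 mask per element. Only '>' and '==' are evaluated directly:
// CMP_GE and CMP_LT are first reduced to CMP_LE / CMP_GT by swapping the operands,
// and the negated codes reuse the same comparison with m = 255, since
//   -(a > b) ^ m   gives   -(1) ^ 255 -> (uchar)0   and   -(0) ^ 255 -> 255,
// i.e. exactly the inverted mask. Cmp_SIMD<T> (presumably provided by
// arithm_simd.hpp) handles a vectorized prefix of each row before the scalar loops.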
template<typename T> static void
cmp_(const T* src1, size_t step1, const T* src2, size_t step2,
uchar* dst, size_t step, int width, int height, int code)
{
step1 /= sizeof(src1[0]);
step2 /= sizeof(src2[0]);
if( code == CMP_GE || code == CMP_LT )
{
std::swap(src1, src2);
std::swap(step1, step2);
code = code == CMP_GE ? CMP_LE : CMP_GT;
}
Cmp_SIMD<T> vop(code);
if( code == CMP_GT || code == CMP_LE )
{
int m = code == CMP_GT ? 0 : 255;
for( ; height--; src1 += step1, src2 += step2, dst += step )
{
int x = vop(src1, src2, dst, width);
#if CV_ENABLE_UNROLLED
for( ; x <= width - 4; x += 4 )
{
int t0, t1;
t0 = -(src1[x] > src2[x]) ^ m;
t1 = -(src1[x+1] > src2[x+1]) ^ m;
dst[x] = (uchar)t0; dst[x+1] = (uchar)t1;
t0 = -(src1[x+2] > src2[x+2]) ^ m;
t1 = -(src1[x+3] > src2[x+3]) ^ m;
dst[x+2] = (uchar)t0; dst[x+3] = (uchar)t1;
}
#endif
for( ; x < width; x++ )
dst[x] = (uchar)(-(src1[x] > src2[x]) ^ m);
}
}
else if( code == CMP_EQ || code == CMP_NE )
{
int m = code == CMP_EQ ? 0 : 255;
for( ; height--; src1 += step1, src2 += step2, dst += step )
{
int x = 0;
#if CV_ENABLE_UNROLLED
for( ; x <= width - 4; x += 4 )
{
int t0, t1;
t0 = -(src1[x] == src2[x]) ^ m;
t1 = -(src1[x+1] == src2[x+1]) ^ m;
dst[x] = (uchar)t0; dst[x+1] = (uchar)t1;
t0 = -(src1[x+2] == src2[x+2]) ^ m;
t1 = -(src1[x+3] == src2[x+3]) ^ m;
dst[x+2] = (uchar)t0; dst[x+3] = (uchar)t1;
}
#endif
for( ; x < width; x++ )
dst[x] = (uchar)(-(src1[x] == src2[x]) ^ m);
}
}
}
template<typename T, typename WT> static void
mul_( const T* src1, size_t step1, const T* src2, size_t step2,
T* dst, size_t step, int width, int height, WT scale )
{
step1 /= sizeof(src1[0]);
step2 /= sizeof(src2[0]);
step /= sizeof(dst[0]);
Mul_SIMD<T, WT> vop;
if( scale == (WT)1. )
{
for( ; height--; src1 += step1, src2 += step2, dst += step )
{
int i = vop(src1, src2, dst, width, scale);
#if CV_ENABLE_UNROLLED
for(; i <= width - 4; i += 4 )
{
T t0;
T t1;
t0 = saturate_cast<T>(src1[i ] * src2[i ]);
t1 = saturate_cast<T>(src1[i+1] * src2[i+1]);
dst[i ] = t0;
dst[i+1] = t1;
t0 = saturate_cast<T>(src1[i+2] * src2[i+2]);
t1 = saturate_cast<T>(src1[i+3] * src2[i+3]);
dst[i+2] = t0;
dst[i+3] = t1;
}
#endif
for( ; i < width; i++ )
dst[i] = saturate_cast<T>(src1[i] * src2[i]);
}
}
else
{
for( ; height--; src1 += step1, src2 += step2, dst += step )
{
int i = vop(src1, src2, dst, width, scale);
#if CV_ENABLE_UNROLLED
for(; i <= width - 4; i += 4 )
{
T t0 = saturate_cast<T>(scale*(WT)src1[i]*src2[i]);
T t1 = saturate_cast<T>(scale*(WT)src1[i+1]*src2[i+1]);
dst[i] = t0; dst[i+1] = t1;
t0 = saturate_cast<T>(scale*(WT)src1[i+2]*src2[i+2]);
t1 = saturate_cast<T>(scale*(WT)src1[i+3]*src2[i+3]);
dst[i+2] = t0; dst[i+3] = t1;
}
#endif
for( ; i < width; i++ )
dst[i] = saturate_cast<T>(scale*(WT)src1[i]*src2[i]);
}
}
}
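// div_i / div_f implement scaled element-wise division with OpenCV's convention that
// a zero divisor produces 0 rather than an error; the integer variant computes in
// single precision and rounds through saturate_cast, the float variant stays in the
// destination type. recip_i / recip_f below handle the scale/src2 case, where the
// first source operand is ignored.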
template<typename T> static void
div_i( const T* src1, size_t step1, const T* src2, size_t step2,
T* dst, size_t step, int width, int height, double scale )
{
step1 /= sizeof(src1[0]);
step2 /= sizeof(src2[0]);
step /= sizeof(dst[0]);
Div_SIMD<T> vop;
float scale_f = (float)scale;
for( ; height--; src1 += step1, src2 += step2, dst += step )
{
int i = vop(src1, src2, dst, width, scale);
for( ; i < width; i++ )
{
T num = src1[i], denom = src2[i];
dst[i] = denom != 0 ? saturate_cast<T>(num*scale_f/denom) : (T)0;
}
}
}
template<typename T> static void
div_f( const T* src1, size_t step1, const T* src2, size_t step2,
T* dst, size_t step, int width, int height, double scale )
{
T scale_f = (T)scale;
step1 /= sizeof(src1[0]);
step2 /= sizeof(src2[0]);
step /= sizeof(dst[0]);
Div_SIMD<T> vop;
for( ; height--; src1 += step1, src2 += step2, dst += step )
{
int i = vop(src1, src2, dst, width, scale);
for( ; i < width; i++ )
{
T num = src1[i], denom = src2[i];
dst[i] = denom != 0 ? saturate_cast<T>(num*scale_f/denom) : (T)0;
}
}
}
template<typename T> static void
recip_i( const T*, size_t, const T* src2, size_t step2,
T* dst, size_t step, int width, int height, double scale )
{
step2 /= sizeof(src2[0]);
step /= sizeof(dst[0]);
Recip_SIMD<T> vop;
float scale_f = (float)scale;
for( ; height--; src2 += step2, dst += step )
{
int i = vop(src2, dst, width, scale);
for( ; i < width; i++ )
{
T denom = src2[i];
dst[i] = denom != 0 ? saturate_cast<T>(scale_f/denom) : (T)0;
}
}
}
template<typename T> static void
recip_f( const T*, size_t, const T* src2, size_t step2,
T* dst, size_t step, int width, int height, double scale )
{
T scale_f = (T)scale;
step2 /= sizeof(src2[0]);
step /= sizeof(dst[0]);
Recip_SIMD<T> vop;
for( ; height--; src2 += step2, dst += step )
{
int i = vop(src2, dst, width, scale);
for( ; i < width; i++ )
{
T denom = src2[i];
dst[i] = denom != 0 ? saturate_cast<T>(scale_f/denom) : (T)0;
}
}
}
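// addWeighted_ computes dst = saturate(src1*alpha + src2*beta + gamma) per element.
// The coefficients arrive through the untyped _scalars pointer as an array of three
// doubles and are converted to the working type WT once per call.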
template<typename T, typename WT> static void
addWeighted_( const T* src1, size_t step1, const T* src2, size_t step2,
T* dst, size_t step, int width, int height, void* _scalars )
{
const double* scalars = (const double*)_scalars;
WT alpha = (WT)scalars[0], beta = (WT)scalars[1], gamma = (WT)scalars[2];
step1 /= sizeof(src1[0]);
step2 /= sizeof(src2[0]);
step /= sizeof(dst[0]);
AddWeighted_SIMD<T, WT> vop;
for( ; height--; src1 += step1, src2 += step2, dst += step )
{
int x = vop(src1, src2, dst, width, alpha, beta, gamma);
#if CV_ENABLE_UNROLLED
for( ; x <= width - 4; x += 4 )
{
T t0 = saturate_cast<T>(src1[x]*alpha + src2[x]*beta + gamma);
T t1 = saturate_cast<T>(src1[x+1]*alpha + src2[x+1]*beta + gamma);
dst[x] = t0; dst[x+1] = t1;
t0 = saturate_cast<T>(src1[x+2]*alpha + src2[x+2]*beta + gamma);
t1 = saturate_cast<T>(src1[x+3]*alpha + src2[x+3]*beta + gamma);
dst[x+2] = t0; dst[x+3] = t1;
}
#endif
for( ; x < width; x++ )
dst[x] = saturate_cast<T>(src1[x]*alpha + src2[x]*beta + gamma);
}
}
} // cv::
#endif // __OPENCV_ARITHM_CORE_HPP__

File diff suppressed because it is too large


@@ -0,0 +1,228 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
// Copyright (C) 2015, Itseez Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#ifndef __OPENCV_CORE_HAL_REPLACEMENT_HPP__
#define __OPENCV_CORE_HAL_REPLACEMENT_HPP__
#include "opencv2/core/hal/interface.h"
inline int hal_ni_add8u(const uchar*, size_t, const uchar*, size_t, uchar*, size_t, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
inline int hal_ni_add8s(const schar*, size_t, const schar*, size_t, schar*, size_t, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
inline int hal_ni_add16u(const ushort*, size_t, const ushort*, size_t, ushort*, size_t, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
inline int hal_ni_add16s(const short*, size_t, const short*, size_t, short*, size_t, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
inline int hal_ni_add32s(const int*, size_t, const int*, size_t, int*, size_t, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
inline int hal_ni_add32f(const float*, size_t, const float*, size_t, float*, size_t, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
inline int hal_ni_add64f(const double*, size_t, const double*, size_t, double*, size_t, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
inline int hal_ni_sub8u(const uchar*, size_t, const uchar*, size_t, uchar*, size_t, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
inline int hal_ni_sub8s(const schar*, size_t, const schar*, size_t, schar*, size_t, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
inline int hal_ni_sub16u(const ushort*, size_t, const ushort*, size_t, ushort*, size_t, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
inline int hal_ni_sub16s(const short*, size_t, const short*, size_t, short*, size_t, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
inline int hal_ni_sub32s(const int*, size_t, const int*, size_t, int*, size_t, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
inline int hal_ni_sub32f(const float*, size_t, const float*, size_t, float*, size_t, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
inline int hal_ni_sub64f(const double*, size_t, const double*, size_t, double*, size_t, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
inline int hal_ni_max8u(const uchar*, size_t, const uchar*, size_t, uchar*, size_t, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
inline int hal_ni_max8s(const schar*, size_t, const schar*, size_t, schar*, size_t, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
inline int hal_ni_max16u(const ushort*, size_t, const ushort*, size_t, ushort*, size_t, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
inline int hal_ni_max16s(const short*, size_t, const short*, size_t, short*, size_t, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
inline int hal_ni_max32s(const int*, size_t, const int*, size_t, int*, size_t, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
inline int hal_ni_max32f(const float*, size_t, const float*, size_t, float*, size_t, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
inline int hal_ni_max64f(const double*, size_t, const double*, size_t, double*, size_t, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
inline int hal_ni_min8u(const uchar*, size_t, const uchar*, size_t, uchar*, size_t, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
inline int hal_ni_min8s(const schar*, size_t, const schar*, size_t, schar*, size_t, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
inline int hal_ni_min16u(const ushort*, size_t, const ushort*, size_t, ushort*, size_t, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
inline int hal_ni_min16s(const short*, size_t, const short*, size_t, short*, size_t, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
inline int hal_ni_min32s(const int*, size_t, const int*, size_t, int*, size_t, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
inline int hal_ni_min32f(const float*, size_t, const float*, size_t, float*, size_t, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
inline int hal_ni_min64f(const double*, size_t, const double*, size_t, double*, size_t, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
inline int hal_ni_absdiff8u(const uchar*, size_t, const uchar*, size_t, uchar*, size_t, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
inline int hal_ni_absdiff8s(const schar*, size_t, const schar*, size_t, schar*, size_t, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
inline int hal_ni_absdiff16u(const ushort*, size_t, const ushort*, size_t, ushort*, size_t, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
inline int hal_ni_absdiff16s(const short*, size_t, const short*, size_t, short*, size_t, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
inline int hal_ni_absdiff32s(const int*, size_t, const int*, size_t, int*, size_t, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
inline int hal_ni_absdiff32f(const float*, size_t, const float*, size_t, float*, size_t, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
inline int hal_ni_absdiff64f(const double*, size_t, const double*, size_t, double*, size_t, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
inline int hal_ni_and8u(const uchar*, size_t, const uchar*, size_t, uchar*, size_t, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
inline int hal_ni_or8u(const uchar*, size_t, const uchar*, size_t, uchar*, size_t, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
inline int hal_ni_xor8u(const uchar*, size_t, const uchar*, size_t, uchar*, size_t, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
inline int hal_ni_not8u(const uchar*, size_t, uchar*, size_t, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
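// Every hal_ni_* stub above just reports CV_HAL_ERROR_NOT_IMPLEMENTED, which tells
// the caller to fall back to OpenCV's built-in implementation. The cv_hal_* macros
// below bind the public entry points to these stubs by default; an external HAL
// overrides individual macros with its own functions (see the sketch just before the
// custom_hal.hpp include at the end of this header).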
#define cv_hal_add8u hal_ni_add8u
#define cv_hal_add8s hal_ni_add8s
#define cv_hal_add16u hal_ni_add16u
#define cv_hal_add16s hal_ni_add16s
#define cv_hal_add32s hal_ni_add32s
#define cv_hal_add32f hal_ni_add32f
#define cv_hal_add64f hal_ni_add64f
#define cv_hal_sub8u hal_ni_sub8u
#define cv_hal_sub8s hal_ni_sub8s
#define cv_hal_sub16u hal_ni_sub16u
#define cv_hal_sub16s hal_ni_sub16s
#define cv_hal_sub32s hal_ni_sub32s
#define cv_hal_sub32f hal_ni_sub32f
#define cv_hal_sub64f hal_ni_sub64f
#define cv_hal_max8u hal_ni_max8u
#define cv_hal_max8s hal_ni_max8s
#define cv_hal_max16u hal_ni_max16u
#define cv_hal_max16s hal_ni_max16s
#define cv_hal_max32s hal_ni_max32s
#define cv_hal_max32f hal_ni_max32f
#define cv_hal_max64f hal_ni_max64f
#define cv_hal_min8u hal_ni_min8u
#define cv_hal_min8s hal_ni_min8s
#define cv_hal_min16u hal_ni_min16u
#define cv_hal_min16s hal_ni_min16s
#define cv_hal_min32s hal_ni_min32s
#define cv_hal_min32f hal_ni_min32f
#define cv_hal_min64f hal_ni_min64f
#define cv_hal_absdiff8u hal_ni_absdiff8u
#define cv_hal_absdiff8s hal_ni_absdiff8s
#define cv_hal_absdiff16u hal_ni_absdiff16u
#define cv_hal_absdiff16s hal_ni_absdiff16s
#define cv_hal_absdiff32s hal_ni_absdiff32s
#define cv_hal_absdiff32f hal_ni_absdiff32f
#define cv_hal_absdiff64f hal_ni_absdiff64f
#define cv_hal_and8u hal_ni_and8u
#define cv_hal_or8u hal_ni_or8u
#define cv_hal_xor8u hal_ni_xor8u
#define cv_hal_not8u hal_ni_not8u
inline int hal_ni_cmp8u(const uchar*, size_t, const uchar*, size_t, uchar*, size_t, int, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
inline int hal_ni_cmp8s(const schar*, size_t, const schar*, size_t, uchar*, size_t, int, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
inline int hal_ni_cmp16u(const ushort*, size_t, const ushort*, size_t, uchar*, size_t, int, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
inline int hal_ni_cmp16s(const short*, size_t, const short*, size_t, uchar*, size_t, int, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
inline int hal_ni_cmp32s(const int*, size_t, const int*, size_t, uchar*, size_t, int, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
inline int hal_ni_cmp32f(const float*, size_t, const float*, size_t, uchar*, size_t, int, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
inline int hal_ni_cmp64f(const double*, size_t, const double*, size_t, uchar*, size_t, int, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
#define cv_hal_cmp8u hal_ni_cmp8u
#define cv_hal_cmp8s hal_ni_cmp8s
#define cv_hal_cmp16u hal_ni_cmp16u
#define cv_hal_cmp16s hal_ni_cmp16s
#define cv_hal_cmp32s hal_ni_cmp32s
#define cv_hal_cmp32f hal_ni_cmp32f
#define cv_hal_cmp64f hal_ni_cmp64f
inline int hal_ni_mul8u(const uchar*, size_t, const uchar*, size_t, uchar*, size_t, int, int, double) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
inline int hal_ni_mul8s(const schar*, size_t, const schar*, size_t, schar*, size_t, int, int, double) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
inline int hal_ni_mul16u(const ushort*, size_t, const ushort*, size_t, ushort*, size_t, int, int, double) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
inline int hal_ni_mul16s(const short*, size_t, const short*, size_t, short*, size_t, int, int, double) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
inline int hal_ni_mul32s(const int*, size_t, const int*, size_t, int*, size_t, int, int, double) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
inline int hal_ni_mul32f(const float*, size_t, const float*, size_t, float*, size_t, int, int, double) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
inline int hal_ni_mul64f(const double*, size_t, const double*, size_t, double*, size_t, int, int, double) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
inline int hal_ni_div8u(const uchar*, size_t, const uchar*, size_t, uchar*, size_t, int, int, double) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
inline int hal_ni_div8s(const schar*, size_t, const schar*, size_t, schar*, size_t, int, int, double) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
inline int hal_ni_div16u(const ushort*, size_t, const ushort*, size_t, ushort*, size_t, int, int, double) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
inline int hal_ni_div16s(const short*, size_t, const short*, size_t, short*, size_t, int, int, double) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
inline int hal_ni_div32s(const int*, size_t, const int*, size_t, int*, size_t, int, int, double) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
inline int hal_ni_div32f(const float*, size_t, const float*, size_t, float*, size_t, int, int, double) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
inline int hal_ni_div64f(const double*, size_t, const double*, size_t, double*, size_t, int, int, double) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
inline int hal_ni_recip8u(const uchar*, size_t, const uchar*, size_t, uchar*, size_t, int, int, double) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
inline int hal_ni_recip8s(const schar*, size_t, const schar*, size_t, schar*, size_t, int, int, double) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
inline int hal_ni_recip16u(const ushort*, size_t, const ushort*, size_t, ushort*, size_t, int, int, double) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
inline int hal_ni_recip16s(const short*, size_t, const short*, size_t, short*, size_t, int, int, double) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
inline int hal_ni_recip32s(const int*, size_t, const int*, size_t, int*, size_t, int, int, double) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
inline int hal_ni_recip32f(const float*, size_t, const float*, size_t, float*, size_t, int, int, double) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
inline int hal_ni_recip64f(const double*, size_t, const double*, size_t, double*, size_t, int, int, double) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
#define cv_hal_mul8u hal_ni_mul8u
#define cv_hal_mul8s hal_ni_mul8s
#define cv_hal_mul16u hal_ni_mul16u
#define cv_hal_mul16s hal_ni_mul16s
#define cv_hal_mul32s hal_ni_mul32s
#define cv_hal_mul32f hal_ni_mul32f
#define cv_hal_mul64f hal_ni_mul64f
#define cv_hal_div8u hal_ni_div8u
#define cv_hal_div8s hal_ni_div8s
#define cv_hal_div16u hal_ni_div16u
#define cv_hal_div16s hal_ni_div16s
#define cv_hal_div32s hal_ni_div32s
#define cv_hal_div32f hal_ni_div32f
#define cv_hal_div64f hal_ni_div64f
#define cv_hal_recip8u hal_ni_recip8u
#define cv_hal_recip8s hal_ni_recip8s
#define cv_hal_recip16u hal_ni_recip16u
#define cv_hal_recip16s hal_ni_recip16s
#define cv_hal_recip32s hal_ni_recip32s
#define cv_hal_recip32f hal_ni_recip32f
#define cv_hal_recip64f hal_ni_recip64f
inline int hal_ni_addWeighted8u(const uchar*, size_t, const uchar*, size_t, uchar*, size_t, int, int, const double*) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
inline int hal_ni_addWeighted8s(const schar*, size_t, const schar*, size_t, schar*, size_t, int, int, const double*) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
inline int hal_ni_addWeighted16u(const ushort*, size_t, const ushort*, size_t, ushort*, size_t, int, int, const double*) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
inline int hal_ni_addWeighted16s(const short*, size_t, const short*, size_t, short*, size_t, int, int, const double*) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
inline int hal_ni_addWeighted32s(const int*, size_t, const int*, size_t, int*, size_t, int, int, const double*) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
inline int hal_ni_addWeighted32f(const float*, size_t, const float*, size_t, float*, size_t, int, int, const double*) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
inline int hal_ni_addWeighted64f(const double*, size_t, const double*, size_t, double*, size_t, int, int, const double*) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
#define cv_hal_addWeighted8u hal_ni_addWeighted8u
#define cv_hal_addWeighted8s hal_ni_addWeighted8s
#define cv_hal_addWeighted16u hal_ni_addWeighted16u
#define cv_hal_addWeighted16s hal_ni_addWeighted16s
#define cv_hal_addWeighted32s hal_ni_addWeighted32s
#define cv_hal_addWeighted32f hal_ni_addWeighted32f
#define cv_hal_addWeighted64f hal_ni_addWeighted64f
inline int hal_ni_split8u(const uchar*, uchar**, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
inline int hal_ni_split16u(const ushort*, ushort**, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
inline int hal_ni_split32s(const int*, int**, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
inline int hal_ni_split64s(const int64*, int64**, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
#define cv_hal_split8u hal_ni_split8u
#define cv_hal_split16u hal_ni_split16u
#define cv_hal_split32s hal_ni_split32s
#define cv_hal_split64s hal_ni_split64s
inline int hal_ni_merge8u(const uchar**, uchar*, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
inline int hal_ni_merge16u(const ushort**, ushort*, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
inline int hal_ni_merge32s(const int**, int*, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
inline int hal_ni_merge64s(const int64**, int64*, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
#define cv_hal_merge8u hal_ni_merge8u
#define cv_hal_merge16u hal_ni_merge16u
#define cv_hal_merge32s hal_ni_merge32s
#define cv_hal_merge64s hal_ni_merge64s
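// custom_hal.hpp, included next, is the hook through which an external HAL redefines
// any of the cv_hal_* macros above. A minimal sketch of such a replacement (the name
// my_hal_add8u is hypothetical; steps are in bytes, as in the stubs):
//
//   inline int my_hal_add8u(const uchar* src1, size_t step1, const uchar* src2,
//                           size_t step2, uchar* dst, size_t step, int w, int h)
//   {
//       for (int y = 0; y < h; y++, src1 += step1, src2 += step2, dst += step)
//           for (int x = 0; x < w; x++)
//           {
//               int v = src1[x] + src2[x];             // saturating 8-bit addition
//               dst[x] = (uchar)(v > 255 ? 255 : v);
//           }
//       return CV_HAL_ERROR_OK;
//   }
//   #undef cv_hal_add8u
//   #define cv_hal_add8u my_hal_add8u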
#include "custom_hal.hpp"
#endif


@@ -52,22 +52,22 @@ namespace cv
int LU(float* A, size_t astep, int m, float* b, size_t bstep, int n)
{
return hal::LU(A, astep, m, b, bstep, n);
return hal::LU32f(A, astep, m, b, bstep, n);
}
int LU(double* A, size_t astep, int m, double* b, size_t bstep, int n)
{
return hal::LU(A, astep, m, b, bstep, n);
return hal::LU64f(A, astep, m, b, bstep, n);
}
bool Cholesky(float* A, size_t astep, int m, float* b, size_t bstep, int n)
{
return hal::Cholesky(A, astep, m, b, bstep, n);
return hal::Cholesky32f(A, astep, m, b, bstep, n);
}
bool Cholesky(double* A, size_t astep, int m, double* b, size_t bstep, int n)
{
return hal::Cholesky(A, astep, m, b, bstep, n);
return hal::Cholesky64f(A, astep, m, b, bstep, n);
}
template<typename _Tp> static inline _Tp hypot(_Tp a, _Tp b)
@@ -740,7 +740,7 @@ double cv::determinant( InputArray _mat )
Mat a(rows, rows, CV_32F, (uchar*)buffer);
mat.copyTo(a);
result = hal::LU(a.ptr<float>(), a.step, rows, 0, 0, 0);
result = hal::LU32f(a.ptr<float>(), a.step, rows, 0, 0, 0);
if( result )
{
for( int i = 0; i < rows; i++ )
@@ -764,7 +764,7 @@ double cv::determinant( InputArray _mat )
Mat a(rows, rows, CV_64F, (uchar*)buffer);
mat.copyTo(a);
result = hal::LU(a.ptr<double>(), a.step, rows, 0, 0, 0);
result = hal::LU64f(a.ptr<double>(), a.step, rows, 0, 0, 0);
if( result )
{
for( int i = 0; i < rows; i++ )
@@ -1027,13 +1027,13 @@ double cv::invert( InputArray _src, OutputArray _dst, int method )
setIdentity(dst);
if( method == DECOMP_LU && type == CV_32F )
result = hal::LU(src1.ptr<float>(), src1.step, n, dst.ptr<float>(), dst.step, n) != 0;
result = hal::LU32f(src1.ptr<float>(), src1.step, n, dst.ptr<float>(), dst.step, n) != 0;
else if( method == DECOMP_LU && type == CV_64F )
result = hal::LU(src1.ptr<double>(), src1.step, n, dst.ptr<double>(), dst.step, n) != 0;
result = hal::LU64f(src1.ptr<double>(), src1.step, n, dst.ptr<double>(), dst.step, n) != 0;
else if( method == DECOMP_CHOLESKY && type == CV_32F )
result = hal::Cholesky(src1.ptr<float>(), src1.step, n, dst.ptr<float>(), dst.step, n);
result = hal::Cholesky32f(src1.ptr<float>(), src1.step, n, dst.ptr<float>(), dst.step, n);
else
result = hal::Cholesky(src1.ptr<double>(), src1.step, n, dst.ptr<double>(), dst.step, n);
result = hal::Cholesky64f(src1.ptr<double>(), src1.step, n, dst.ptr<double>(), dst.step, n);
if( !result )
dst = Scalar(0);
@@ -1265,16 +1265,16 @@ bool cv::solve( InputArray _src, InputArray _src2arg, OutputArray _dst, int meth
if( method == DECOMP_LU )
{
if( type == CV_32F )
result = hal::LU(a.ptr<float>(), a.step, n, dst.ptr<float>(), dst.step, nb) != 0;
result = hal::LU32f(a.ptr<float>(), a.step, n, dst.ptr<float>(), dst.step, nb) != 0;
else
result = hal::LU(a.ptr<double>(), a.step, n, dst.ptr<double>(), dst.step, nb) != 0;
result = hal::LU64f(a.ptr<double>(), a.step, n, dst.ptr<double>(), dst.step, nb) != 0;
}
else if( method == DECOMP_CHOLESKY )
{
if( type == CV_32F )
result = hal::Cholesky(a.ptr<float>(), a.step, n, dst.ptr<float>(), dst.step, nb);
result = hal::Cholesky32f(a.ptr<float>(), a.step, n, dst.ptr<float>(), dst.step, nb);
else
result = hal::Cholesky(a.ptr<double>(), a.step, n, dst.ptr<double>(), dst.step, nb);
result = hal::Cholesky64f(a.ptr<double>(), a.step, n, dst.ptr<double>(), dst.step, nb);
}
else
{


@@ -191,13 +191,13 @@ void magnitude( InputArray src1, InputArray src2, OutputArray dst )
{
const float *x = (const float*)ptrs[0], *y = (const float*)ptrs[1];
float *mag = (float*)ptrs[2];
hal::magnitude( x, y, mag, len );
hal::magnitude32f( x, y, mag, len );
}
else
{
const double *x = (const double*)ptrs[0], *y = (const double*)ptrs[1];
double *mag = (double*)ptrs[2];
hal::magnitude( x, y, mag, len );
hal::magnitude64f( x, y, mag, len );
}
}
}
@@ -374,7 +374,7 @@ void cartToPolar( InputArray src1, InputArray src2,
{
const float *x = (const float*)ptrs[0], *y = (const float*)ptrs[1];
float *mag = (float*)ptrs[2], *angle = (float*)ptrs[3];
hal::magnitude( x, y, mag, len );
hal::magnitude32f( x, y, mag, len );
hal::fastAtan2( y, x, angle, len, angleInDegrees );
}
else
@@ -382,7 +382,7 @@ void cartToPolar( InputArray src1, InputArray src2,
const double *x = (const double*)ptrs[0], *y = (const double*)ptrs[1];
double *angle = (double*)ptrs[3];
hal::magnitude(x, y, (double*)ptrs[2], len);
hal::magnitude64f(x, y, (double*)ptrs[2], len);
k = 0;
#if CV_SSE2
@@ -760,7 +760,7 @@ static void Exp_32f_ipp(const float *x, float *y, int n)
}
setIppErrorStatus();
}
hal::exp(x, y, n);
hal::exp32f(x, y, n);
}
static void Exp_64f_ipp(const double *x, double *y, int n)
@@ -774,14 +774,14 @@ static void Exp_64f_ipp(const double *x, double *y, int n)
}
setIppErrorStatus();
}
hal::exp(x, y, n);
hal::exp64f(x, y, n);
}
#define Exp_32f Exp_32f_ipp
#define Exp_64f Exp_64f_ipp
#else
#define Exp_32f hal::exp
#define Exp_64f hal::exp
#define Exp_32f hal::exp32f
#define Exp_64f hal::exp64f
#endif
@@ -828,7 +828,7 @@ static void Log_32f_ipp(const float *x, float *y, int n)
}
setIppErrorStatus();
}
hal::log(x, y, n);
hal::log32f(x, y, n);
}
static void Log_64f_ipp(const double *x, double *y, int n)
@@ -842,14 +842,14 @@ static void Log_64f_ipp(const double *x, double *y, int n)
}
setIppErrorStatus();
}
hal::log(x, y, n);
hal::log64f(x, y, n);
}
#define Log_32f Log_32f_ipp
#define Log_64f Log_64f_ipp
#else
#define Log_32f hal::log
#define Log_64f hal::log
#define Log_32f hal::log32f
#define Log_64f hal::log64f
#endif
void log( InputArray _src, OutputArray _dst )
@@ -1356,10 +1356,10 @@ static bool ocl_pow(InputArray _src, double power, OutputArray _dst,
#endif
static void InvSqrt_32f(const float* src, float* dst, int n) { hal::invSqrt(src, dst, n); }
static void InvSqrt_64f(const double* src, double* dst, int n) { hal::invSqrt(src, dst, n); }
static void Sqrt_32f(const float* src, float* dst, int n) { hal::sqrt(src, dst, n); }
static void Sqrt_64f(const double* src, double* dst, int n) { hal::sqrt(src, dst, n); }
static void InvSqrt_32f(const float* src, float* dst, int n) { hal::invSqrt32f(src, dst, n); }
static void InvSqrt_64f(const double* src, double* dst, int n) { hal::invSqrt64f(src, dst, n); }
static void Sqrt_32f(const float* src, float* dst, int n) { hal::sqrt32f(src, dst, n); }
static void Sqrt_64f(const double* src, double* dst, int n) { hal::sqrt64f(src, dst, n); }
void pow( InputArray _src, double power, OutputArray _dst )
{

File diff suppressed because it is too large


@@ -0,0 +1,231 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009-2011, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "precomp.hpp"
namespace cv { namespace hal {
/****************************************************************************************\
* LU & Cholesky implementation for small matrices *
\****************************************************************************************/
template<typename _Tp> static inline int
LUImpl(_Tp* A, size_t astep, int m, _Tp* b, size_t bstep, int n, _Tp eps)
{
int i, j, k, p = 1;
astep /= sizeof(A[0]);
bstep /= sizeof(b[0]);
for( i = 0; i < m; i++ )
{
k = i;
for( j = i+1; j < m; j++ )
if( std::abs(A[j*astep + i]) > std::abs(A[k*astep + i]) )
k = j;
if( std::abs(A[k*astep + i]) < eps )
return 0;
if( k != i )
{
for( j = i; j < m; j++ )
std::swap(A[i*astep + j], A[k*astep + j]);
if( b )
for( j = 0; j < n; j++ )
std::swap(b[i*bstep + j], b[k*bstep + j]);
p = -p;
}
_Tp d = -1/A[i*astep + i];
for( j = i+1; j < m; j++ )
{
_Tp alpha = A[j*astep + i]*d;
for( k = i+1; k < m; k++ )
A[j*astep + k] += alpha*A[i*astep + k];
if( b )
for( k = 0; k < n; k++ )
b[j*bstep + k] += alpha*b[i*bstep + k];
}
A[i*astep + i] = -d;
}
if( b )
{
for( i = m-1; i >= 0; i-- )
for( j = 0; j < n; j++ )
{
_Tp s = b[i*bstep + j];
for( k = i+1; k < m; k++ )
s -= A[i*astep + k]*b[k*bstep + j];
b[i*bstep + j] = s*A[i*astep + i];
}
}
return p;
}
int LU32f(float* A, size_t astep, int m, float* b, size_t bstep, int n)
{
return LUImpl(A, astep, m, b, bstep, n, FLT_EPSILON*10);
}
int LU64f(double* A, size_t astep, int m, double* b, size_t bstep, int n)
{
return LUImpl(A, astep, m, b, bstep, n, DBL_EPSILON*100);
}
template<typename _Tp> static inline bool
CholImpl(_Tp* A, size_t astep, int m, _Tp* b, size_t bstep, int n)
{
_Tp* L = A;
int i, j, k;
double s;
astep /= sizeof(A[0]);
bstep /= sizeof(b[0]);
for( i = 0; i < m; i++ )
{
for( j = 0; j < i; j++ )
{
s = A[i*astep + j];
for( k = 0; k < j; k++ )
s -= L[i*astep + k]*L[j*astep + k];
L[i*astep + j] = (_Tp)(s*L[j*astep + j]);
}
s = A[i*astep + i];
for( k = 0; k < j; k++ )
{
double t = L[i*astep + k];
s -= t*t;
}
if( s < std::numeric_limits<_Tp>::epsilon() )
return false;
L[i*astep + i] = (_Tp)(1./std::sqrt(s));
}
if( !b )
return true;
// LLt x = b
// 1: L y = b
// 2. Lt x = y
/*
[ L00 ] y0 b0
[ L10 L11 ] y1 = b1
[ L20 L21 L22 ] y2 b2
[ L30 L31 L32 L33 ] y3 b3
[ L00 L10 L20 L30 ] x0 y0
[ L11 L21 L31 ] x1 = y1
[ L22 L32 ] x2 y2
[ L33 ] x3 y3
*/
for( i = 0; i < m; i++ )
{
for( j = 0; j < n; j++ )
{
s = b[i*bstep + j];
for( k = 0; k < i; k++ )
s -= L[i*astep + k]*b[k*bstep + j];
b[i*bstep + j] = (_Tp)(s*L[i*astep + i]);
}
}
for( i = m-1; i >= 0; i-- )
{
for( j = 0; j < n; j++ )
{
s = b[i*bstep + j];
for( k = m-1; k > i; k-- )
s -= L[k*astep + i]*b[k*bstep + j];
b[i*bstep + j] = (_Tp)(s*L[i*astep + i]);
}
}
return true;
}
bool Cholesky32f(float* A, size_t astep, int m, float* b, size_t bstep, int n)
{
return CholImpl(A, astep, m, b, bstep, n);
}
bool Cholesky64f(double* A, size_t astep, int m, double* b, size_t bstep, int n)
{
return CholImpl(A, astep, m, b, bstep, n);
}
//=============================================================================
// for compatibility with 3.0
int LU(float* A, size_t astep, int m, float* b, size_t bstep, int n)
{
return LUImpl(A, astep, m, b, bstep, n, FLT_EPSILON*10);
}
int LU(double* A, size_t astep, int m, double* b, size_t bstep, int n)
{
return LUImpl(A, astep, m, b, bstep, n, DBL_EPSILON*100);
}
bool Cholesky(float* A, size_t astep, int m, float* b, size_t bstep, int n)
{
return CholImpl(A, astep, m, b, bstep, n);
}
bool Cholesky(double* A, size_t astep, int m, double* b, size_t bstep, int n)
{
return CholImpl(A, astep, m, b, bstep, n);
}
}}
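
For reference, a minimal sketch of how the new typed decomposition entry points are called (the include path, matrix values and strides below are illustrative assumptions, not part of the patch; Cholesky32f/Cholesky64f follow the same calling convention):

// Sketch: solving a small 3x3 system A*x = b with the new typed entry point.
// Strides are row strides in bytes; A and b are overwritten in place.
#include "opencv2/core/hal/hal.hpp"
#include <cstdio>

int main()
{
    float A[3*3] = { 4, 1, 0,
                     1, 3, 1,
                     0, 1, 2 };   // overwritten by the LU factors
    float b[3]   = { 1, 2, 3 };   // overwritten by the solution x
    int nonsingular = cv::hal::LU32f(A, 3*sizeof(float), 3, b, sizeof(float), 1);
    if (nonsingular)
        printf("x = %f %f %f\n", b[0], b[1], b[2]);
    return 0;
}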

modules/core/src/merge.cpp (new file, 412 lines)
View File

@@ -0,0 +1,412 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009-2011, Willow Garage Inc., all rights reserved.
// Copyright (C) 2014-2015, Itseez Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "precomp.hpp"
namespace cv { namespace hal {
#if CV_NEON
template<typename T> struct VMerge2;
template<typename T> struct VMerge3;
template<typename T> struct VMerge4;
#define MERGE2_KERNEL_TEMPLATE(name, data_type, reg_type, load_func, store_func) \
template<> \
struct name<data_type>{ \
void operator()(const data_type* src0, const data_type* src1, \
data_type* dst){ \
reg_type r; \
r.val[0] = load_func(src0); \
r.val[1] = load_func(src1); \
store_func(dst, r); \
} \
}
#define MERGE3_KERNEL_TEMPLATE(name, data_type, reg_type, load_func, store_func) \
template<> \
struct name<data_type>{ \
void operator()(const data_type* src0, const data_type* src1, \
const data_type* src2, data_type* dst){ \
reg_type r; \
r.val[0] = load_func(src0); \
r.val[1] = load_func(src1); \
r.val[2] = load_func(src2); \
store_func(dst, r); \
} \
}
#define MERGE4_KERNEL_TEMPLATE(name, data_type, reg_type, load_func, store_func) \
template<> \
struct name<data_type>{ \
void operator()(const data_type* src0, const data_type* src1, \
const data_type* src2, const data_type* src3, \
data_type* dst){ \
reg_type r; \
r.val[0] = load_func(src0); \
r.val[1] = load_func(src1); \
r.val[2] = load_func(src2); \
r.val[3] = load_func(src3); \
store_func(dst, r); \
} \
}
MERGE2_KERNEL_TEMPLATE(VMerge2, uchar , uint8x16x2_t, vld1q_u8 , vst2q_u8 );
MERGE2_KERNEL_TEMPLATE(VMerge2, ushort, uint16x8x2_t, vld1q_u16, vst2q_u16);
MERGE2_KERNEL_TEMPLATE(VMerge2, int , int32x4x2_t, vld1q_s32, vst2q_s32);
MERGE2_KERNEL_TEMPLATE(VMerge2, int64 , int64x1x2_t, vld1_s64 , vst2_s64 );
MERGE3_KERNEL_TEMPLATE(VMerge3, uchar , uint8x16x3_t, vld1q_u8 , vst3q_u8 );
MERGE3_KERNEL_TEMPLATE(VMerge3, ushort, uint16x8x3_t, vld1q_u16, vst3q_u16);
MERGE3_KERNEL_TEMPLATE(VMerge3, int , int32x4x3_t, vld1q_s32, vst3q_s32);
MERGE3_KERNEL_TEMPLATE(VMerge3, int64 , int64x1x3_t, vld1_s64 , vst3_s64 );
MERGE4_KERNEL_TEMPLATE(VMerge4, uchar , uint8x16x4_t, vld1q_u8 , vst4q_u8 );
MERGE4_KERNEL_TEMPLATE(VMerge4, ushort, uint16x8x4_t, vld1q_u16, vst4q_u16);
MERGE4_KERNEL_TEMPLATE(VMerge4, int , int32x4x4_t, vld1q_s32, vst4q_s32);
MERGE4_KERNEL_TEMPLATE(VMerge4, int64 , int64x1x4_t, vld1_s64 , vst4_s64 );
#elif CV_SSE2
template <typename T>
struct VMerge2
{
VMerge2() : support(false) { }
void operator()(const T *, const T *, T *) const { }
bool support;
};
template <typename T>
struct VMerge3
{
VMerge3() : support(false) { }
void operator()(const T *, const T *, const T *, T *) const { }
bool support;
};
template <typename T>
struct VMerge4
{
VMerge4() : support(false) { }
void operator()(const T *, const T *, const T *, const T *, T *) const { }
bool support;
};
#define MERGE2_KERNEL_TEMPLATE(data_type, reg_type, cast_type, _mm_interleave, flavor, se) \
template <> \
struct VMerge2<data_type> \
{ \
enum \
{ \
ELEMS_IN_VEC = 16 / sizeof(data_type) \
}; \
\
VMerge2() \
{ \
support = checkHardwareSupport(se); \
} \
\
void operator()(const data_type * src0, const data_type * src1, \
data_type * dst) const \
{ \
reg_type v_src0 = _mm_loadu_##flavor((const cast_type *)(src0)); \
reg_type v_src1 = _mm_loadu_##flavor((const cast_type *)(src0 + ELEMS_IN_VEC)); \
reg_type v_src2 = _mm_loadu_##flavor((const cast_type *)(src1)); \
reg_type v_src3 = _mm_loadu_##flavor((const cast_type *)(src1 + ELEMS_IN_VEC)); \
\
_mm_interleave(v_src0, v_src1, v_src2, v_src3); \
\
_mm_storeu_##flavor((cast_type *)(dst), v_src0); \
_mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC), v_src1); \
_mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 2), v_src2); \
_mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 3), v_src3); \
} \
\
bool support; \
}
#define MERGE3_KERNEL_TEMPLATE(data_type, reg_type, cast_type, _mm_interleave, flavor, se) \
template <> \
struct VMerge3<data_type> \
{ \
enum \
{ \
ELEMS_IN_VEC = 16 / sizeof(data_type) \
}; \
\
VMerge3() \
{ \
support = checkHardwareSupport(se); \
} \
\
void operator()(const data_type * src0, const data_type * src1, const data_type * src2,\
data_type * dst) const \
{ \
reg_type v_src0 = _mm_loadu_##flavor((const cast_type *)(src0)); \
reg_type v_src1 = _mm_loadu_##flavor((const cast_type *)(src0 + ELEMS_IN_VEC)); \
reg_type v_src2 = _mm_loadu_##flavor((const cast_type *)(src1)); \
reg_type v_src3 = _mm_loadu_##flavor((const cast_type *)(src1 + ELEMS_IN_VEC)); \
reg_type v_src4 = _mm_loadu_##flavor((const cast_type *)(src2)); \
reg_type v_src5 = _mm_loadu_##flavor((const cast_type *)(src2 + ELEMS_IN_VEC)); \
\
_mm_interleave(v_src0, v_src1, v_src2, \
v_src3, v_src4, v_src5); \
\
_mm_storeu_##flavor((cast_type *)(dst), v_src0); \
_mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC), v_src1); \
_mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 2), v_src2); \
_mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 3), v_src3); \
_mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 4), v_src4); \
_mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 5), v_src5); \
} \
\
bool support; \
}
#define MERGE4_KERNEL_TEMPLATE(data_type, reg_type, cast_type, _mm_interleave, flavor, se) \
template <> \
struct VMerge4<data_type> \
{ \
enum \
{ \
ELEMS_IN_VEC = 16 / sizeof(data_type) \
}; \
\
VMerge4() \
{ \
support = checkHardwareSupport(se); \
} \
\
void operator()(const data_type * src0, const data_type * src1, \
const data_type * src2, const data_type * src3, \
data_type * dst) const \
{ \
reg_type v_src0 = _mm_loadu_##flavor((const cast_type *)(src0)); \
reg_type v_src1 = _mm_loadu_##flavor((const cast_type *)(src0 + ELEMS_IN_VEC)); \
reg_type v_src2 = _mm_loadu_##flavor((const cast_type *)(src1)); \
reg_type v_src3 = _mm_loadu_##flavor((const cast_type *)(src1 + ELEMS_IN_VEC)); \
reg_type v_src4 = _mm_loadu_##flavor((const cast_type *)(src2)); \
reg_type v_src5 = _mm_loadu_##flavor((const cast_type *)(src2 + ELEMS_IN_VEC)); \
reg_type v_src6 = _mm_loadu_##flavor((const cast_type *)(src3)); \
reg_type v_src7 = _mm_loadu_##flavor((const cast_type *)(src3 + ELEMS_IN_VEC)); \
\
_mm_interleave(v_src0, v_src1, v_src2, v_src3, \
v_src4, v_src5, v_src6, v_src7); \
\
_mm_storeu_##flavor((cast_type *)(dst), v_src0); \
_mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC), v_src1); \
_mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 2), v_src2); \
_mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 3), v_src3); \
_mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 4), v_src4); \
_mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 5), v_src5); \
_mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 6), v_src6); \
_mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 7), v_src7); \
} \
\
bool support; \
}
MERGE2_KERNEL_TEMPLATE( uchar, __m128i, __m128i, _mm_interleave_epi8, si128, CV_CPU_SSE2);
MERGE3_KERNEL_TEMPLATE( uchar, __m128i, __m128i, _mm_interleave_epi8, si128, CV_CPU_SSE2);
MERGE4_KERNEL_TEMPLATE( uchar, __m128i, __m128i, _mm_interleave_epi8, si128, CV_CPU_SSE2);
#if CV_SSE4_1
MERGE2_KERNEL_TEMPLATE(ushort, __m128i, __m128i, _mm_interleave_epi16, si128, CV_CPU_SSE4_1);
MERGE3_KERNEL_TEMPLATE(ushort, __m128i, __m128i, _mm_interleave_epi16, si128, CV_CPU_SSE4_1);
MERGE4_KERNEL_TEMPLATE(ushort, __m128i, __m128i, _mm_interleave_epi16, si128, CV_CPU_SSE4_1);
#endif
MERGE2_KERNEL_TEMPLATE( int, __m128, float, _mm_interleave_ps, ps, CV_CPU_SSE2);
MERGE3_KERNEL_TEMPLATE( int, __m128, float, _mm_interleave_ps, ps, CV_CPU_SSE2);
MERGE4_KERNEL_TEMPLATE( int, __m128, float, _mm_interleave_ps, ps, CV_CPU_SSE2);
#endif
template<typename T> static void
merge_( const T** src, T* dst, int len, int cn )
{
int k = cn % 4 ? cn % 4 : 4;
int i, j;
if( k == 1 )
{
const T* src0 = src[0];
for( i = j = 0; i < len; i++, j += cn )
dst[j] = src0[i];
}
else if( k == 2 )
{
const T *src0 = src[0], *src1 = src[1];
i = j = 0;
#if CV_NEON
if(cn == 2)
{
int inc_i = (sizeof(T) == 8)? 1: 16/sizeof(T);
int inc_j = 2 * inc_i;
VMerge2<T> vmerge;
for( ; i < len - inc_i; i += inc_i, j += inc_j)
vmerge(src0 + i, src1 + i, dst + j);
}
#elif CV_SSE2
if(cn == 2)
{
int inc_i = 32/sizeof(T);
int inc_j = 2 * inc_i;
VMerge2<T> vmerge;
if (vmerge.support)
for( ; i < len - inc_i; i += inc_i, j += inc_j)
vmerge(src0 + i, src1 + i, dst + j);
}
#endif
for( ; i < len; i++, j += cn )
{
dst[j] = src0[i];
dst[j+1] = src1[i];
}
}
else if( k == 3 )
{
const T *src0 = src[0], *src1 = src[1], *src2 = src[2];
i = j = 0;
#if CV_NEON
if(cn == 3)
{
int inc_i = (sizeof(T) == 8)? 1: 16/sizeof(T);
int inc_j = 3 * inc_i;
VMerge3<T> vmerge;
for( ; i < len - inc_i; i += inc_i, j += inc_j)
vmerge(src0 + i, src1 + i, src2 + i, dst + j);
}
#elif CV_SSE2
if(cn == 3)
{
int inc_i = 32/sizeof(T);
int inc_j = 3 * inc_i;
VMerge3<T> vmerge;
if (vmerge.support)
for( ; i < len - inc_i; i += inc_i, j += inc_j)
vmerge(src0 + i, src1 + i, src2 + i, dst + j);
}
#endif
for( ; i < len; i++, j += cn )
{
dst[j] = src0[i];
dst[j+1] = src1[i];
dst[j+2] = src2[i];
}
}
else
{
const T *src0 = src[0], *src1 = src[1], *src2 = src[2], *src3 = src[3];
i = j = 0;
#if CV_NEON
if(cn == 4)
{
int inc_i = (sizeof(T) == 8)? 1: 16/sizeof(T);
int inc_j = 4 * inc_i;
VMerge4<T> vmerge;
for( ; i < len - inc_i; i += inc_i, j += inc_j)
vmerge(src0 + i, src1 + i, src2 + i, src3 + i, dst + j);
}
#elif CV_SSE2
if(cn == 4)
{
int inc_i = 32/sizeof(T);
int inc_j = 4 * inc_i;
VMerge4<T> vmerge;
if (vmerge.support)
for( ; i < len - inc_i; i += inc_i, j += inc_j)
vmerge(src0 + i, src1 + i, src2 + i, src3 + i, dst + j);
}
#endif
for( ; i < len; i++, j += cn )
{
dst[j] = src0[i]; dst[j+1] = src1[i];
dst[j+2] = src2[i]; dst[j+3] = src3[i];
}
}
for( ; k < cn; k += 4 )
{
const T *src0 = src[k], *src1 = src[k+1], *src2 = src[k+2], *src3 = src[k+3];
for( i = 0, j = k; i < len; i++, j += cn )
{
dst[j] = src0[i]; dst[j+1] = src1[i];
dst[j+2] = src2[i]; dst[j+3] = src3[i];
}
}
}
void merge8u(const uchar** src, uchar* dst, int len, int cn )
{
CALL_HAL(merge8u, cv_hal_merge8u, src, dst, len, cn)
merge_(src, dst, len, cn);
}
void merge16u(const ushort** src, ushort* dst, int len, int cn )
{
CALL_HAL(merge16u, cv_hal_merge16u, src, dst, len, cn)
merge_(src, dst, len, cn);
}
void merge32s(const int** src, int* dst, int len, int cn )
{
CALL_HAL(merge32s, cv_hal_merge32s, src, dst, len, cn)
merge_(src, dst, len, cn);
}
void merge64s(const int64** src, int64* dst, int len, int cn )
{
CALL_HAL(merge64s, cv_hal_merge64s, src, dst, len, cn)
merge_(src, dst, len, cn);
}
}}
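
A minimal usage sketch for the relocated channel-merge kernel; the helper name, include path and buffer handling are assumptions for illustration:

// Sketch: interleaving three 8-bit planes into a packed BGR buffer with the
// HAL entry point defined above. dst must hold len*3 bytes.
#include "opencv2/core/hal/hal.hpp"

void interleave_bgr(const uchar* b, const uchar* g, const uchar* r,
                    uchar* bgr, int len)
{
    const uchar* planes[3] = { b, g, r };
    cv::hal::merge8u(planes, bgr, len, 3);
}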

View File

@@ -58,8 +58,6 @@
#include "opencv2/core/ocl.hpp"
#endif
#include "opencv2/hal.hpp"
#include <assert.h>
#include <ctype.h>
#include <float.h>
@@ -69,6 +67,27 @@
#include <stdlib.h>
#include <string.h>
#include <algorithm>
#include <cmath>
#include <cstdlib>
#include <limits>
#include <float.h>
#include <cstring>
#include <cassert>
#define USE_SSE2 (cv::checkHardwareSupport(CV_CPU_SSE))
#define USE_SSE4_2 (cv::checkHardwareSupport(CV_CPU_SSE4_2))
#define USE_AVX (cv::checkHardwareSupport(CV_CPU_AVX))
#define USE_AVX2 (cv::checkHardwareSupport(CV_CPU_AVX2))
#include "opencv2/core/hal/hal.hpp"
#include "opencv2/core/hal/intrin.hpp"
#include "opencv2/core/sse_utils.hpp"
#include "opencv2/core/neon_utils.hpp"
#include "arithm_core.hpp"
#include "hal_replacement.hpp"
#ifdef HAVE_TEGRA_OPTIMIZATION
#include "opencv2/core/core_tegra.hpp"
#else
@@ -78,6 +97,34 @@
namespace cv
{
// -128.f ... 255.f
extern const float g_8x32fTab[];
#define CV_8TO32F(x) cv::g_8x32fTab[(x)+128]
extern const ushort g_8x16uSqrTab[];
#define CV_SQR_8U(x) cv::g_8x16uSqrTab[(x)+255]
extern const uchar g_Saturate8u[];
#define CV_FAST_CAST_8U(t) (assert(-256 <= (t) && (t) <= 512), cv::g_Saturate8u[(t)+256])
#define CV_MIN_8U(a,b) ((a) - CV_FAST_CAST_8U((a) - (b)))
#define CV_MAX_8U(a,b) ((a) + CV_FAST_CAST_8U((b) - (a)))
template<> inline uchar OpAdd<uchar>::operator ()(uchar a, uchar b) const
{ return CV_FAST_CAST_8U(a + b); }
template<> inline uchar OpSub<uchar>::operator ()(uchar a, uchar b) const
{ return CV_FAST_CAST_8U(a - b); }
template<> inline short OpAbsDiff<short>::operator ()(short a, short b) const
{ return saturate_cast<short>(std::abs(a - b)); }
template<> inline schar OpAbsDiff<schar>::operator ()(schar a, schar b) const
{ return saturate_cast<schar>(std::abs(a - b)); }
template<> inline uchar OpMin<uchar>::operator ()(uchar a, uchar b) const { return CV_MIN_8U(a, b); }
template<> inline uchar OpMax<uchar>::operator ()(uchar a, uchar b) const { return CV_MAX_8U(a, b); }
typedef void (*BinaryFunc)(const uchar* src1, size_t step1,
const uchar* src2, size_t step2,
uchar* dst, size_t step, Size sz,
@@ -100,21 +147,6 @@ BinaryFunc getCopyMaskFunc(size_t esz);
/* maximal average node_count/hash_size ratio beyond which hash table is resized */
#define CV_SPARSE_HASH_RATIO 3
// -128.f ... 255.f
extern const float g_8x32fTab[];
#define CV_8TO32F(x) cv::g_8x32fTab[(x)+128]
extern const ushort g_8x16uSqrTab[];
#define CV_SQR_8U(x) cv::g_8x16uSqrTab[(x)+255]
extern const uchar g_Saturate8u[];
#define CV_FAST_CAST_8U(t) (assert(-256 <= (t) && (t) <= 512), cv::g_Saturate8u[(t)+256])
#define CV_MIN_8U(a,b) ((a) - CV_FAST_CAST_8U((a) - (b)))
#define CV_MAX_8U(a,b) ((a) + CV_FAST_CAST_8U((b) - (a)))
#if defined WIN32 || defined _WIN32
void deleteThreadAllocData();
#endif
@@ -282,6 +314,4 @@ cv::Mutex& getInitializationMutex();
}
#include "opencv2/hal/intrin.hpp"
#endif /*_CXCORE_INTERNAL_H_*/
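
The CV_MIN_8U/CV_MAX_8U macros above rely on a branchless trick: for 8-bit values, min(a,b) equals a minus the zero-clamped difference a-b, which the saturation table provides without a branch. A self-contained sketch of the idea, using a locally built stand-in table rather than the library's actual g_Saturate8u:

// Sketch of the branchless 8-bit min behind CV_MIN_8U (stand-in table).
#include <cstdio>

static unsigned char sat8u[768];          // covers indices t in [-256, 511], shifted by +256

static void init_table()
{
    for (int t = -256; t < 512; t++)
        sat8u[t + 256] = (unsigned char)(t < 0 ? 0 : t > 255 ? 255 : t);
}

static inline unsigned char fast_min8u(unsigned char a, unsigned char b)
{
    // a - saturate(a - b): the saturation clamps negative differences to 0,
    // so the result is b when a > b and a otherwise.
    return (unsigned char)(a - sat8u[(a - b) + 256]);
}

int main()
{
    init_table();
    printf("%d\n", fast_min8u(200, 37));  // prints 37
    return 0;
}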

modules/core/src/split.cpp (new file, 428 lines)
View File

@@ -0,0 +1,428 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009-2011, Willow Garage Inc., all rights reserved.
// Copyright (C) 2014-2015, Itseez Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "precomp.hpp"
namespace cv { namespace hal {
#if CV_NEON
template<typename T> struct VSplit2;
template<typename T> struct VSplit3;
template<typename T> struct VSplit4;
#define SPLIT2_KERNEL_TEMPLATE(name, data_type, reg_type, load_func, store_func) \
template<> \
struct name<data_type> \
{ \
void operator()(const data_type* src, data_type* dst0, \
data_type* dst1) const \
{ \
reg_type r = load_func(src); \
store_func(dst0, r.val[0]); \
store_func(dst1, r.val[1]); \
} \
}
#define SPLIT3_KERNEL_TEMPLATE(name, data_type, reg_type, load_func, store_func) \
template<> \
struct name<data_type> \
{ \
void operator()(const data_type* src, data_type* dst0, data_type* dst1, \
data_type* dst2) const \
{ \
reg_type r = load_func(src); \
store_func(dst0, r.val[0]); \
store_func(dst1, r.val[1]); \
store_func(dst2, r.val[2]); \
} \
}
#define SPLIT4_KERNEL_TEMPLATE(name, data_type, reg_type, load_func, store_func) \
template<> \
struct name<data_type> \
{ \
void operator()(const data_type* src, data_type* dst0, data_type* dst1, \
data_type* dst2, data_type* dst3) const \
{ \
reg_type r = load_func(src); \
store_func(dst0, r.val[0]); \
store_func(dst1, r.val[1]); \
store_func(dst2, r.val[2]); \
store_func(dst3, r.val[3]); \
} \
}
SPLIT2_KERNEL_TEMPLATE(VSplit2, uchar , uint8x16x2_t, vld2q_u8 , vst1q_u8 );
SPLIT2_KERNEL_TEMPLATE(VSplit2, ushort, uint16x8x2_t, vld2q_u16, vst1q_u16);
SPLIT2_KERNEL_TEMPLATE(VSplit2, int , int32x4x2_t, vld2q_s32, vst1q_s32);
SPLIT2_KERNEL_TEMPLATE(VSplit2, int64 , int64x1x2_t, vld2_s64 , vst1_s64 );
SPLIT3_KERNEL_TEMPLATE(VSplit3, uchar , uint8x16x3_t, vld3q_u8 , vst1q_u8 );
SPLIT3_KERNEL_TEMPLATE(VSplit3, ushort, uint16x8x3_t, vld3q_u16, vst1q_u16);
SPLIT3_KERNEL_TEMPLATE(VSplit3, int , int32x4x3_t, vld3q_s32, vst1q_s32);
SPLIT3_KERNEL_TEMPLATE(VSplit3, int64 , int64x1x3_t, vld3_s64 , vst1_s64 );
SPLIT4_KERNEL_TEMPLATE(VSplit4, uchar , uint8x16x4_t, vld4q_u8 , vst1q_u8 );
SPLIT4_KERNEL_TEMPLATE(VSplit4, ushort, uint16x8x4_t, vld4q_u16, vst1q_u16);
SPLIT4_KERNEL_TEMPLATE(VSplit4, int , int32x4x4_t, vld4q_s32, vst1q_s32);
SPLIT4_KERNEL_TEMPLATE(VSplit4, int64 , int64x1x4_t, vld4_s64 , vst1_s64 );
#elif CV_SSE2
template <typename T>
struct VSplit2
{
VSplit2() : support(false) { }
void operator()(const T *, T *, T *) const { }
bool support;
};
template <typename T>
struct VSplit3
{
VSplit3() : support(false) { }
void operator()(const T *, T *, T *, T *) const { }
bool support;
};
template <typename T>
struct VSplit4
{
VSplit4() : support(false) { }
void operator()(const T *, T *, T *, T *, T *) const { }
bool support;
};
#define SPLIT2_KERNEL_TEMPLATE(data_type, reg_type, cast_type, _mm_deinterleave, flavor) \
template <> \
struct VSplit2<data_type> \
{ \
enum \
{ \
ELEMS_IN_VEC = 16 / sizeof(data_type) \
}; \
\
VSplit2() \
{ \
support = checkHardwareSupport(CV_CPU_SSE2); \
} \
\
void operator()(const data_type * src, \
data_type * dst0, data_type * dst1) const \
{ \
reg_type v_src0 = _mm_loadu_##flavor((cast_type const *)(src)); \
reg_type v_src1 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC)); \
reg_type v_src2 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 2)); \
reg_type v_src3 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 3)); \
\
_mm_deinterleave(v_src0, v_src1, v_src2, v_src3); \
\
_mm_storeu_##flavor((cast_type *)(dst0), v_src0); \
_mm_storeu_##flavor((cast_type *)(dst0 + ELEMS_IN_VEC), v_src1); \
_mm_storeu_##flavor((cast_type *)(dst1), v_src2); \
_mm_storeu_##flavor((cast_type *)(dst1 + ELEMS_IN_VEC), v_src3); \
} \
\
bool support; \
}
#define SPLIT3_KERNEL_TEMPLATE(data_type, reg_type, cast_type, _mm_deinterleave, flavor) \
template <> \
struct VSplit3<data_type> \
{ \
enum \
{ \
ELEMS_IN_VEC = 16 / sizeof(data_type) \
}; \
\
VSplit3() \
{ \
support = checkHardwareSupport(CV_CPU_SSE2); \
} \
\
void operator()(const data_type * src, \
data_type * dst0, data_type * dst1, data_type * dst2) const \
{ \
reg_type v_src0 = _mm_loadu_##flavor((cast_type const *)(src)); \
reg_type v_src1 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC)); \
reg_type v_src2 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 2)); \
reg_type v_src3 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 3)); \
reg_type v_src4 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 4)); \
reg_type v_src5 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 5)); \
\
_mm_deinterleave(v_src0, v_src1, v_src2, \
v_src3, v_src4, v_src5); \
\
_mm_storeu_##flavor((cast_type *)(dst0), v_src0); \
_mm_storeu_##flavor((cast_type *)(dst0 + ELEMS_IN_VEC), v_src1); \
_mm_storeu_##flavor((cast_type *)(dst1), v_src2); \
_mm_storeu_##flavor((cast_type *)(dst1 + ELEMS_IN_VEC), v_src3); \
_mm_storeu_##flavor((cast_type *)(dst2), v_src4); \
_mm_storeu_##flavor((cast_type *)(dst2 + ELEMS_IN_VEC), v_src5); \
} \
\
bool support; \
}
#define SPLIT4_KERNEL_TEMPLATE(data_type, reg_type, cast_type, _mm_deinterleave, flavor) \
template <> \
struct VSplit4<data_type> \
{ \
enum \
{ \
ELEMS_IN_VEC = 16 / sizeof(data_type) \
}; \
\
VSplit4() \
{ \
support = checkHardwareSupport(CV_CPU_SSE2); \
} \
\
void operator()(const data_type * src, data_type * dst0, data_type * dst1, \
data_type * dst2, data_type * dst3) const \
{ \
reg_type v_src0 = _mm_loadu_##flavor((cast_type const *)(src)); \
reg_type v_src1 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC)); \
reg_type v_src2 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 2)); \
reg_type v_src3 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 3)); \
reg_type v_src4 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 4)); \
reg_type v_src5 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 5)); \
reg_type v_src6 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 6)); \
reg_type v_src7 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 7)); \
\
_mm_deinterleave(v_src0, v_src1, v_src2, v_src3, \
v_src4, v_src5, v_src6, v_src7); \
\
_mm_storeu_##flavor((cast_type *)(dst0), v_src0); \
_mm_storeu_##flavor((cast_type *)(dst0 + ELEMS_IN_VEC), v_src1); \
_mm_storeu_##flavor((cast_type *)(dst1), v_src2); \
_mm_storeu_##flavor((cast_type *)(dst1 + ELEMS_IN_VEC), v_src3); \
_mm_storeu_##flavor((cast_type *)(dst2), v_src4); \
_mm_storeu_##flavor((cast_type *)(dst2 + ELEMS_IN_VEC), v_src5); \
_mm_storeu_##flavor((cast_type *)(dst3), v_src6); \
_mm_storeu_##flavor((cast_type *)(dst3 + ELEMS_IN_VEC), v_src7); \
} \
\
bool support; \
}
SPLIT2_KERNEL_TEMPLATE( uchar, __m128i, __m128i, _mm_deinterleave_epi8, si128);
SPLIT2_KERNEL_TEMPLATE(ushort, __m128i, __m128i, _mm_deinterleave_epi16, si128);
SPLIT2_KERNEL_TEMPLATE( int, __m128, float, _mm_deinterleave_ps, ps);
SPLIT3_KERNEL_TEMPLATE( uchar, __m128i, __m128i, _mm_deinterleave_epi8, si128);
SPLIT3_KERNEL_TEMPLATE(ushort, __m128i, __m128i, _mm_deinterleave_epi16, si128);
SPLIT3_KERNEL_TEMPLATE( int, __m128, float, _mm_deinterleave_ps, ps);
SPLIT4_KERNEL_TEMPLATE( uchar, __m128i, __m128i, _mm_deinterleave_epi8, si128);
SPLIT4_KERNEL_TEMPLATE(ushort, __m128i, __m128i, _mm_deinterleave_epi16, si128);
SPLIT4_KERNEL_TEMPLATE( int, __m128, float, _mm_deinterleave_ps, ps);
#endif
template<typename T> static void
split_( const T* src, T** dst, int len, int cn )
{
int k = cn % 4 ? cn % 4 : 4;
int i, j;
if( k == 1 )
{
T* dst0 = dst[0];
if(cn == 1)
{
memcpy(dst0, src, len * sizeof(T));
}
else
{
for( i = 0, j = 0 ; i < len; i++, j += cn )
dst0[i] = src[j];
}
}
else if( k == 2 )
{
T *dst0 = dst[0], *dst1 = dst[1];
i = j = 0;
#if CV_NEON
if(cn == 2)
{
int inc_i = (sizeof(T) == 8)? 1: 16/sizeof(T);
int inc_j = 2 * inc_i;
VSplit2<T> vsplit;
for( ; i < len - inc_i; i += inc_i, j += inc_j)
vsplit(src + j, dst0 + i, dst1 + i);
}
#elif CV_SSE2
if (cn == 2)
{
int inc_i = 32/sizeof(T);
int inc_j = 2 * inc_i;
VSplit2<T> vsplit;
if (vsplit.support)
{
for( ; i <= len - inc_i; i += inc_i, j += inc_j)
vsplit(src + j, dst0 + i, dst1 + i);
}
}
#endif
for( ; i < len; i++, j += cn )
{
dst0[i] = src[j];
dst1[i] = src[j+1];
}
}
else if( k == 3 )
{
T *dst0 = dst[0], *dst1 = dst[1], *dst2 = dst[2];
i = j = 0;
#if CV_NEON
if(cn == 3)
{
int inc_i = (sizeof(T) == 8)? 1: 16/sizeof(T);
int inc_j = 3 * inc_i;
VSplit3<T> vsplit;
for( ; i <= len - inc_i; i += inc_i, j += inc_j)
vsplit(src + j, dst0 + i, dst1 + i, dst2 + i);
}
#elif CV_SSE2
if (cn == 3)
{
int inc_i = 32/sizeof(T);
int inc_j = 3 * inc_i;
VSplit3<T> vsplit;
if (vsplit.support)
{
for( ; i <= len - inc_i; i += inc_i, j += inc_j)
vsplit(src + j, dst0 + i, dst1 + i, dst2 + i);
}
}
#endif
for( ; i < len; i++, j += cn )
{
dst0[i] = src[j];
dst1[i] = src[j+1];
dst2[i] = src[j+2];
}
}
else
{
T *dst0 = dst[0], *dst1 = dst[1], *dst2 = dst[2], *dst3 = dst[3];
i = j = 0;
#if CV_NEON
if(cn == 4)
{
int inc_i = (sizeof(T) == 8)? 1: 16/sizeof(T);
int inc_j = 4 * inc_i;
VSplit4<T> vsplit;
for( ; i <= len - inc_i; i += inc_i, j += inc_j)
vsplit(src + j, dst0 + i, dst1 + i, dst2 + i, dst3 + i);
}
#elif CV_SSE2
if (cn == 4)
{
int inc_i = 32/sizeof(T);
int inc_j = 4 * inc_i;
VSplit4<T> vsplit;
if (vsplit.support)
{
for( ; i <= len - inc_i; i += inc_i, j += inc_j)
vsplit(src + j, dst0 + i, dst1 + i, dst2 + i, dst3 + i);
}
}
#endif
for( ; i < len; i++, j += cn )
{
dst0[i] = src[j]; dst1[i] = src[j+1];
dst2[i] = src[j+2]; dst3[i] = src[j+3];
}
}
for( ; k < cn; k += 4 )
{
T *dst0 = dst[k], *dst1 = dst[k+1], *dst2 = dst[k+2], *dst3 = dst[k+3];
for( i = 0, j = k; i < len; i++, j += cn )
{
dst0[i] = src[j]; dst1[i] = src[j+1];
dst2[i] = src[j+2]; dst3[i] = src[j+3];
}
}
}
void split8u(const uchar* src, uchar** dst, int len, int cn )
{
CALL_HAL(split8u, cv_hal_split8u, src,dst, len, cn)
split_(src, dst, len, cn);
}
void split16u(const ushort* src, ushort** dst, int len, int cn )
{
CALL_HAL(split16u, cv_hal_split16u, src,dst, len, cn)
split_(src, dst, len, cn);
}
void split32s(const int* src, int** dst, int len, int cn )
{
CALL_HAL(split32s, cv_hal_split32s, src,dst, len, cn)
split_(src, dst, len, cn);
}
void split64s(const int64* src, int64** dst, int len, int cn )
{
CALL_HAL(split64s, cv_hal_split64s, src,dst, len, cn)
split_(src, dst, len, cn);
}
}}
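
A minimal usage sketch for the relocated channel-split kernel, mirroring the merge example earlier; the helper name and buffer handling are illustrative assumptions:

// Sketch: de-interleaving a packed BGR buffer back into three planes,
// each of which must hold len bytes.
#include "opencv2/core/hal/hal.hpp"

void deinterleave_bgr(const uchar* bgr, uchar* b, uchar* g, uchar* r, int len)
{
    uchar* planes[3] = { b, g, r };
    cv::hal::split8u(bgr, planes, len, 3);
}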

View File

@@ -3996,3 +3996,266 @@ cvNorm( const void* imgA, const void* imgB, int normType, const void* maskarr )
return !maskarr ? cv::norm(a, b, normType) : cv::norm(a, b, normType, mask);
}
namespace cv { namespace hal {
static const uchar popCountTable[] =
{
0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8
};
static const uchar popCountTable2[] =
{
0, 1, 1, 1, 1, 2, 2, 2, 1, 2, 2, 2, 1, 2, 2, 2, 1, 2, 2, 2, 2, 3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3,
1, 2, 2, 2, 2, 3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 1, 2, 2, 2, 2, 3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3,
1, 2, 2, 2, 2, 3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 3, 4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4,
2, 3, 3, 3, 3, 4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4, 2, 3, 3, 3, 3, 4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4,
1, 2, 2, 2, 2, 3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 3, 4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4,
2, 3, 3, 3, 3, 4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4, 2, 3, 3, 3, 3, 4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4,
1, 2, 2, 2, 2, 3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 3, 4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4,
2, 3, 3, 3, 3, 4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4, 2, 3, 3, 3, 3, 4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4
};
static const uchar popCountTable4[] =
{
0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
};
int normHamming(const uchar* a, int n)
{
int i = 0;
int result = 0;
#if CV_NEON
{
uint32x4_t bits = vmovq_n_u32(0);
for (; i <= n - 16; i += 16) {
uint8x16_t A_vec = vld1q_u8 (a + i);
uint8x16_t bitsSet = vcntq_u8 (A_vec);
uint16x8_t bitSet8 = vpaddlq_u8 (bitsSet);
uint32x4_t bitSet4 = vpaddlq_u16 (bitSet8);
bits = vaddq_u32(bits, bitSet4);
}
uint64x2_t bitSet2 = vpaddlq_u32 (bits);
result = vgetq_lane_s32 (vreinterpretq_s32_u64(bitSet2),0);
result += vgetq_lane_s32 (vreinterpretq_s32_u64(bitSet2),2);
}
#endif
for( ; i <= n - 4; i += 4 )
result += popCountTable[a[i]] + popCountTable[a[i+1]] +
popCountTable[a[i+2]] + popCountTable[a[i+3]];
for( ; i < n; i++ )
result += popCountTable[a[i]];
return result;
}
int normHamming(const uchar* a, const uchar* b, int n)
{
int i = 0;
int result = 0;
#if CV_NEON
{
uint32x4_t bits = vmovq_n_u32(0);
for (; i <= n - 16; i += 16) {
uint8x16_t A_vec = vld1q_u8 (a + i);
uint8x16_t B_vec = vld1q_u8 (b + i);
uint8x16_t AxorB = veorq_u8 (A_vec, B_vec);
uint8x16_t bitsSet = vcntq_u8 (AxorB);
uint16x8_t bitSet8 = vpaddlq_u8 (bitsSet);
uint32x4_t bitSet4 = vpaddlq_u16 (bitSet8);
bits = vaddq_u32(bits, bitSet4);
}
uint64x2_t bitSet2 = vpaddlq_u32 (bits);
result = vgetq_lane_s32 (vreinterpretq_s32_u64(bitSet2),0);
result += vgetq_lane_s32 (vreinterpretq_s32_u64(bitSet2),2);
}
#endif
for( ; i <= n - 4; i += 4 )
result += popCountTable[a[i] ^ b[i]] + popCountTable[a[i+1] ^ b[i+1]] +
popCountTable[a[i+2] ^ b[i+2]] + popCountTable[a[i+3] ^ b[i+3]];
for( ; i < n; i++ )
result += popCountTable[a[i] ^ b[i]];
return result;
}
int normHamming(const uchar* a, int n, int cellSize)
{
if( cellSize == 1 )
return normHamming(a, n);
const uchar* tab = 0;
if( cellSize == 2 )
tab = popCountTable2;
else if( cellSize == 4 )
tab = popCountTable4;
else
return -1;
int i = 0;
int result = 0;
#if CV_ENABLE_UNROLLED
for( ; i <= n - 4; i += 4 )
result += tab[a[i]] + tab[a[i+1]] + tab[a[i+2]] + tab[a[i+3]];
#endif
for( ; i < n; i++ )
result += tab[a[i]];
return result;
}
int normHamming(const uchar* a, const uchar* b, int n, int cellSize)
{
if( cellSize == 1 )
return normHamming(a, b, n);
const uchar* tab = 0;
if( cellSize == 2 )
tab = popCountTable2;
else if( cellSize == 4 )
tab = popCountTable4;
else
return -1;
int i = 0;
int result = 0;
#if CV_ENABLE_UNROLLED
for( ; i <= n - 4; i += 4 )
result += tab[a[i] ^ b[i]] + tab[a[i+1] ^ b[i+1]] +
tab[a[i+2] ^ b[i+2]] + tab[a[i+3] ^ b[i+3]];
#endif
for( ; i < n; i++ )
result += tab[a[i] ^ b[i]];
return result;
}
float normL2Sqr_(const float* a, const float* b, int n)
{
int j = 0; float d = 0.f;
#if CV_SSE
float CV_DECL_ALIGNED(16) buf[4];
__m128 d0 = _mm_setzero_ps(), d1 = _mm_setzero_ps();
for( ; j <= n - 8; j += 8 )
{
__m128 t0 = _mm_sub_ps(_mm_loadu_ps(a + j), _mm_loadu_ps(b + j));
__m128 t1 = _mm_sub_ps(_mm_loadu_ps(a + j + 4), _mm_loadu_ps(b + j + 4));
d0 = _mm_add_ps(d0, _mm_mul_ps(t0, t0));
d1 = _mm_add_ps(d1, _mm_mul_ps(t1, t1));
}
_mm_store_ps(buf, _mm_add_ps(d0, d1));
d = buf[0] + buf[1] + buf[2] + buf[3];
#endif
{
for( ; j <= n - 4; j += 4 )
{
float t0 = a[j] - b[j], t1 = a[j+1] - b[j+1], t2 = a[j+2] - b[j+2], t3 = a[j+3] - b[j+3];
d += t0*t0 + t1*t1 + t2*t2 + t3*t3;
}
}
for( ; j < n; j++ )
{
float t = a[j] - b[j];
d += t*t;
}
return d;
}
float normL1_(const float* a, const float* b, int n)
{
int j = 0; float d = 0.f;
#if CV_SSE
float CV_DECL_ALIGNED(16) buf[4];
static const int CV_DECL_ALIGNED(16) absbuf[4] = {0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff};
__m128 d0 = _mm_setzero_ps(), d1 = _mm_setzero_ps();
__m128 absmask = _mm_load_ps((const float*)absbuf);
for( ; j <= n - 8; j += 8 )
{
__m128 t0 = _mm_sub_ps(_mm_loadu_ps(a + j), _mm_loadu_ps(b + j));
__m128 t1 = _mm_sub_ps(_mm_loadu_ps(a + j + 4), _mm_loadu_ps(b + j + 4));
d0 = _mm_add_ps(d0, _mm_and_ps(t0, absmask));
d1 = _mm_add_ps(d1, _mm_and_ps(t1, absmask));
}
_mm_store_ps(buf, _mm_add_ps(d0, d1));
d = buf[0] + buf[1] + buf[2] + buf[3];
#elif CV_NEON
float32x4_t v_sum = vdupq_n_f32(0.0f);
for ( ; j <= n - 4; j += 4)
v_sum = vaddq_f32(v_sum, vabdq_f32(vld1q_f32(a + j), vld1q_f32(b + j)));
float CV_DECL_ALIGNED(16) buf[4];
vst1q_f32(buf, v_sum);
d = buf[0] + buf[1] + buf[2] + buf[3];
#endif
{
for( ; j <= n - 4; j += 4 )
{
d += std::abs(a[j] - b[j]) + std::abs(a[j+1] - b[j+1]) +
std::abs(a[j+2] - b[j+2]) + std::abs(a[j+3] - b[j+3]);
}
}
for( ; j < n; j++ )
d += std::abs(a[j] - b[j]);
return d;
}
int normL1_(const uchar* a, const uchar* b, int n)
{
int j = 0, d = 0;
#if CV_SSE
__m128i d0 = _mm_setzero_si128();
for( ; j <= n - 16; j += 16 )
{
__m128i t0 = _mm_loadu_si128((const __m128i*)(a + j));
__m128i t1 = _mm_loadu_si128((const __m128i*)(b + j));
d0 = _mm_add_epi32(d0, _mm_sad_epu8(t0, t1));
}
for( ; j <= n - 4; j += 4 )
{
__m128i t0 = _mm_cvtsi32_si128(*(const int*)(a + j));
__m128i t1 = _mm_cvtsi32_si128(*(const int*)(b + j));
d0 = _mm_add_epi32(d0, _mm_sad_epu8(t0, t1));
}
d = _mm_cvtsi128_si32(_mm_add_epi32(d0, _mm_unpackhi_epi64(d0, d0)));
#elif CV_NEON
uint32x4_t v_sum = vdupq_n_u32(0u);
for ( ; j <= n - 16; j += 16)
{
uint8x16_t v_dst = vabdq_u8(vld1q_u8(a + j), vld1q_u8(b + j));
uint16x8_t v_low = vmovl_u8(vget_low_u8(v_dst)), v_high = vmovl_u8(vget_high_u8(v_dst));
v_sum = vaddq_u32(v_sum, vaddl_u16(vget_low_u16(v_low), vget_low_u16(v_high)));
v_sum = vaddq_u32(v_sum, vaddl_u16(vget_high_u16(v_low), vget_high_u16(v_high)));
}
uint CV_DECL_ALIGNED(16) buf[4];
vst1q_u32(buf, v_sum);
d = buf[0] + buf[1] + buf[2] + buf[3];
#endif
{
for( ; j <= n - 4; j += 4 )
{
d += std::abs(a[j] - b[j]) + std::abs(a[j+1] - b[j+1]) +
std::abs(a[j+2] - b[j+2]) + std::abs(a[j+3] - b[j+3]);
}
}
for( ; j < n; j++ )
d += std::abs(a[j] - b[j]);
return d;
}
}} //cv::hal
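
A minimal sketch of calling the relocated Hamming-norm kernel on two binary descriptors; the include path and data are illustrative assumptions:

// Sketch: Hamming distance between two 32-byte binary descriptors (e.g. ORB).
#include "opencv2/core/hal/hal.hpp"
#include <cstdio>

int main()
{
    uchar d1[32], d2[32];
    for (int i = 0; i < 32; i++) { d1[i] = (uchar)i; d2[i] = (uchar)(i ^ 0x0F); }
    int dist = cv::hal::normHamming(d1, d2, 32);   // counts differing bits
    printf("hamming distance = %d\n", dist);       // 32 bytes * 4 differing bits = 128
    return 0;
}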

View File

@@ -86,6 +86,45 @@ Mutex* __initialization_mutex_initializer = &getInitializationMutex();
#undef max
#undef abs
#include <tchar.h>
#if defined _MSC_VER
#if _MSC_VER >= 1400
#include <intrin.h>
#elif defined _M_IX86
static void __cpuid(int* cpuid_data, int)
{
__asm
{
push ebx
push edi
mov edi, cpuid_data
mov eax, 1
cpuid
mov [edi], eax
mov [edi + 4], ebx
mov [edi + 8], ecx
mov [edi + 12], edx
pop edi
pop ebx
}
}
static void __cpuidex(int* cpuid_data, int, int)
{
__asm
{
push edi
mov edi, cpuid_data
mov eax, 7
mov ecx, 0
cpuid
mov [edi], eax
mov [edi + 4], ebx
mov [edi + 8], ecx
mov [edi + 12], edx
pop edi
}
}
#endif
#endif
#ifdef WINRT
#include <wrl/client.h>
@@ -198,15 +237,154 @@ void Exception::formatMessage()
msg = format("%s:%d: error: (%d) %s\n", file.c_str(), line, code, err.c_str());
}
struct HWFeatures
{
enum { MAX_FEATURE = CV_HARDWARE_MAX_FEATURE };
HWFeatures(void)
{
memset( have, 0, sizeof(have) );
x86_family = 0;
}
static HWFeatures initialize(void)
{
HWFeatures f;
int cpuid_data[4] = { 0, 0, 0, 0 };
#if defined _MSC_VER && (defined _M_IX86 || defined _M_X64)
__cpuid(cpuid_data, 1);
#elif defined __GNUC__ && (defined __i386__ || defined __x86_64__)
#ifdef __x86_64__
asm __volatile__
(
"movl $1, %%eax\n\t"
"cpuid\n\t"
:[eax]"=a"(cpuid_data[0]),[ebx]"=b"(cpuid_data[1]),[ecx]"=c"(cpuid_data[2]),[edx]"=d"(cpuid_data[3])
:
: "cc"
);
#else
asm volatile
(
"pushl %%ebx\n\t"
"movl $1,%%eax\n\t"
"cpuid\n\t"
"popl %%ebx\n\t"
: "=a"(cpuid_data[0]), "=c"(cpuid_data[2]), "=d"(cpuid_data[3])
:
: "cc"
);
#endif
#endif
f.x86_family = (cpuid_data[0] >> 8) & 15;
if( f.x86_family >= 6 )
{
f.have[CV_CPU_MMX] = (cpuid_data[3] & (1 << 23)) != 0;
f.have[CV_CPU_SSE] = (cpuid_data[3] & (1<<25)) != 0;
f.have[CV_CPU_SSE2] = (cpuid_data[3] & (1<<26)) != 0;
f.have[CV_CPU_SSE3] = (cpuid_data[2] & (1<<0)) != 0;
f.have[CV_CPU_SSSE3] = (cpuid_data[2] & (1<<9)) != 0;
f.have[CV_CPU_FMA3] = (cpuid_data[2] & (1<<12)) != 0;
f.have[CV_CPU_SSE4_1] = (cpuid_data[2] & (1<<19)) != 0;
f.have[CV_CPU_SSE4_2] = (cpuid_data[2] & (1<<20)) != 0;
f.have[CV_CPU_POPCNT] = (cpuid_data[2] & (1<<23)) != 0;
f.have[CV_CPU_AVX] = (((cpuid_data[2] & (1<<28)) != 0)&&((cpuid_data[2] & (1<<27)) != 0)); // OS uses XSAVE/XRSTOR and CPU supports AVX
// make the second call to the cpuid command in order to get
// information about extended features like AVX2
#if defined _MSC_VER && (defined _M_IX86 || defined _M_X64)
__cpuidex(cpuid_data, 7, 0);
#elif defined __GNUC__ && (defined __i386__ || defined __x86_64__)
#ifdef __x86_64__
asm __volatile__
(
"movl $7, %%eax\n\t"
"movl $0, %%ecx\n\t"
"cpuid\n\t"
:[eax]"=a"(cpuid_data[0]),[ebx]"=b"(cpuid_data[1]),[ecx]"=c"(cpuid_data[2]),[edx]"=d"(cpuid_data[3])
:
: "cc"
);
#else
asm volatile
(
"pushl %%ebx\n\t"
"movl $7,%%eax\n\t"
"movl $0,%%ecx\n\t"
"cpuid\n\t"
"movl %%ebx, %0\n\t"
"popl %%ebx\n\t"
: "=r"(cpuid_data[1]), "=c"(cpuid_data[2])
:
: "cc"
);
#endif
#endif
f.have[CV_CPU_AVX2] = (cpuid_data[1] & (1<<5)) != 0;
f.have[CV_CPU_AVX_512F] = (cpuid_data[1] & (1<<16)) != 0;
f.have[CV_CPU_AVX_512DQ] = (cpuid_data[1] & (1<<17)) != 0;
f.have[CV_CPU_AVX_512IFMA512] = (cpuid_data[1] & (1<<21)) != 0;
f.have[CV_CPU_AVX_512PF] = (cpuid_data[1] & (1<<26)) != 0;
f.have[CV_CPU_AVX_512ER] = (cpuid_data[1] & (1<<27)) != 0;
f.have[CV_CPU_AVX_512CD] = (cpuid_data[1] & (1<<28)) != 0;
f.have[CV_CPU_AVX_512BW] = (cpuid_data[1] & (1<<30)) != 0;
f.have[CV_CPU_AVX_512VL] = (cpuid_data[1] & (1<<31)) != 0;
f.have[CV_CPU_AVX_512VBMI] = (cpuid_data[2] & (1<<1)) != 0;
}
#if defined ANDROID || defined __linux__
#ifdef __aarch64__
f.have[CV_CPU_NEON] = true;
#else
int cpufile = open("/proc/self/auxv", O_RDONLY);
if (cpufile >= 0)
{
Elf32_auxv_t auxv;
const size_t size_auxv_t = sizeof(auxv);
while ((size_t)read(cpufile, &auxv, size_auxv_t) == size_auxv_t)
{
if (auxv.a_type == AT_HWCAP)
{
f.have[CV_CPU_NEON] = (auxv.a_un.a_val & 4096) != 0;
break;
}
}
close(cpufile);
}
#endif
#elif (defined __clang__ || defined __APPLE__) && (defined __ARM_NEON__ || (defined __ARM_NEON && defined __aarch64__))
f.have[CV_CPU_NEON] = true;
#endif
return f;
}
int x86_family;
bool have[MAX_FEATURE+1];
};
static HWFeatures featuresEnabled = HWFeatures::initialize(), featuresDisabled = HWFeatures();
static HWFeatures* currentFeatures = &featuresEnabled;
bool checkHardwareSupport(int feature)
{
CV_DbgAssert( 0 <= feature && feature <= CV_HARDWARE_MAX_FEATURE );
return cv::hal::checkHardwareSupport(feature);
return currentFeatures->have[feature];
}
volatile bool useOptimizedFlag = true;
void setUseOptimized( bool flag )
{
cv::hal::setUseOptimized(flag);
useOptimizedFlag = flag;
currentFeatures = flag ? &featuresEnabled : &featuresDisabled;
ipp::setUseIPP(flag);
#ifdef HAVE_OPENCL
@@ -219,7 +397,7 @@ void setUseOptimized( bool flag )
bool useOptimized(void)
{
return cv::hal::useOptimized();
return useOptimizedFlag;
}
int64 getTickCount(void)
@@ -499,12 +677,12 @@ redirectError( CvErrorCallback errCallback, void* userdata, void** prevUserdata)
CV_IMPL int cvCheckHardwareSupport(int feature)
{
CV_DbgAssert( 0 <= feature && feature <= CV_HARDWARE_MAX_FEATURE );
return cv::hal::checkHardwareSupport(feature);
return cv::currentFeatures->have[feature];
}
CV_IMPL int cvUseOptimized( int flag )
{
int prevMode = cv::useOptimized();
int prevMode = cv::useOptimizedFlag;
cv::setUseOptimized( flag != 0 );
return prevMode;
}
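
A minimal sketch of the public feature-detection API whose backend is reinstated above; the dispatch pattern shown is an assumption for illustration, not code from the patch:

// Sketch: querying CPU features and toggling optimized code paths.
#include "opencv2/core.hpp"
#include <cstdio>

int main()
{
    if (cv::checkHardwareSupport(CV_CPU_SSE2))
        printf("SSE2 path available\n");
    if (cv::checkHardwareSupport(CV_CPU_NEON))
        printf("NEON path available\n");
    cv::setUseOptimized(false);   // force the plain C fallbacks
    printf("optimizations: %s\n", cv::useOptimized() ? "on" : "off");
    return 0;
}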

View File

@@ -40,7 +40,6 @@
//M*/
#include "test_precomp.hpp"
#include "opencv2/hal.hpp"
using namespace cv;
@@ -72,21 +71,21 @@ TEST(Core_HAL, mathfuncs)
{
case HAL_EXP:
if( depth == CV_32F )
hal::exp(src.ptr<float>(), dst.ptr<float>(), n);
hal::exp32f(src.ptr<float>(), dst.ptr<float>(), n);
else
hal::exp(src.ptr<double>(), dst.ptr<double>(), n);
hal::exp64f(src.ptr<double>(), dst.ptr<double>(), n);
break;
case HAL_LOG:
if( depth == CV_32F )
hal::log(src.ptr<float>(), dst.ptr<float>(), n);
hal::log32f(src.ptr<float>(), dst.ptr<float>(), n);
else
hal::log(src.ptr<double>(), dst.ptr<double>(), n);
hal::log64f(src.ptr<double>(), dst.ptr<double>(), n);
break;
case HAL_SQRT:
if( depth == CV_32F )
hal::sqrt(src.ptr<float>(), dst.ptr<float>(), n);
hal::sqrt32f(src.ptr<float>(), dst.ptr<float>(), n);
else
hal::sqrt(src.ptr<double>(), dst.ptr<double>(), n);
hal::sqrt64f(src.ptr<double>(), dst.ptr<double>(), n);
break;
default:
CV_Error(Error::StsBadArg, "unknown function");
@@ -159,15 +158,15 @@ TEST(Core_HAL, mat_decomp)
{
case HAL_LU:
if( depth == CV_32F )
hal::LU(a.ptr<float>(), a.step, size, x.ptr<float>(), x.step, 1);
hal::LU32f(a.ptr<float>(), a.step, size, x.ptr<float>(), x.step, 1);
else
hal::LU(a.ptr<double>(), a.step, size, x.ptr<double>(), x.step, 1);
hal::LU64f(a.ptr<double>(), a.step, size, x.ptr<double>(), x.step, 1);
break;
case HAL_CHOL:
if( depth == CV_32F )
hal::Cholesky(a.ptr<float>(), a.step, size, x.ptr<float>(), x.step, 1);
hal::Cholesky32f(a.ptr<float>(), a.step, size, x.ptr<float>(), x.step, 1);
else
hal::Cholesky(a.ptr<double>(), a.step, size, x.ptr<double>(), x.step, 1);
hal::Cholesky64f(a.ptr<double>(), a.step, size, x.ptr<double>(), x.step, 1);
break;
default:
CV_Error(Error::StsBadArg, "unknown function");
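
A minimal sketch of the renamed, explicitly typed HAL math kernels that this test now exercises; the wrapper function and include path are assumptions for illustration:

// Sketch: calling the typed HAL math kernels directly on raw float buffers.
#include "opencv2/core/hal/hal.hpp"

void exp_and_sqrt(const float* src, float* e, float* s, int n)
{
    cv::hal::exp32f(src, e, n);    // formerly hal::exp(const float*, ...)
    cv::hal::sqrt32f(src, s, n);   // formerly hal::sqrt(const float*, ...)
}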

View File

@@ -0,0 +1,864 @@
#include "test_intrin_utils.hpp"
#include <climits>
using namespace cv;
template<typename R> struct TheTest
{
typedef typename R::lane_type LaneType;
TheTest & test_loadstore()
{
AlignedData<R> data;
AlignedData<R> out;
// check if addresses are aligned and unaligned respectively
EXPECT_EQ((size_t)0, (size_t)&data.a.d % 16);
EXPECT_NE((size_t)0, (size_t)&data.u.d % 16);
EXPECT_EQ((size_t)0, (size_t)&out.a.d % 16);
EXPECT_NE((size_t)0, (size_t)&out.u.d % 16);
// check some initialization methods
R r1 = data.a;
R r2 = v_load(data.u.d);
R r3 = v_load_aligned(data.a.d);
R r4(r2);
EXPECT_EQ(data.a[0], r1.get0());
EXPECT_EQ(data.u[0], r2.get0());
EXPECT_EQ(data.a[0], r3.get0());
EXPECT_EQ(data.u[0], r4.get0());
// check some store methods
out.u.clear();
out.a.clear();
v_store(out.u.d, r1);
v_store_aligned(out.a.d, r2);
EXPECT_EQ(data.a, out.a);
EXPECT_EQ(data.u, out.u);
// check more store methods
Data<R> d, res(0);
R r5 = d;
v_store_high(res.mid(), r5);
v_store_low(res.d, r5);
EXPECT_EQ(d, res);
// check halves load correctness
res.clear();
R r6 = v_load_halves(d.d, d.mid());
v_store(res.d, r6);
EXPECT_EQ(d, res);
// zero, all
Data<R> resZ = RegTrait<R>::zero();
Data<R> resV = RegTrait<R>::all(8);
for (int i = 0; i < R::nlanes; ++i)
{
EXPECT_EQ((LaneType)0, resZ[i]);
EXPECT_EQ((LaneType)8, resV[i]);
}
// reinterpret_as
v_uint8x16 vu8 = v_reinterpret_as_u8(r1); out.a.clear(); v_store((uchar*)out.a.d, vu8); EXPECT_EQ(data.a, out.a);
v_int8x16 vs8 = v_reinterpret_as_s8(r1); out.a.clear(); v_store((schar*)out.a.d, vs8); EXPECT_EQ(data.a, out.a);
v_uint16x8 vu16 = v_reinterpret_as_u16(r1); out.a.clear(); v_store((ushort*)out.a.d, vu16); EXPECT_EQ(data.a, out.a);
v_int16x8 vs16 = v_reinterpret_as_s16(r1); out.a.clear(); v_store((short*)out.a.d, vs16); EXPECT_EQ(data.a, out.a);
v_uint32x4 vu32 = v_reinterpret_as_u32(r1); out.a.clear(); v_store((unsigned*)out.a.d, vu32); EXPECT_EQ(data.a, out.a);
v_int32x4 vs32 = v_reinterpret_as_s32(r1); out.a.clear(); v_store((int*)out.a.d, vs32); EXPECT_EQ(data.a, out.a);
v_uint64x2 vu64 = v_reinterpret_as_u64(r1); out.a.clear(); v_store((uint64*)out.a.d, vu64); EXPECT_EQ(data.a, out.a);
v_int64x2 vs64 = v_reinterpret_as_s64(r1); out.a.clear(); v_store((int64*)out.a.d, vs64); EXPECT_EQ(data.a, out.a);
v_float32x4 vf32 = v_reinterpret_as_f32(r1); out.a.clear(); v_store((float*)out.a.d, vf32); EXPECT_EQ(data.a, out.a);
#if CV_SIMD128_64F
v_float64x2 vf64 = v_reinterpret_as_f64(r1); out.a.clear(); v_store((double*)out.a.d, vf64); EXPECT_EQ(data.a, out.a);
#endif
return *this;
}
TheTest & test_interleave()
{
Data<R> data1, data2, data3, data4;
data2 += 20;
data3 += 40;
data4 += 60;
R a = data1, b = data2, c = data3;
R d = data1, e = data2, f = data3, g = data4;
LaneType buf3[R::nlanes * 3];
LaneType buf4[R::nlanes * 4];
v_store_interleave(buf3, a, b, c);
v_store_interleave(buf4, d, e, f, g);
Data<R> z(0);
a = b = c = d = e = f = g = z;
v_load_deinterleave(buf3, a, b, c);
v_load_deinterleave(buf4, d, e, f, g);
for (int i = 0; i < R::nlanes; ++i)
{
EXPECT_EQ(data1, Data<R>(a));
EXPECT_EQ(data2, Data<R>(b));
EXPECT_EQ(data3, Data<R>(c));
EXPECT_EQ(data1, Data<R>(d));
EXPECT_EQ(data2, Data<R>(e));
EXPECT_EQ(data3, Data<R>(f));
EXPECT_EQ(data4, Data<R>(g));
}
return *this;
}
// v_expand and v_load_expand
TheTest & test_expand()
{
typedef typename RegTrait<R>::w_reg Rx2;
Data<R> dataA;
R a = dataA;
Data<Rx2> resB = v_load_expand(dataA.d);
Rx2 c, d;
v_expand(a, c, d);
Data<Rx2> resC = c, resD = d;
const int n = Rx2::nlanes;
for (int i = 0; i < n; ++i)
{
EXPECT_EQ(dataA[i], resB[i]);
EXPECT_EQ(dataA[i], resC[i]);
EXPECT_EQ(dataA[i + n], resD[i]);
}
return *this;
}
TheTest & test_expand_q()
{
typedef typename RegTrait<R>::q_reg Rx4;
Data<R> data;
Data<Rx4> out = v_load_expand_q(data.d);
const int n = Rx4::nlanes;
for (int i = 0; i < n; ++i)
EXPECT_EQ(data[i], out[i]);
return *this;
}
TheTest & test_addsub()
{
Data<R> dataA, dataB;
dataB.reverse();
R a = dataA, b = dataB;
Data<R> resC = a + b, resD = a - b;
for (int i = 0; i < R::nlanes; ++i)
{
EXPECT_EQ(saturate_cast<LaneType>(dataA[i] + dataB[i]), resC[i]);
EXPECT_EQ(saturate_cast<LaneType>(dataA[i] - dataB[i]), resD[i]);
}
return *this;
}
TheTest & test_addsub_wrap()
{
Data<R> dataA, dataB;
dataB.reverse();
R a = dataA, b = dataB;
Data<R> resC = v_add_wrap(a, b),
resD = v_sub_wrap(a, b);
for (int i = 0; i < R::nlanes; ++i)
{
EXPECT_EQ((LaneType)(dataA[i] + dataB[i]), resC[i]);
EXPECT_EQ((LaneType)(dataA[i] - dataB[i]), resD[i]);
}
return *this;
}
TheTest & test_mul()
{
Data<R> dataA, dataB;
dataB.reverse();
R a = dataA, b = dataB;
Data<R> resC = a * b;
for (int i = 0; i < R::nlanes; ++i)
{
EXPECT_EQ(dataA[i] * dataB[i], resC[i]);
}
return *this;
}
TheTest & test_div()
{
Data<R> dataA, dataB;
dataB.reverse();
R a = dataA, b = dataB;
Data<R> resC = a / b;
for (int i = 0; i < R::nlanes; ++i)
{
EXPECT_EQ(dataA[i] / dataB[i], resC[i]);
}
return *this;
}
TheTest & test_mul_expand()
{
typedef typename RegTrait<R>::w_reg Rx2;
Data<R> dataA, dataB(2);
R a = dataA, b = dataB;
Rx2 c, d;
v_mul_expand(a, b, c, d);
Data<Rx2> resC = c, resD = d;
const int n = R::nlanes / 2;
for (int i = 0; i < n; ++i)
{
EXPECT_EQ((typename Rx2::lane_type)dataA[i] * dataB[i], resC[i]);
EXPECT_EQ((typename Rx2::lane_type)dataA[i + n] * dataB[i + n], resD[i]);
}
return *this;
}
template <int s>
TheTest & test_shift()
{
Data<R> dataA;
R a = dataA;
Data<R> resB = a << s, resC = v_shl<s>(a), resD = a >> s, resE = v_shr<s>(a);
for (int i = 0; i < R::nlanes; ++i)
{
EXPECT_EQ(dataA[i] << s, resB[i]);
EXPECT_EQ(dataA[i] << s, resC[i]);
EXPECT_EQ(dataA[i] >> s, resD[i]);
EXPECT_EQ(dataA[i] >> s, resE[i]);
}
return *this;
}
TheTest & test_cmp()
{
Data<R> dataA, dataB;
dataB.reverse();
dataB += 1;
R a = dataA, b = dataB;
Data<R> resC = (a == b);
Data<R> resD = (a != b);
Data<R> resE = (a > b);
Data<R> resF = (a >= b);
Data<R> resG = (a < b);
Data<R> resH = (a <= b);
for (int i = 0; i < R::nlanes; ++i)
{
EXPECT_EQ(dataA[i] == dataB[i], resC[i] != 0);
EXPECT_EQ(dataA[i] != dataB[i], resD[i] != 0);
EXPECT_EQ(dataA[i] > dataB[i], resE[i] != 0);
EXPECT_EQ(dataA[i] >= dataB[i], resF[i] != 0);
EXPECT_EQ(dataA[i] < dataB[i], resG[i] != 0);
EXPECT_EQ(dataA[i] <= dataB[i], resH[i] != 0);
}
return *this;
}
TheTest & test_dot_prod()
{
typedef typename RegTrait<R>::w_reg Rx2;
Data<R> dataA, dataB(2);
R a = dataA, b = dataB;
Data<Rx2> res = v_dotprod(a, b);
const int n = R::nlanes / 2;
for (int i = 0; i < n; ++i)
{
EXPECT_EQ(dataA[i*2] * dataB[i*2] + dataA[i*2 + 1] * dataB[i*2 + 1], res[i]);
}
return *this;
}
TheTest & test_logic()
{
Data<R> dataA, dataB(2);
R a = dataA, b = dataB;
Data<R> resC = a & b, resD = a | b, resE = a ^ b, resF = ~a;
for (int i = 0; i < R::nlanes; ++i)
{
EXPECT_EQ(dataA[i] & dataB[i], resC[i]);
EXPECT_EQ(dataA[i] | dataB[i], resD[i]);
EXPECT_EQ(dataA[i] ^ dataB[i], resE[i]);
EXPECT_EQ((LaneType)~dataA[i], resF[i]);
}
return *this;
}
TheTest & test_sqrt_abs()
{
Data<R> dataA, dataD;
dataD *= -1.0;
R a = dataA, d = dataD;
Data<R> resB = v_sqrt(a), resC = v_invsqrt(a), resE = v_abs(d);
for (int i = 0; i < R::nlanes; ++i)
{
EXPECT_FLOAT_EQ((float)std::sqrt(dataA[i]), (float)resB[i]);
EXPECT_FLOAT_EQ(1/(float)std::sqrt(dataA[i]), (float)resC[i]);
EXPECT_FLOAT_EQ((float)abs(dataA[i]), (float)resE[i]);
}
return *this;
}
TheTest & test_min_max()
{
Data<R> dataA, dataB;
dataB.reverse();
R a = dataA, b = dataB;
Data<R> resC = v_min(a, b), resD = v_max(a, b);
for (int i = 0; i < R::nlanes; ++i)
{
EXPECT_EQ(std::min(dataA[i], dataB[i]), resC[i]);
EXPECT_EQ(std::max(dataA[i], dataB[i]), resD[i]);
}
return *this;
}
TheTest & test_absdiff()
{
typedef typename RegTrait<R>::u_reg Ru;
typedef typename Ru::lane_type u_type;
Data<R> dataA(std::numeric_limits<LaneType>::max()),
dataB(std::numeric_limits<LaneType>::min());
dataA[0] = (LaneType)-1;
dataB[0] = 1;
dataA[1] = 2;
dataB[1] = (LaneType)-2;
R a = dataA, b = dataB;
Data<Ru> resC = v_absdiff(a, b);
const u_type mask = std::numeric_limits<LaneType>::is_signed ? (u_type)(1 << (sizeof(u_type)*8 - 1)) : 0;
for (int i = 0; i < Ru::nlanes; ++i)
{
u_type uA = dataA[i] ^ mask;
u_type uB = dataB[i] ^ mask;
EXPECT_EQ(uA > uB ? uA - uB : uB - uA, resC[i]);
}
return *this;
}
TheTest & test_float_absdiff()
{
Data<R> dataA(std::numeric_limits<LaneType>::max()),
dataB(std::numeric_limits<LaneType>::min());
dataA[0] = -1;
dataB[0] = 1;
dataA[1] = 2;
dataB[1] = -2;
R a = dataA, b = dataB;
Data<R> resC = v_absdiff(a, b);
for (int i = 0; i < R::nlanes; ++i)
{
EXPECT_EQ(dataA[i] > dataB[i] ? dataA[i] - dataB[i] : dataB[i] - dataA[i], resC[i]);
}
return *this;
}
TheTest & test_reduce()
{
Data<R> dataA;
R a = dataA;
EXPECT_EQ((LaneType)1, v_reduce_min(a));
EXPECT_EQ((LaneType)R::nlanes, v_reduce_max(a));
EXPECT_EQ((LaneType)(1 + R::nlanes)*2, v_reduce_sum(a));
return *this;
}
TheTest & test_mask()
{
Data<R> dataA, dataB, dataC, dataD(1), dataE(2);
dataA[1] *= (LaneType)-1;
dataC *= (LaneType)-1;
R a = dataA, b = dataB, c = dataC, d = dataD, e = dataE;
int m = v_signmask(a);
EXPECT_EQ(2, m);
EXPECT_EQ(false, v_check_all(a));
EXPECT_EQ(false, v_check_all(b));
EXPECT_EQ(true, v_check_all(c));
EXPECT_EQ(true, v_check_any(a));
EXPECT_EQ(false, v_check_any(b));
EXPECT_EQ(true, v_check_any(c));
typedef V_TypeTraits<LaneType> Traits;
typedef typename Traits::int_type int_type;
R f = v_select(b, d, e);
Data<R> resF = f;
for (int i = 0; i < R::nlanes; ++i)
{
int_type m2 = Traits::reinterpret_int(dataB[i]);
EXPECT_EQ((Traits::reinterpret_int(dataD[i]) & m2)
| (Traits::reinterpret_int(dataE[i]) & ~m2),
Traits::reinterpret_int(resF[i]));
}
return *this;
}
template <int s>
TheTest & test_pack()
{
typedef typename RegTrait<R>::w_reg Rx2;
typedef typename Rx2::lane_type w_type;
Data<Rx2> dataA, dataB;
dataA += std::numeric_limits<LaneType>::is_signed ? -10 : 10;
dataB *= 10;
Rx2 a = dataA, b = dataB;
Data<R> resC = v_pack(a, b);
Data<R> resD = v_rshr_pack<s>(a, b);
Data<R> resE(0);
v_pack_store(resE.d, b);
Data<R> resF(0);
v_rshr_pack_store<s>(resF.d, b);
const int n = Rx2::nlanes;
const w_type add = (w_type)1 << (s - 1);
for (int i = 0; i < n; ++i)
{
EXPECT_EQ(saturate_cast<LaneType>(dataA[i]), resC[i]);
EXPECT_EQ(saturate_cast<LaneType>(dataB[i]), resC[i + n]);
EXPECT_EQ(saturate_cast<LaneType>((dataA[i] + add) >> s), resD[i]);
EXPECT_EQ(saturate_cast<LaneType>((dataB[i] + add) >> s), resD[i + n]);
EXPECT_EQ(saturate_cast<LaneType>(dataB[i]), resE[i]);
EXPECT_EQ((LaneType)0, resE[i + n]);
EXPECT_EQ(saturate_cast<LaneType>((dataB[i] + add) >> s), resF[i]);
EXPECT_EQ((LaneType)0, resF[i + n]);
}
return *this;
}
template <int s>
TheTest & test_pack_u()
{
typedef typename RegTrait<R>::w_reg Rx2;
typedef typename RegTrait<Rx2>::int_reg Ri2;
typedef typename Ri2::lane_type w_type;
Data<Ri2> dataA, dataB;
dataA += -10;
dataB *= 10;
Ri2 a = dataA, b = dataB;
Data<R> resC = v_pack_u(a, b);
Data<R> resD = v_rshr_pack_u<s>(a, b);
Data<R> resE(0);
v_pack_u_store(resE.d, b);
Data<R> resF(0);
v_rshr_pack_u_store<s>(resF.d, b);
const int n = Ri2::nlanes;
const w_type add = (w_type)1 << (s - 1);
for (int i = 0; i < n; ++i)
{
EXPECT_EQ(saturate_cast<LaneType>(dataA[i]), resC[i]);
EXPECT_EQ(saturate_cast<LaneType>(dataB[i]), resC[i + n]);
EXPECT_EQ(saturate_cast<LaneType>((dataA[i] + add) >> s), resD[i]);
EXPECT_EQ(saturate_cast<LaneType>((dataB[i] + add) >> s), resD[i + n]);
EXPECT_EQ(saturate_cast<LaneType>(dataB[i]), resE[i]);
EXPECT_EQ((LaneType)0, resE[i + n]);
EXPECT_EQ(saturate_cast<LaneType>((dataB[i] + add) >> s), resF[i]);
EXPECT_EQ((LaneType)0, resF[i + n]);
}
return *this;
}
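// Lane interleaving and recombination: v_zip, v_recombine, v_combine_low, v_combine_high.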
TheTest & test_unpack()
{
Data<R> dataA, dataB;
dataB *= 10;
R a = dataA, b = dataB;
R c, d, e, f, lo, hi;
v_zip(a, b, c, d);
v_recombine(a, b, e, f);
lo = v_combine_low(a, b);
hi = v_combine_high(a, b);
Data<R> resC = c, resD = d, resE = e, resF = f, resLo = lo, resHi = hi;
const int n = R::nlanes/2;
for (int i = 0; i < n; ++i)
{
EXPECT_EQ(dataA[i], resC[i*2]);
EXPECT_EQ(dataB[i], resC[i*2+1]);
EXPECT_EQ(dataA[i+n], resD[i*2]);
EXPECT_EQ(dataB[i+n], resD[i*2+1]);
EXPECT_EQ(dataA[i], resE[i]);
EXPECT_EQ(dataB[i], resE[i+n]);
EXPECT_EQ(dataA[i+n], resF[i]);
EXPECT_EQ(dataB[i+n], resF[i+n]);
EXPECT_EQ(dataA[i], resLo[i]);
EXPECT_EQ(dataB[i], resLo[i+n]);
EXPECT_EQ(dataA[i+n], resHi[i]);
EXPECT_EQ(dataB[i+n], resHi[i+n]);
}
return *this;
}
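// v_extract<s>: the result is lanes s..nlanes-1 of the first argument followed
// by the first s lanes of the second.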
template<int s>
TheTest & test_extract()
{
Data<R> dataA, dataB;
dataB *= 10;
R a = dataA, b = dataB;
Data<R> resC = v_extract<s>(a, b);
for (int i = 0; i < R::nlanes; ++i)
{
if (i + s >= R::nlanes)
EXPECT_EQ(dataB[i - R::nlanes + s], resC[i]);
else
EXPECT_EQ(dataA[i + s], resC[i]);
}
return *this;
}
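// Rounding conversions (v_round, v_trunc, v_floor, v_ceil) and floating-point
// helpers (v_magnitude, v_sqr_magnitude, v_muladd).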
TheTest & test_float_math()
{
typedef typename RegTrait<R>::int_reg Ri;
Data<R> data1, data2, data3;
data1 *= 1.1;
data2 += 10;
R a1 = data1, a2 = data2, a3 = data3;
Data<Ri> resB = v_round(a1),
resC = v_trunc(a1),
resD = v_floor(a1),
resE = v_ceil(a1);
Data<R> resF = v_magnitude(a1, a2),
resG = v_sqr_magnitude(a1, a2),
resH = v_muladd(a1, a2, a3);
for (int i = 0; i < R::nlanes; ++i)
{
EXPECT_EQ(cvRound(data1[i]), resB[i]);
EXPECT_EQ((typename Ri::lane_type)data1[i], resC[i]);
EXPECT_EQ(cvFloor(data1[i]), resD[i]);
EXPECT_EQ(cvCeil(data1[i]), resE[i]);
EXPECT_DOUBLE_EQ(std::sqrt(data1[i]*data1[i] + data2[i]*data2[i]), resF[i]);
EXPECT_DOUBLE_EQ(data1[i]*data1[i] + data2[i]*data2[i], resG[i]);
EXPECT_DOUBLE_EQ(data1[i]*data2[i] + data3[i], resH[i]);
}
return *this;
}
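// Conversion to v_float32x4; only the common number of lanes is checked.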
TheTest & test_float_cvt32()
{
typedef v_float32x4 Rt;
Data<R> dataA;
dataA *= 1.1;
R a = dataA;
Rt b = v_cvt_f32(a);
Data<Rt> resB = b;
int n = std::min<int>(Rt::nlanes, R::nlanes);
for (int i = 0; i < n; ++i)
{
EXPECT_EQ((typename Rt::lane_type)dataA[i], resB[i]);
}
return *this;
}
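// Conversion to v_float64x2; compiled only when double-precision SIMD
// (CV_SIMD128_64F) is available.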
TheTest & test_float_cvt64()
{
#if CV_SIMD128_64F
typedef v_float64x2 Rt;
Data<R> dataA;
dataA *= 1.1;
R a = dataA;
Rt b = v_cvt_f64(a);
Data<Rt> resB = b;
int n = std::min<int>(Rt::nlanes, R::nlanes);
for (int i = 0; i < n; ++i)
{
EXPECT_EQ((typename Rt::lane_type)dataA[i], resB[i]);
}
#endif
return *this;
}
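// v_matmul: v is treated as a 4-element vector and a, b, c, d as matrix columns;
// each result lane is the corresponding linear combination.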
TheTest & test_matmul()
{
Data<R> dataV, dataA, dataB, dataC, dataD;
dataB.reverse();
dataC += 2;
dataD *= 0.3;
R v = dataV, a = dataA, b = dataB, c = dataC, d = dataD;
Data<R> res = v_matmul(v, a, b, c, d);
for (int i = 0; i < R::nlanes; ++i)
{
LaneType val = dataV[0] * dataA[i]
+ dataV[1] * dataB[i]
+ dataV[2] * dataC[i]
+ dataV[3] * dataD[i];
EXPECT_DOUBLE_EQ(val, res[i]);
}
return *this;
}
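// v_transpose4x4 of four 4-lane registers: lane j of output register i must
// equal lane i of input register j.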
TheTest & test_transpose()
{
Data<R> dataA, dataB, dataC, dataD;
dataB *= 5;
dataC *= 10;
dataD *= 15;
R a = dataA, b = dataB, c = dataC, d = dataD;
R e, f, g, h;
v_transpose4x4(a, b, c, d,
e, f, g, h);
Data<R> res[4] = {e, f, g, h};
for (int i = 0; i < R::nlanes; ++i)
{
EXPECT_EQ(dataA[i], res[i][0]);
EXPECT_EQ(dataB[i], res[i][1]);
EXPECT_EQ(dataC[i], res[i][2]);
EXPECT_EQ(dataD[i], res[i][3]);
}
return *this;
}
};
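// Each TEST below instantiates TheTest for one register type and chains only the
// checks that type supports (e.g. pack/unpack for integers, conversions and
// matrix operations for floats).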
//============= 8-bit integer =====================================================================
TEST(hal_intrin, uint8x16) {
TheTest<v_uint8x16>()
.test_loadstore()
.test_interleave()
.test_expand()
.test_expand_q()
.test_addsub()
.test_addsub_wrap()
.test_cmp()
.test_logic()
.test_min_max()
.test_absdiff()
.test_mask()
.test_pack<1>().test_pack<2>().test_pack<3>().test_pack<8>()
.test_pack_u<1>().test_pack_u<2>().test_pack_u<3>().test_pack_u<8>()
.test_unpack()
.test_extract<0>().test_extract<1>().test_extract<8>().test_extract<15>()
;
}
TEST(hal_intrin, int8x16) {
TheTest<v_int8x16>()
.test_loadstore()
.test_interleave()
.test_expand()
.test_expand_q()
.test_addsub()
.test_addsub_wrap()
.test_cmp()
.test_logic()
.test_min_max()
.test_absdiff()
.test_mask()
.test_pack<1>().test_pack<2>().test_pack<3>().test_pack<8>()
.test_unpack()
.test_extract<0>().test_extract<1>().test_extract<8>().test_extract<15>()
;
}
//============= 16-bit integer =====================================================================
TEST(hal_intrin, uint16x8) {
TheTest<v_uint16x8>()
.test_loadstore()
.test_interleave()
.test_expand()
.test_addsub()
.test_addsub_wrap()
.test_mul()
.test_mul_expand()
.test_cmp()
.test_shift<1>()
.test_shift<8>()
.test_logic()
.test_min_max()
.test_absdiff()
.test_mask()
.test_pack<1>().test_pack<2>().test_pack<7>().test_pack<16>()
.test_pack_u<1>().test_pack_u<2>().test_pack_u<7>().test_pack_u<16>()
.test_unpack()
.test_extract<0>().test_extract<1>().test_extract<4>().test_extract<7>()
;
}
TEST(hal_intrin, int16x8) {
TheTest<v_int16x8>()
.test_loadstore()
.test_interleave()
.test_expand()
.test_addsub()
.test_addsub_wrap()
.test_mul()
.test_mul_expand()
.test_cmp()
.test_shift<1>()
.test_shift<8>()
.test_dot_prod()
.test_logic()
.test_min_max()
.test_absdiff()
.test_mask()
.test_pack<1>().test_pack<2>().test_pack<7>().test_pack<16>()
.test_unpack()
.test_extract<0>().test_extract<1>().test_extract<4>().test_extract<7>()
;
}
//============= 32-bit integer =====================================================================
TEST(hal_intrin, uint32x4) {
TheTest<v_uint32x4>()
.test_loadstore()
.test_interleave()
.test_expand()
.test_addsub()
.test_mul()
.test_mul_expand()
.test_cmp()
.test_shift<1>()
.test_shift<8>()
.test_logic()
.test_min_max()
.test_absdiff()
.test_reduce()
.test_mask()
.test_pack<1>().test_pack<2>().test_pack<15>().test_pack<32>()
.test_unpack()
.test_extract<0>().test_extract<1>().test_extract<2>().test_extract<3>()
.test_transpose()
;
}
TEST(hal_intrin, int32x4) {
TheTest<v_int32x4>()
.test_loadstore()
.test_interleave()
.test_expand()
.test_addsub()
.test_mul()
.test_cmp()
.test_shift<1>().test_shift<8>()
.test_logic()
.test_min_max()
.test_absdiff()
.test_reduce()
.test_mask()
.test_pack<1>().test_pack<2>().test_pack<15>().test_pack<32>()
.test_unpack()
.test_extract<0>().test_extract<1>().test_extract<2>().test_extract<3>()
.test_float_cvt32()
.test_float_cvt64()
.test_transpose()
;
}
//============= 64-bit integer =====================================================================
TEST(hal_intrin, uint64x2) {
TheTest<v_uint64x2>()
.test_loadstore()
.test_addsub()
.test_shift<1>().test_shift<8>()
.test_logic()
.test_extract<0>().test_extract<1>()
;
}
TEST(hal_intrin, int64x2) {
TheTest<v_int64x2>()
.test_loadstore()
.test_addsub()
.test_shift<1>().test_shift<8>()
.test_logic()
.test_extract<0>().test_extract<1>()
;
}
//============= Floating point =====================================================================
TEST(hal_intrin, float32x4) {
TheTest<v_float32x4>()
.test_loadstore()
.test_interleave()
.test_addsub()
.test_mul()
.test_div()
.test_cmp()
.test_sqrt_abs()
.test_min_max()
.test_float_absdiff()
.test_reduce()
.test_mask()
.test_unpack()
.test_float_math()
.test_float_cvt64()
.test_matmul()
.test_transpose()
;
}
#if CV_SIMD128_64F
TEST(hal_intrin, float64x2) {
TheTest<v_float64x2>()
.test_loadstore()
.test_addsub()
.test_mul()
.test_div()
.test_cmp()
.test_sqrt_abs()
.test_min_max()
.test_float_absdiff()
.test_mask()
.test_unpack()
.test_float_math()
.test_float_cvt32()
;
}
#endif

View File

@@ -0,0 +1,234 @@
#ifndef _TEST_UTILS_HPP_
#define _TEST_UTILS_HPP_
#include "opencv2/core/hal/intrin.hpp"
#include "opencv2/ts.hpp"
#include <ostream>
#include <algorithm>
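// Shared helpers for the universal-intrinsic tests:
//  - initializer<N> builds a register from N scalar lanes,
//  - Data<R> mirrors a register in a plain array for element-wise checks,
//  - RegTrait<R> maps a register type to related types and setzero/setall constructors.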
template <typename R> struct Data;
template <int N> struct initializer;
template <> struct initializer<16>
{
template <typename R> static R init(const Data<R> & d)
{
return R(d[0], d[1], d[2], d[3], d[4], d[5], d[6], d[7], d[8], d[9], d[10], d[11], d[12], d[13], d[14], d[15]);
}
};
template <> struct initializer<8>
{
template <typename R> static R init(const Data<R> & d)
{
return R(d[0], d[1], d[2], d[3], d[4], d[5], d[6], d[7]);
}
};
template <> struct initializer<4>
{
template <typename R> static R init(const Data<R> & d)
{
return R(d[0], d[1], d[2], d[3]);
}
};
template <> struct initializer<2>
{
template <typename R> static R init(const Data<R> & d)
{
return R(d[0], d[1]);
}
};
//==================================================================================================
template <typename R> struct Data
{
typedef typename R::lane_type LaneType;
Data()
{
for (int i = 0; i < R::nlanes; ++i)
d[i] = (LaneType)(i + 1);
}
Data(LaneType val)
{
fill(val);
}
Data(const R & r)
{
*this = r;
}
operator R ()
{
return initializer<R::nlanes>().init(*this);
}
Data<R> & operator=(const R & r)
{
v_store(d, r);
return *this;
}
template <typename T> Data<R> & operator*=(T m)
{
for (int i = 0; i < R::nlanes; ++i)
d[i] *= (LaneType)m;
return *this;
}
template <typename T> Data<R> & operator+=(T m)
{
for (int i = 0; i < R::nlanes; ++i)
d[i] += (LaneType)m;
return *this;
}
void fill(LaneType val)
{
for (int i = 0; i < R::nlanes; ++i)
d[i] = val;
}
void reverse()
{
for (int i = 0; i < R::nlanes / 2; ++i)
std::swap(d[i], d[R::nlanes - i - 1]);
}
const LaneType & operator[](int i) const
{
CV_Assert(i >= 0 && i < R::nlanes);
return d[i];
}
LaneType & operator[](int i)
{
CV_Assert(i >= 0 && i < R::nlanes);
return d[i];
}
const LaneType * mid() const
{
return d + R::nlanes / 2;
}
LaneType * mid()
{
return d + R::nlanes / 2;
}
bool operator==(const Data<R> & other) const
{
for (int i = 0; i < R::nlanes; ++i)
if (d[i] != other.d[i])
return false;
return true;
}
void clear()
{
fill(0);
}
bool isZero() const
{
return isValue(0);
}
bool isValue(uchar val) const
{
for (int i = 0; i < R::nlanes; ++i)
if (d[i] != val)
return false;
return true;
}
LaneType d[R::nlanes];
};
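// An aligned copy 'a' and a copy 'u' pushed off alignment by the preceding char,
// intended for aligned vs. unaligned load/store checks.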
template<typename R> struct AlignedData
{
Data<R> CV_DECL_ALIGNED(16) a; // aligned
char dummy;
Data<R> u; // unaligned
};
template <typename R> std::ostream & operator<<(std::ostream & out, const Data<R> & d)
{
out << "{ ";
for (int i = 0; i < R::nlanes; ++i)
{
// out << std::hex << +V_TypeTraits<typename R::lane_type>::reinterpret_int(d.d[i]);
out << +d.d[i];
if (i + 1 < R::nlanes)
out << ", ";
}
out << " }";
return out;
}
//==================================================================================================
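// RegTrait maps a register type to its widened (w_reg), quad-widened (q_reg),
// unsigned (u_reg) and signed (int_reg) counterparts where they exist, plus
// zero()/all() constructors.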
template <typename R> struct RegTrait;
template <> struct RegTrait<cv::v_uint8x16> {
typedef cv::v_uint16x8 w_reg;
typedef cv::v_uint32x4 q_reg;
typedef cv::v_uint8x16 u_reg;
static cv::v_uint8x16 zero() { return cv::v_setzero_u8(); }
static cv::v_uint8x16 all(uchar val) { return cv::v_setall_u8(val); }
};
template <> struct RegTrait<cv::v_int8x16> {
typedef cv::v_int16x8 w_reg;
typedef cv::v_int32x4 q_reg;
typedef cv::v_uint8x16 u_reg;
static cv::v_int8x16 zero() { return cv::v_setzero_s8(); }
static cv::v_int8x16 all(schar val) { return cv::v_setall_s8(val); }
};
template <> struct RegTrait<cv::v_uint16x8> {
typedef cv::v_uint32x4 w_reg;
typedef cv::v_int16x8 int_reg;
typedef cv::v_uint16x8 u_reg;
static cv::v_uint16x8 zero() { return cv::v_setzero_u16(); }
static cv::v_uint16x8 all(ushort val) { return cv::v_setall_u16(val); }
};
template <> struct RegTrait<cv::v_int16x8> {
typedef cv::v_int32x4 w_reg;
typedef cv::v_uint16x8 u_reg;
static cv::v_int16x8 zero() { return cv::v_setzero_s16(); }
static cv::v_int16x8 all(short val) { return cv::v_setall_s16(val); }
};
template <> struct RegTrait<cv::v_uint32x4> {
typedef cv::v_uint64x2 w_reg;
typedef cv::v_int32x4 int_reg;
typedef cv::v_uint32x4 u_reg;
static cv::v_uint32x4 zero() { return cv::v_setzero_u32(); }
static cv::v_uint32x4 all(unsigned val) { return cv::v_setall_u32(val); }
};
template <> struct RegTrait<cv::v_int32x4> {
typedef cv::v_int64x2 w_reg;
typedef cv::v_uint32x4 u_reg;
static cv::v_int32x4 zero() { return cv::v_setzero_s32(); }
static cv::v_int32x4 all(int val) { return cv::v_setall_s32(val); }
};
template <> struct RegTrait<cv::v_uint64x2> {
static cv::v_uint64x2 zero() { return cv::v_setzero_u64(); }
static cv::v_uint64x2 all(uint64 val) { return cv::v_setall_u64(val); }
};
template <> struct RegTrait<cv::v_int64x2> {
static cv::v_int64x2 zero() { return cv::v_setzero_s64(); }
static cv::v_int64x2 all(int64 val) { return cv::v_setall_s64(val); }
};
template <> struct RegTrait<cv::v_float32x4> {
typedef cv::v_int32x4 int_reg;
typedef cv::v_float32x4 u_reg;
static cv::v_float32x4 zero() { return cv::v_setzero_f32(); }
static cv::v_float32x4 all(float val) { return cv::v_setall_f32(val); }
};
#if CV_SIMD128_64F
template <> struct RegTrait<cv::v_float64x2> {
typedef cv::v_int32x4 int_reg;
typedef cv::v_float64x2 u_reg;
static cv::v_float64x2 zero() { return cv::v_setzero_f64(); }
static cv::v_float64x2 all(double val) { return cv::v_setall_f64(val); }
};
#endif
#endif

View File

@@ -13,6 +13,9 @@
#include "opencv2/ts.hpp"
#include "opencv2/core/core_c.h"
#include "opencv2/core/cvdef.h"
#include "opencv2/core/private.hpp"
#include "opencv2/core/hal/hal.hpp"
#include "opencv2/core/hal/intrin.hpp"
#endif