Merge pull request #5810 from mshabunin:hal_interface
@@ -1,6 +1,5 @@
set(the_description "The Core Functionality")
ocv_add_module(core
opencv_hal
PRIVATE_REQUIRED ${ZLIB_LIBRARIES} "${OPENCL_LIBRARIES}" "${VA_LIBRARIES}"
OPTIONAL opencv_cudev
WRAP java python)
@@ -72,6 +72,7 @@
@defgroup core_cluster Clustering
@defgroup core_utils Utility and system functions and macros
@{
    @defgroup core_utils_sse SSE utilities
    @defgroup core_utils_neon NEON utilities
@}
@defgroup core_opengl OpenGL interoperability
@@ -80,6 +81,16 @@
@defgroup core_directx DirectX interoperability
@defgroup core_eigen Eigen support
@defgroup core_opencl OpenCL support
@defgroup core_va_intel Intel VA-API/OpenCL (CL-VA) interoperability
@defgroup core_hal Hardware Acceleration Layer
@{
    @defgroup core_hal_functions Functions
    @defgroup core_hal_interface Interface
    @defgroup core_hal_intrin Universal intrinsics
    @{
        @defgroup core_hal_intrin_impl Private implementation helpers
    @}
@}
@}
*/
|
@@ -50,10 +50,10 @@
#endif

#include <climits>
#include <algorithm>

#include "opencv2/core/cvdef.h"
#include "opencv2/core/cvstd.hpp"
#include "opencv2/hal.hpp"

namespace cv
{
@@ -679,8 +679,11 @@ CV_EXPORTS void setUseIPP(bool flag);

//! @} core_utils

} // cv

#include "opencv2/hal/neon_utils.hpp"
#include "opencv2/core/neon_utils.hpp"

#endif //__OPENCV_CORE_BASE_HPP__
@@ -45,6 +45,9 @@
#ifndef __OPENCV_CORE_CVDEF_H__
#define __OPENCV_CORE_CVDEF_H__

//! @addtogroup core_utils
//! @{

#if !defined _CRT_SECURE_NO_DEPRECATE && defined _MSC_VER && _MSC_VER > 1300
# define _CRT_SECURE_NO_DEPRECATE /* to avoid multiple Visual Studio warnings */
#endif
@@ -56,7 +59,265 @@
#undef abs
#undef Complex

#include "opencv2/hal/defs.h"
#if !defined _CRT_SECURE_NO_DEPRECATE && defined _MSC_VER && _MSC_VER > 1300
# define _CRT_SECURE_NO_DEPRECATE /* to avoid multiple Visual Studio warnings */
#endif

#include <limits.h>
#include "opencv2/core/hal/interface.h"

#if defined __ICL
# define CV_ICC __ICL
#elif defined __ICC
# define CV_ICC __ICC
#elif defined __ECL
# define CV_ICC __ECL
#elif defined __ECC
# define CV_ICC __ECC
#elif defined __INTEL_COMPILER
# define CV_ICC __INTEL_COMPILER
#endif

#ifndef CV_INLINE
# if defined __cplusplus
#   define CV_INLINE static inline
# elif defined _MSC_VER
#   define CV_INLINE __inline
# else
#   define CV_INLINE static
# endif
#endif

#if defined CV_ICC && !defined CV_ENABLE_UNROLLED
# define CV_ENABLE_UNROLLED 0
#else
# define CV_ENABLE_UNROLLED 1
#endif

#ifdef __GNUC__
# define CV_DECL_ALIGNED(x) __attribute__ ((aligned (x)))
#elif defined _MSC_VER
# define CV_DECL_ALIGNED(x) __declspec(align(x))
#else
# define CV_DECL_ALIGNED(x)
#endif
/* CPU features and intrinsics support */
#define CV_CPU_NONE    0
#define CV_CPU_MMX     1
#define CV_CPU_SSE     2
#define CV_CPU_SSE2    3
#define CV_CPU_SSE3    4
#define CV_CPU_SSSE3   5
#define CV_CPU_SSE4_1  6
#define CV_CPU_SSE4_2  7
#define CV_CPU_POPCNT  8

#define CV_CPU_AVX     10
#define CV_CPU_AVX2    11
#define CV_CPU_FMA3    12

#define CV_CPU_AVX_512F       13
#define CV_CPU_AVX_512BW      14
#define CV_CPU_AVX_512CD      15
#define CV_CPU_AVX_512DQ      16
#define CV_CPU_AVX_512ER      17
#define CV_CPU_AVX_512IFMA512 18
#define CV_CPU_AVX_512PF      19
#define CV_CPU_AVX_512VBMI    20
#define CV_CPU_AVX_512VL      21

#define CV_CPU_NEON    100

// when adding to this list remember to update the following enum
#define CV_HARDWARE_MAX_FEATURE 255

/** @brief Available CPU features.
*/
enum CpuFeatures {
    CPU_MMX    = 1,
    CPU_SSE    = 2,
    CPU_SSE2   = 3,
    CPU_SSE3   = 4,
    CPU_SSSE3  = 5,
    CPU_SSE4_1 = 6,
    CPU_SSE4_2 = 7,
    CPU_POPCNT = 8,

    CPU_AVX    = 10,
    CPU_AVX2   = 11,
    CPU_FMA3   = 12,

    CPU_AVX_512F       = 13,
    CPU_AVX_512BW      = 14,
    CPU_AVX_512CD      = 15,
    CPU_AVX_512DQ      = 16,
    CPU_AVX_512ER      = 17,
    CPU_AVX_512IFMA512 = 18,
    CPU_AVX_512PF      = 19,
    CPU_AVX_512VBMI    = 20,
    CPU_AVX_512VL      = 21,

    CPU_NEON   = 100
};
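The CV_CPU_* identifiers above double as runtime query keys, and the CpuFeatures enum gives the same IDs a typed C++ spelling; cv::checkHardwareSupport(int feature), declared elsewhere in the core module, consumes them. A minimal sketch (assuming the usual OpenCV core headers are available):

#include <opencv2/core.hpp>
#include <cstdio>

int main()
{
    // Each CV_CPU_* / CpuFeatures value indexes the runtime feature table filled at startup.
    std::printf("SSE2: %d\n", (int)cv::checkHardwareSupport(CV_CPU_SSE2));
    std::printf("AVX2: %d\n", (int)cv::checkHardwareSupport(CV_CPU_AVX2));
    std::printf("NEON: %d\n", (int)cv::checkHardwareSupport(CV_CPU_NEON));
    return 0;
}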
// do not include SSE/AVX/NEON headers for NVCC compiler
|
||||
#ifndef __CUDACC__
|
||||
|
||||
#if defined __SSE2__ || defined _M_X64 || (defined _M_IX86_FP && _M_IX86_FP >= 2)
|
||||
# include <emmintrin.h>
|
||||
# define CV_MMX 1
|
||||
# define CV_SSE 1
|
||||
# define CV_SSE2 1
|
||||
# if defined __SSE3__ || (defined _MSC_VER && _MSC_VER >= 1500)
|
||||
# include <pmmintrin.h>
|
||||
# define CV_SSE3 1
|
||||
# endif
|
||||
# if defined __SSSE3__ || (defined _MSC_VER && _MSC_VER >= 1500)
|
||||
# include <tmmintrin.h>
|
||||
# define CV_SSSE3 1
|
||||
# endif
|
||||
# if defined __SSE4_1__ || (defined _MSC_VER && _MSC_VER >= 1500)
|
||||
# include <smmintrin.h>
|
||||
# define CV_SSE4_1 1
|
||||
# endif
|
||||
# if defined __SSE4_2__ || (defined _MSC_VER && _MSC_VER >= 1500)
|
||||
# include <nmmintrin.h>
|
||||
# define CV_SSE4_2 1
|
||||
# endif
|
||||
# if defined __POPCNT__ || (defined _MSC_VER && _MSC_VER >= 1500)
|
||||
# ifdef _MSC_VER
|
||||
# include <nmmintrin.h>
|
||||
# else
|
||||
# include <popcntintrin.h>
|
||||
# endif
|
||||
# define CV_POPCNT 1
|
||||
# endif
|
||||
# if defined __AVX__ || (defined _MSC_VER && _MSC_VER >= 1600 && 0)
|
||||
// MS Visual Studio 2010 (2012?) has no macro pre-defined to identify the use of /arch:AVX
|
||||
// See: http://connect.microsoft.com/VisualStudio/feedback/details/605858/arch-avx-should-define-a-predefined-macro-in-x64-and-set-a-unique-value-for-m-ix86-fp-in-win32
|
||||
# include <immintrin.h>
|
||||
# define CV_AVX 1
|
||||
# if defined(_XCR_XFEATURE_ENABLED_MASK)
|
||||
# define __xgetbv() _xgetbv(_XCR_XFEATURE_ENABLED_MASK)
|
||||
# else
|
||||
# define __xgetbv() 0
|
||||
# endif
|
||||
# endif
|
||||
# if defined __AVX2__ || (defined _MSC_VER && _MSC_VER >= 1800 && 0)
|
||||
# include <immintrin.h>
|
||||
# define CV_AVX2 1
|
||||
# if defined __FMA__
|
||||
# define CV_FMA3 1
|
||||
# endif
|
||||
# endif
|
||||
#endif
|
||||
|
||||
#if (defined WIN32 || defined _WIN32) && defined(_M_ARM)
|
||||
# include <Intrin.h>
|
||||
# include "arm_neon.h"
|
||||
# define CV_NEON 1
|
||||
# define CPU_HAS_NEON_FEATURE (true)
|
||||
#elif defined(__ARM_NEON__) || (defined (__ARM_NEON) && defined(__aarch64__))
|
||||
# include <arm_neon.h>
|
||||
# define CV_NEON 1
|
||||
#endif
|
||||
|
||||
#if defined __GNUC__ && defined __arm__ && (defined __ARM_PCS_VFP || defined __ARM_VFPV3__ || defined __ARM_NEON__) && !defined __SOFTFP__
|
||||
# define CV_VFP 1
|
||||
#endif
|
||||
|
||||
#endif // __CUDACC__
|
||||
|
||||
#ifndef CV_POPCNT
|
||||
#define CV_POPCNT 0
|
||||
#endif
|
||||
#ifndef CV_MMX
|
||||
# define CV_MMX 0
|
||||
#endif
|
||||
#ifndef CV_SSE
|
||||
# define CV_SSE 0
|
||||
#endif
|
||||
#ifndef CV_SSE2
|
||||
# define CV_SSE2 0
|
||||
#endif
|
||||
#ifndef CV_SSE3
|
||||
# define CV_SSE3 0
|
||||
#endif
|
||||
#ifndef CV_SSSE3
|
||||
# define CV_SSSE3 0
|
||||
#endif
|
||||
#ifndef CV_SSE4_1
|
||||
# define CV_SSE4_1 0
|
||||
#endif
|
||||
#ifndef CV_SSE4_2
|
||||
# define CV_SSE4_2 0
|
||||
#endif
|
||||
#ifndef CV_AVX
|
||||
# define CV_AVX 0
|
||||
#endif
|
||||
#ifndef CV_AVX2
|
||||
# define CV_AVX2 0
|
||||
#endif
|
||||
#ifndef CV_FMA3
|
||||
# define CV_FMA3 0
|
||||
#endif
|
||||
#ifndef CV_AVX_512F
|
||||
# define CV_AVX_512F 0
|
||||
#endif
|
||||
#ifndef CV_AVX_512BW
|
||||
# define CV_AVX_512BW 0
|
||||
#endif
|
||||
#ifndef CV_AVX_512CD
|
||||
# define CV_AVX_512CD 0
|
||||
#endif
|
||||
#ifndef CV_AVX_512DQ
|
||||
# define CV_AVX_512DQ 0
|
||||
#endif
|
||||
#ifndef CV_AVX_512ER
|
||||
# define CV_AVX_512ER 0
|
||||
#endif
|
||||
#ifndef CV_AVX_512IFMA512
|
||||
# define CV_AVX_512IFMA512 0
|
||||
#endif
|
||||
#ifndef CV_AVX_512PF
|
||||
# define CV_AVX_512PF 0
|
||||
#endif
|
||||
#ifndef CV_AVX_512VBMI
|
||||
# define CV_AVX_512VBMI 0
|
||||
#endif
|
||||
#ifndef CV_AVX_512VL
|
||||
# define CV_AVX_512VL 0
|
||||
#endif
|
||||
|
||||
#ifndef CV_NEON
|
||||
# define CV_NEON 0
|
||||
#endif
|
||||
|
||||
#ifndef CV_VFP
|
||||
# define CV_VFP 0
|
||||
#endif
|
||||
|
||||
/* fundamental constants */
#define CV_PI   3.1415926535897932384626433832795
#define CV_2PI  6.283185307179586476925286766559
#define CV_LOG2 0.69314718055994530941723212145818

typedef union Cv32suf
{
    int i;
    unsigned u;
    float f;
}
Cv32suf;

typedef union Cv64suf
{
    int64 i;
    uint64 u;
    double f;
}
Cv64suf;

#define OPENCV_ABI_COMPATIBILITY 300
@@ -169,12 +430,12 @@
#define CV_SUBMAT_FLAG (1 << CV_SUBMAT_FLAG_SHIFT)
#define CV_IS_SUBMAT(flags) ((flags) & CV_MAT_SUBMAT_FLAG)

/* Size of each channel item,
/** Size of each channel item,
0x124489 = 1000 0100 0100 0010 0010 0001 0001 ~ array of sizeof(arr_type_elem) */
#define CV_ELEM_SIZE1(type) \
    ((((sizeof(size_t)<<28)|0x8442211) >> CV_MAT_DEPTH(type)*4) & 15)

/* 0x3a50 = 11 10 10 01 01 00 00 ~ array of log2(sizeof(arr_type_elem)) */
/** 0x3a50 = 11 10 10 01 01 00 00 ~ array of log2(sizeof(arr_type_elem)) */
#define CV_ELEM_SIZE(type) \
    (CV_MAT_CN(type) << ((((sizeof(size_t)/4+1)*16384|0x3a50) >> CV_MAT_DEPTH(type)*2) & 3))
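Both macros are compile-time table lookups: 0x8442211 stores one size nibble per depth (with sizeof(size_t) shifted in for the user type), and 0x3a50 stores log2 of those sizes two bits per depth, so CV_ELEM_SIZE reduces to shifting the channel count. A small worked example (assuming the CV_16U/CV_32F depth codes and the CV_MAT_CN/CV_MAT_DEPTH macros defined in this header):

#include "opencv2/core/cvdef.h"
#include <cassert>

int main()
{
    assert(CV_ELEM_SIZE1(CV_16U)  == 2);   // depth 2 -> nibble 2 of 0x8442211
    assert(CV_ELEM_SIZE1(CV_32F)  == 4);   // depth 5 -> nibble 5
    assert(CV_ELEM_SIZE(CV_8UC4)  == 4);   // 4 channels << log2(1 byte)
    assert(CV_ELEM_SIZE(CV_32FC3) == 12);  // 3 channels << log2(4 bytes)
    return 0;
}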
@@ -249,4 +510,6 @@
# endif
#endif

//! @}

#endif // __OPENCV_CORE_CVDEF_H__
modules/core/include/opencv2/core/fast_math.hpp (new file, 302 lines)
@@ -0,0 +1,302 @@
|
||||
/*M///////////////////////////////////////////////////////////////////////////////////////
|
||||
//
|
||||
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
|
||||
//
|
||||
// By downloading, copying, installing or using the software you agree to this license.
|
||||
// If you do not agree to this license, do not download, install,
|
||||
// copy or use the software.
|
||||
//
|
||||
//
|
||||
// License Agreement
|
||||
// For Open Source Computer Vision Library
|
||||
//
|
||||
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
|
||||
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
|
||||
// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
|
||||
// Copyright (C) 2015, Itseez Inc., all rights reserved.
|
||||
// Third party copyrights are property of their respective owners.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without modification,
|
||||
// are permitted provided that the following conditions are met:
|
||||
//
|
||||
// * Redistribution's of source code must retain the above copyright notice,
|
||||
// this list of conditions and the following disclaimer.
|
||||
//
|
||||
// * Redistribution's in binary form must reproduce the above copyright notice,
|
||||
// this list of conditions and the following disclaimer in the documentation
|
||||
// and/or other materials provided with the distribution.
|
||||
//
|
||||
// * The name of the copyright holders may not be used to endorse or promote products
|
||||
// derived from this software without specific prior written permission.
|
||||
//
|
||||
// This software is provided by the copyright holders and contributors "as is" and
|
||||
// any express or implied warranties, including, but not limited to, the implied
|
||||
// warranties of merchantability and fitness for a particular purpose are disclaimed.
|
||||
// In no event shall the Intel Corporation or contributors be liable for any direct,
|
||||
// indirect, incidental, special, exemplary, or consequential damages
|
||||
// (including, but not limited to, procurement of substitute goods or services;
|
||||
// loss of use, data, or profits; or business interruption) however caused
|
||||
// and on any theory of liability, whether in contract, strict liability,
|
||||
// or tort (including negligence or otherwise) arising in any way out of
|
||||
// the use of this software, even if advised of the possibility of such damage.
|
||||
//
|
||||
//M*/
#ifndef __OPENCV_CORE_FAST_MATH_HPP__
#define __OPENCV_CORE_FAST_MATH_HPP__

#include "opencv2/core/cvdef.h"

//! @addtogroup core_utils
//! @{

/****************************************************************************************\
*                                      fast math                                        *
\****************************************************************************************/

#if defined __BORLANDC__
#  include <fastmath.h>
#elif defined __cplusplus
#  include <cmath>
#else
#  include <math.h>
#endif

#ifdef HAVE_TEGRA_OPTIMIZATION
#  include "tegra_round.hpp"
#endif

#if CV_VFP
    // 1. general scheme
    #define ARM_ROUND(_value, _asm_string) \
        int res; \
        float temp; \
        asm(_asm_string : [res] "=r" (res), [temp] "=w" (temp) : [value] "w" (_value)); \
        return res
    // 2. version for double
    #ifdef __clang__
        #define ARM_ROUND_DBL(value) ARM_ROUND(value, "vcvtr.s32.f64 %[temp], %[value] \n vmov %[res], %[temp]")
    #else
        #define ARM_ROUND_DBL(value) ARM_ROUND(value, "vcvtr.s32.f64 %[temp], %P[value] \n vmov %[res], %[temp]")
    #endif
    // 3. version for float
    #define ARM_ROUND_FLT(value) ARM_ROUND(value, "vcvtr.s32.f32 %[temp], %[value]\n vmov %[res], %[temp]")
#endif // CV_VFP
/** @brief Rounds floating-point number to the nearest integer

@param value floating-point number. If the value is outside of INT_MIN ... INT_MAX range, the
result is not defined.
*/
CV_INLINE int
cvRound( double value )
{
#if ((defined _MSC_VER && defined _M_X64) || (defined __GNUC__ && defined __x86_64__ \
    && defined __SSE2__ && !defined __APPLE__)) && !defined(__CUDACC__)
    __m128d t = _mm_set_sd( value );
    return _mm_cvtsd_si32(t);
#elif defined _MSC_VER && defined _M_IX86
    int t;
    __asm
    {
        fld value;
        fistp t;
    }
    return t;
#elif ((defined _MSC_VER && defined _M_ARM) || defined CV_ICC || \
        defined __GNUC__) && defined HAVE_TEGRA_OPTIMIZATION
    TEGRA_ROUND_DBL(value);
#elif defined CV_ICC || defined __GNUC__
# if CV_VFP
    ARM_ROUND_DBL(value);
# else
    return (int)lrint(value);
# endif
#else
    /* it's ok if round does not comply with IEEE754 standard;
     the tests should allow +/-1 difference when the tested functions use round */
    return (int)(value + (value >= 0 ? 0.5 : -0.5));
#endif
}
/** @brief Rounds floating-point number to the nearest integer not larger than the original.

The function computes an integer i such that:
\f[i \le \texttt{value} < i+1\f]
@param value floating-point number. If the value is outside of INT_MIN ... INT_MAX range, the
result is not defined.
*/
CV_INLINE int cvFloor( double value )
{
#if (defined _MSC_VER && defined _M_X64 || (defined __GNUC__ && defined __SSE2__ && !defined __APPLE__)) && !defined(__CUDACC__)
    __m128d t = _mm_set_sd( value );
    int i = _mm_cvtsd_si32(t);
    return i - _mm_movemask_pd(_mm_cmplt_sd(t, _mm_cvtsi32_sd(t,i)));
#elif defined __GNUC__
    int i = (int)value;
    return i - (i > value);
#else
    int i = cvRound(value);
    float diff = (float)(value - i);
    return i - (diff < 0);
#endif
}
/** @brief Rounds floating-point number to the nearest integer not smaller than the original.

The function computes an integer i such that:
\f[i - 1 < \texttt{value} \le i\f]
@param value floating-point number. If the value is outside of INT_MIN ... INT_MAX range, the
result is not defined.
*/
CV_INLINE int cvCeil( double value )
{
#if (defined _MSC_VER && defined _M_X64 || (defined __GNUC__ && defined __SSE2__&& !defined __APPLE__)) && !defined(__CUDACC__)
    __m128d t = _mm_set_sd( value );
    int i = _mm_cvtsd_si32(t);
    return i + _mm_movemask_pd(_mm_cmplt_sd(_mm_cvtsi32_sd(t,i), t));
#elif defined __GNUC__
    int i = (int)value;
    return i + (i < value);
#else
    int i = cvRound(value);
    float diff = (float)(i - value);
    return i + (diff < 0);
#endif
}
/** @brief Determines if the argument is Not A Number.

@param value The input floating-point value

The function returns 1 if the argument is Not A Number (as defined by IEEE754 standard), 0
otherwise. */
CV_INLINE int cvIsNaN( double value )
{
    Cv64suf ieee754;
    ieee754.f = value;
    return ((unsigned)(ieee754.u >> 32) & 0x7fffffff) +
           ((unsigned)ieee754.u != 0) > 0x7ff00000;
}

/** @brief Determines if the argument is Infinity.

@param value The input floating-point value

The function returns 1 if the argument is a plus or minus infinity (as defined by IEEE754 standard)
and 0 otherwise. */
CV_INLINE int cvIsInf( double value )
{
    Cv64suf ieee754;
    ieee754.f = value;
    return ((unsigned)(ieee754.u >> 32) & 0x7fffffff) == 0x7ff00000 &&
           (unsigned)ieee754.u == 0;
}
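A few unambiguous sample values for the double overloads above; exact half-integers are deliberately avoided because on the SSE/x87 paths cvRound follows the current hardware rounding mode (round-half-to-even by default). A minimal sketch:

#include "opencv2/core/fast_math.hpp"
#include <cassert>
#include <limits>

int main()
{
    assert(cvRound( 3.2) ==  3);
    assert(cvRound(-3.7) == -4);
    assert(cvFloor(-1.2) == -2);  // largest integer not greater than the argument
    assert(cvCeil (-1.2) == -1);  // smallest integer not less than the argument
    assert(cvIsNaN(std::numeric_limits<double>::quiet_NaN()) == 1);
    assert(cvIsInf(std::numeric_limits<double>::infinity())  == 1);
    return 0;
}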
#ifdef __cplusplus
|
||||
|
||||
/** @overload */
|
||||
CV_INLINE int cvRound(float value)
|
||||
{
|
||||
#if ((defined _MSC_VER && defined _M_X64) || (defined __GNUC__ && defined __x86_64__ && \
|
||||
defined __SSE2__ && !defined __APPLE__)) && !defined(__CUDACC__)
|
||||
__m128 t = _mm_set_ss( value );
|
||||
return _mm_cvtss_si32(t);
|
||||
#elif defined _MSC_VER && defined _M_IX86
|
||||
int t;
|
||||
__asm
|
||||
{
|
||||
fld value;
|
||||
fistp t;
|
||||
}
|
||||
return t;
|
||||
#elif ((defined _MSC_VER && defined _M_ARM) || defined CV_ICC || \
|
||||
defined __GNUC__) && defined HAVE_TEGRA_OPTIMIZATION
|
||||
TEGRA_ROUND_FLT(value);
|
||||
#elif defined CV_ICC || defined __GNUC__
|
||||
# if CV_VFP
|
||||
ARM_ROUND_FLT(value);
|
||||
# else
|
||||
return (int)lrintf(value);
|
||||
# endif
|
||||
#else
|
||||
/* it's ok if round does not comply with IEEE754 standard;
|
||||
the tests should allow +/-1 difference when the tested functions use round */
|
||||
return (int)(value + (value >= 0 ? 0.5f : -0.5f));
|
||||
#endif
|
||||
}
|
||||
|
||||
/** @overload */
|
||||
CV_INLINE int cvRound( int value )
|
||||
{
|
||||
return value;
|
||||
}
|
||||
|
||||
/** @overload */
|
||||
CV_INLINE int cvFloor( float value )
|
||||
{
|
||||
#if (defined _MSC_VER && defined _M_X64 || (defined __GNUC__ && defined __SSE2__ && !defined __APPLE__)) && !defined(__CUDACC__)
|
||||
__m128 t = _mm_set_ss( value );
|
||||
int i = _mm_cvtss_si32(t);
|
||||
return i - _mm_movemask_ps(_mm_cmplt_ss(t, _mm_cvtsi32_ss(t,i)));
|
||||
#elif defined __GNUC__
|
||||
int i = (int)value;
|
||||
return i - (i > value);
|
||||
#else
|
||||
int i = cvRound(value);
|
||||
float diff = (float)(value - i);
|
||||
return i - (diff < 0);
|
||||
#endif
|
||||
}
|
||||
|
||||
/** @overload */
|
||||
CV_INLINE int cvFloor( int value )
|
||||
{
|
||||
return value;
|
||||
}
|
||||
|
||||
/** @overload */
|
||||
CV_INLINE int cvCeil( float value )
|
||||
{
|
||||
#if (defined _MSC_VER && defined _M_X64 || (defined __GNUC__ && defined __SSE2__&& !defined __APPLE__)) && !defined(__CUDACC__)
|
||||
__m128 t = _mm_set_ss( value );
|
||||
int i = _mm_cvtss_si32(t);
|
||||
return i + _mm_movemask_ps(_mm_cmplt_ss(_mm_cvtsi32_ss(t,i), t));
|
||||
#elif defined __GNUC__
|
||||
int i = (int)value;
|
||||
return i + (i < value);
|
||||
#else
|
||||
int i = cvRound(value);
|
||||
float diff = (float)(i - value);
|
||||
return i + (diff < 0);
|
||||
#endif
|
||||
}
|
||||
|
||||
/** @overload */
|
||||
CV_INLINE int cvCeil( int value )
|
||||
{
|
||||
return value;
|
||||
}
|
||||
|
||||
/** @overload */
|
||||
CV_INLINE int cvIsNaN( float value )
|
||||
{
|
||||
Cv32suf ieee754;
|
||||
ieee754.f = value;
|
||||
return (ieee754.u & 0x7fffffff) > 0x7f800000;
|
||||
}
|
||||
|
||||
/** @overload */
|
||||
CV_INLINE int cvIsInf( float value )
|
||||
{
|
||||
Cv32suf ieee754;
|
||||
ieee754.f = value;
|
||||
return (ieee754.u & 0x7fffffff) == 0x7f800000;
|
||||
}
|
||||
|
||||
#endif // __cplusplus
|
||||
|
||||
//! @} core_utils
|
||||
|
||||
#endif
|
modules/core/include/opencv2/core/hal/hal.hpp (new file, 218 lines)
@@ -0,0 +1,218 @@
|
||||
/*M///////////////////////////////////////////////////////////////////////////////////////
|
||||
//
|
||||
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
|
||||
//
|
||||
// By downloading, copying, installing or using the software you agree to this license.
|
||||
// If you do not agree to this license, do not download, install,
|
||||
// copy or use the software.
|
||||
//
|
||||
//
|
||||
// License Agreement
|
||||
// For Open Source Computer Vision Library
|
||||
//
|
||||
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
|
||||
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
|
||||
// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
|
||||
// Copyright (C) 2015, Itseez Inc., all rights reserved.
|
||||
// Third party copyrights are property of their respective owners.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without modification,
|
||||
// are permitted provided that the following conditions are met:
|
||||
//
|
||||
// * Redistribution's of source code must retain the above copyright notice,
|
||||
// this list of conditions and the following disclaimer.
|
||||
//
|
||||
// * Redistribution's in binary form must reproduce the above copyright notice,
|
||||
// this list of conditions and the following disclaimer in the documentation
|
||||
// and/or other materials provided with the distribution.
|
||||
//
|
||||
// * The name of the copyright holders may not be used to endorse or promote products
|
||||
// derived from this software without specific prior written permission.
|
||||
//
|
||||
// This software is provided by the copyright holders and contributors "as is" and
|
||||
// any express or implied warranties, including, but not limited to, the implied
|
||||
// warranties of merchantability and fitness for a particular purpose are disclaimed.
|
||||
// In no event shall the Intel Corporation or contributors be liable for any direct,
|
||||
// indirect, incidental, special, exemplary, or consequential damages
|
||||
// (including, but not limited to, procurement of substitute goods or services;
|
||||
// loss of use, data, or profits; or business interruption) however caused
|
||||
// and on any theory of liability, whether in contract, strict liability,
|
||||
// or tort (including negligence or otherwise) arising in any way out of
|
||||
// the use of this software, even if advised of the possibility of such damage.
|
||||
//
|
||||
//M*/
|
||||
#ifndef __OPENCV_HAL_HPP__
#define __OPENCV_HAL_HPP__

#include "opencv2/core/cvdef.h"
#include "opencv2/core/hal/interface.h"

//! @cond IGNORED
#define CALL_HAL(name, fun, ...) \
    int res = fun(__VA_ARGS__); \
    if (res == CV_HAL_ERROR_OK) \
        return; \
    else if (res != CV_HAL_ERROR_NOT_IMPLEMENTED) \
        CV_Error_(cv::Error::StsInternal, \
            ("HAL implementation " CVAUX_STR(name) " ==> " CVAUX_STR(fun) " returned %d (0x%08x)", res, res));
//! @endcond
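CALL_HAL encodes the dispatch policy: call the replacement first, return immediately on CV_HAL_ERROR_OK, fall through to the default code on CV_HAL_ERROR_NOT_IMPLEMENTED, and raise StsInternal for anything else. A hypothetical illustration (hal_add8u_demo and hal_replacement_add8u are invented names for this sketch, not part of the OpenCV HAL):

#include "opencv2/core.hpp"
#include "opencv2/core/hal/hal.hpp"

// Invented replacement: reports "not implemented" so the caller falls back.
static int hal_replacement_add8u(const uchar*, size_t, const uchar*, size_t,
                                 uchar*, size_t, int, int)
{
    return CV_HAL_ERROR_NOT_IMPLEMENTED;
}

static void hal_add8u_demo(const uchar* src1, size_t step1,
                           const uchar* src2, size_t step2,
                           uchar* dst, size_t step, int width, int height)
{
    CALL_HAL(add8u_demo, hal_replacement_add8u,
             src1, step1, src2, step2, dst, step, width, height)
    // ...default (non-accelerated) implementation would follow here...
}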
namespace cv { namespace hal {

//! @addtogroup core_hal_functions
//! @{

CV_EXPORTS int normHamming(const uchar* a, int n);
|
||||
CV_EXPORTS int normHamming(const uchar* a, const uchar* b, int n);
|
||||
|
||||
CV_EXPORTS int normHamming(const uchar* a, int n, int cellSize);
|
||||
CV_EXPORTS int normHamming(const uchar* a, const uchar* b, int n, int cellSize);
|
||||
|
||||
CV_EXPORTS int LU32f(float* A, size_t astep, int m, float* b, size_t bstep, int n);
|
||||
CV_EXPORTS int LU64f(double* A, size_t astep, int m, double* b, size_t bstep, int n);
|
||||
CV_EXPORTS bool Cholesky32f(float* A, size_t astep, int m, float* b, size_t bstep, int n);
|
||||
CV_EXPORTS bool Cholesky64f(double* A, size_t astep, int m, double* b, size_t bstep, int n);
|
||||
|
||||
CV_EXPORTS int normL1_(const uchar* a, const uchar* b, int n);
|
||||
CV_EXPORTS float normL1_(const float* a, const float* b, int n);
|
||||
CV_EXPORTS float normL2Sqr_(const float* a, const float* b, int n);
|
||||
|
||||
CV_EXPORTS void exp32f(const float* src, float* dst, int n);
|
||||
CV_EXPORTS void exp64f(const double* src, double* dst, int n);
|
||||
CV_EXPORTS void log32f(const float* src, float* dst, int n);
|
||||
CV_EXPORTS void log64f(const double* src, double* dst, int n);
|
||||
|
||||
CV_EXPORTS void fastAtan2(const float* y, const float* x, float* dst, int n, bool angleInDegrees);
|
||||
CV_EXPORTS void magnitude32f(const float* x, const float* y, float* dst, int n);
|
||||
CV_EXPORTS void magnitude64f(const double* x, const double* y, double* dst, int n);
|
||||
CV_EXPORTS void sqrt32f(const float* src, float* dst, int len);
|
||||
CV_EXPORTS void sqrt64f(const double* src, double* dst, int len);
|
||||
CV_EXPORTS void invSqrt32f(const float* src, float* dst, int len);
|
||||
CV_EXPORTS void invSqrt64f(const double* src, double* dst, int len);
|
||||
|
||||
CV_EXPORTS void split8u(const uchar* src, uchar** dst, int len, int cn );
|
||||
CV_EXPORTS void split16u(const ushort* src, ushort** dst, int len, int cn );
|
||||
CV_EXPORTS void split32s(const int* src, int** dst, int len, int cn );
|
||||
CV_EXPORTS void split64s(const int64* src, int64** dst, int len, int cn );
|
||||
|
||||
CV_EXPORTS void merge8u(const uchar** src, uchar* dst, int len, int cn );
|
||||
CV_EXPORTS void merge16u(const ushort** src, ushort* dst, int len, int cn );
|
||||
CV_EXPORTS void merge32s(const int** src, int* dst, int len, int cn );
|
||||
CV_EXPORTS void merge64s(const int64** src, int64* dst, int len, int cn );
|
||||
|
||||
CV_EXPORTS void add8u( const uchar* src1, size_t step1, const uchar* src2, size_t step2, uchar* dst, size_t step, int width, int height, void* );
|
||||
CV_EXPORTS void add8s( const schar* src1, size_t step1, const schar* src2, size_t step2, schar* dst, size_t step, int width, int height, void* );
|
||||
CV_EXPORTS void add16u( const ushort* src1, size_t step1, const ushort* src2, size_t step2, ushort* dst, size_t step, int width, int height, void* );
|
||||
CV_EXPORTS void add16s( const short* src1, size_t step1, const short* src2, size_t step2, short* dst, size_t step, int width, int height, void* );
|
||||
CV_EXPORTS void add32s( const int* src1, size_t step1, const int* src2, size_t step2, int* dst, size_t step, int width, int height, void* );
|
||||
CV_EXPORTS void add32f( const float* src1, size_t step1, const float* src2, size_t step2, float* dst, size_t step, int width, int height, void* );
|
||||
CV_EXPORTS void add64f( const double* src1, size_t step1, const double* src2, size_t step2, double* dst, size_t step, int width, int height, void* );
|
||||
|
||||
CV_EXPORTS void sub8u( const uchar* src1, size_t step1, const uchar* src2, size_t step2, uchar* dst, size_t step, int width, int height, void* );
|
||||
CV_EXPORTS void sub8s( const schar* src1, size_t step1, const schar* src2, size_t step2, schar* dst, size_t step, int width, int height, void* );
|
||||
CV_EXPORTS void sub16u( const ushort* src1, size_t step1, const ushort* src2, size_t step2, ushort* dst, size_t step, int width, int height, void* );
|
||||
CV_EXPORTS void sub16s( const short* src1, size_t step1, const short* src2, size_t step2, short* dst, size_t step, int width, int height, void* );
|
||||
CV_EXPORTS void sub32s( const int* src1, size_t step1, const int* src2, size_t step2, int* dst, size_t step, int width, int height, void* );
|
||||
CV_EXPORTS void sub32f( const float* src1, size_t step1, const float* src2, size_t step2, float* dst, size_t step, int width, int height, void* );
|
||||
CV_EXPORTS void sub64f( const double* src1, size_t step1, const double* src2, size_t step2, double* dst, size_t step, int width, int height, void* );
|
||||
|
||||
CV_EXPORTS void max8u( const uchar* src1, size_t step1, const uchar* src2, size_t step2, uchar* dst, size_t step, int width, int height, void* );
|
||||
CV_EXPORTS void max8s( const schar* src1, size_t step1, const schar* src2, size_t step2, schar* dst, size_t step, int width, int height, void* );
|
||||
CV_EXPORTS void max16u( const ushort* src1, size_t step1, const ushort* src2, size_t step2, ushort* dst, size_t step, int width, int height, void* );
|
||||
CV_EXPORTS void max16s( const short* src1, size_t step1, const short* src2, size_t step2, short* dst, size_t step, int width, int height, void* );
|
||||
CV_EXPORTS void max32s( const int* src1, size_t step1, const int* src2, size_t step2, int* dst, size_t step, int width, int height, void* );
|
||||
CV_EXPORTS void max32f( const float* src1, size_t step1, const float* src2, size_t step2, float* dst, size_t step, int width, int height, void* );
|
||||
CV_EXPORTS void max64f( const double* src1, size_t step1, const double* src2, size_t step2, double* dst, size_t step, int width, int height, void* );
|
||||
|
||||
CV_EXPORTS void min8u( const uchar* src1, size_t step1, const uchar* src2, size_t step2, uchar* dst, size_t step, int width, int height, void* );
|
||||
CV_EXPORTS void min8s( const schar* src1, size_t step1, const schar* src2, size_t step2, schar* dst, size_t step, int width, int height, void* );
|
||||
CV_EXPORTS void min16u( const ushort* src1, size_t step1, const ushort* src2, size_t step2, ushort* dst, size_t step, int width, int height, void* );
|
||||
CV_EXPORTS void min16s( const short* src1, size_t step1, const short* src2, size_t step2, short* dst, size_t step, int width, int height, void* );
|
||||
CV_EXPORTS void min32s( const int* src1, size_t step1, const int* src2, size_t step2, int* dst, size_t step, int width, int height, void* );
|
||||
CV_EXPORTS void min32f( const float* src1, size_t step1, const float* src2, size_t step2, float* dst, size_t step, int width, int height, void* );
|
||||
CV_EXPORTS void min64f( const double* src1, size_t step1, const double* src2, size_t step2, double* dst, size_t step, int width, int height, void* );
|
||||
|
||||
CV_EXPORTS void absdiff8u( const uchar* src1, size_t step1, const uchar* src2, size_t step2, uchar* dst, size_t step, int width, int height, void* );
|
||||
CV_EXPORTS void absdiff8s( const schar* src1, size_t step1, const schar* src2, size_t step2, schar* dst, size_t step, int width, int height, void* );
|
||||
CV_EXPORTS void absdiff16u( const ushort* src1, size_t step1, const ushort* src2, size_t step2, ushort* dst, size_t step, int width, int height, void* );
|
||||
CV_EXPORTS void absdiff16s( const short* src1, size_t step1, const short* src2, size_t step2, short* dst, size_t step, int width, int height, void* );
|
||||
CV_EXPORTS void absdiff32s( const int* src1, size_t step1, const int* src2, size_t step2, int* dst, size_t step, int width, int height, void* );
|
||||
CV_EXPORTS void absdiff32f( const float* src1, size_t step1, const float* src2, size_t step2, float* dst, size_t step, int width, int height, void* );
|
||||
CV_EXPORTS void absdiff64f( const double* src1, size_t step1, const double* src2, size_t step2, double* dst, size_t step, int width, int height, void* );
|
||||
|
||||
CV_EXPORTS void and8u( const uchar* src1, size_t step1, const uchar* src2, size_t step2, uchar* dst, size_t step, int width, int height, void* );
|
||||
CV_EXPORTS void or8u( const uchar* src1, size_t step1, const uchar* src2, size_t step2, uchar* dst, size_t step, int width, int height, void* );
|
||||
CV_EXPORTS void xor8u( const uchar* src1, size_t step1, const uchar* src2, size_t step2, uchar* dst, size_t step, int width, int height, void* );
|
||||
CV_EXPORTS void not8u( const uchar* src1, size_t step1, const uchar* src2, size_t step2, uchar* dst, size_t step, int width, int height, void* );
|
||||
|
||||
CV_EXPORTS void cmp8u(const uchar* src1, size_t step1, const uchar* src2, size_t step2, uchar* dst, size_t step, int width, int height, void* _cmpop);
|
||||
CV_EXPORTS void cmp8s(const schar* src1, size_t step1, const schar* src2, size_t step2, uchar* dst, size_t step, int width, int height, void* _cmpop);
|
||||
CV_EXPORTS void cmp16u(const ushort* src1, size_t step1, const ushort* src2, size_t step2, uchar* dst, size_t step, int width, int height, void* _cmpop);
|
||||
CV_EXPORTS void cmp16s(const short* src1, size_t step1, const short* src2, size_t step2, uchar* dst, size_t step, int width, int height, void* _cmpop);
|
||||
CV_EXPORTS void cmp32s(const int* src1, size_t step1, const int* src2, size_t step2, uchar* dst, size_t step, int width, int height, void* _cmpop);
|
||||
CV_EXPORTS void cmp32f(const float* src1, size_t step1, const float* src2, size_t step2, uchar* dst, size_t step, int width, int height, void* _cmpop);
|
||||
CV_EXPORTS void cmp64f(const double* src1, size_t step1, const double* src2, size_t step2, uchar* dst, size_t step, int width, int height, void* _cmpop);
|
||||
|
||||
CV_EXPORTS void mul8u( const uchar* src1, size_t step1, const uchar* src2, size_t step2, uchar* dst, size_t step, int width, int height, void* scale);
|
||||
CV_EXPORTS void mul8s( const schar* src1, size_t step1, const schar* src2, size_t step2, schar* dst, size_t step, int width, int height, void* scale);
|
||||
CV_EXPORTS void mul16u( const ushort* src1, size_t step1, const ushort* src2, size_t step2, ushort* dst, size_t step, int width, int height, void* scale);
|
||||
CV_EXPORTS void mul16s( const short* src1, size_t step1, const short* src2, size_t step2, short* dst, size_t step, int width, int height, void* scale);
|
||||
CV_EXPORTS void mul32s( const int* src1, size_t step1, const int* src2, size_t step2, int* dst, size_t step, int width, int height, void* scale);
|
||||
CV_EXPORTS void mul32f( const float* src1, size_t step1, const float* src2, size_t step2, float* dst, size_t step, int width, int height, void* scale);
|
||||
CV_EXPORTS void mul64f( const double* src1, size_t step1, const double* src2, size_t step2, double* dst, size_t step, int width, int height, void* scale);
|
||||
|
||||
CV_EXPORTS void div8u( const uchar* src1, size_t step1, const uchar* src2, size_t step2, uchar* dst, size_t step, int width, int height, void* scale);
|
||||
CV_EXPORTS void div8s( const schar* src1, size_t step1, const schar* src2, size_t step2, schar* dst, size_t step, int width, int height, void* scale);
|
||||
CV_EXPORTS void div16u( const ushort* src1, size_t step1, const ushort* src2, size_t step2, ushort* dst, size_t step, int width, int height, void* scale);
|
||||
CV_EXPORTS void div16s( const short* src1, size_t step1, const short* src2, size_t step2, short* dst, size_t step, int width, int height, void* scale);
|
||||
CV_EXPORTS void div32s( const int* src1, size_t step1, const int* src2, size_t step2, int* dst, size_t step, int width, int height, void* scale);
|
||||
CV_EXPORTS void div32f( const float* src1, size_t step1, const float* src2, size_t step2, float* dst, size_t step, int width, int height, void* scale);
|
||||
CV_EXPORTS void div64f( const double* src1, size_t step1, const double* src2, size_t step2, double* dst, size_t step, int width, int height, void* scale);
|
||||
|
||||
CV_EXPORTS void recip8u( const uchar* src1, size_t step1, const uchar* src2, size_t step2, uchar* dst, size_t step, int width, int height, void* scale);
|
||||
CV_EXPORTS void recip8s( const schar* src1, size_t step1, const schar* src2, size_t step2, schar* dst, size_t step, int width, int height, void* scale);
|
||||
CV_EXPORTS void recip16u( const ushort* src1, size_t step1, const ushort* src2, size_t step2, ushort* dst, size_t step, int width, int height, void* scale);
|
||||
CV_EXPORTS void recip16s( const short* src1, size_t step1, const short* src2, size_t step2, short* dst, size_t step, int width, int height, void* scale);
|
||||
CV_EXPORTS void recip32s( const int* src1, size_t step1, const int* src2, size_t step2, int* dst, size_t step, int width, int height, void* scale);
|
||||
CV_EXPORTS void recip32f( const float* src1, size_t step1, const float* src2, size_t step2, float* dst, size_t step, int width, int height, void* scale);
|
||||
CV_EXPORTS void recip64f( const double* src1, size_t step1, const double* src2, size_t step2, double* dst, size_t step, int width, int height, void* scale);
|
||||
|
||||
CV_EXPORTS void addWeighted8u( const uchar* src1, size_t step1, const uchar* src2, size_t step2, uchar* dst, size_t step, int width, int height, void* _scalars );
|
||||
CV_EXPORTS void addWeighted8s( const schar* src1, size_t step1, const schar* src2, size_t step2, schar* dst, size_t step, int width, int height, void* scalars );
|
||||
CV_EXPORTS void addWeighted16u( const ushort* src1, size_t step1, const ushort* src2, size_t step2, ushort* dst, size_t step, int width, int height, void* scalars );
|
||||
CV_EXPORTS void addWeighted16s( const short* src1, size_t step1, const short* src2, size_t step2, short* dst, size_t step, int width, int height, void* scalars );
|
||||
CV_EXPORTS void addWeighted32s( const int* src1, size_t step1, const int* src2, size_t step2, int* dst, size_t step, int width, int height, void* scalars );
|
||||
CV_EXPORTS void addWeighted32f( const float* src1, size_t step1, const float* src2, size_t step2, float* dst, size_t step, int width, int height, void* scalars );
|
||||
CV_EXPORTS void addWeighted64f( const double* src1, size_t step1, const double* src2, size_t step2, double* dst, size_t step, int width, int height, void* scalars );
|
||||
|
||||
//! @} core_hal
|
||||
|
||||
//=============================================================================
|
||||
// for binary compatibility with 3.0
|
||||
|
||||
//! @cond IGNORED
|
||||
|
||||
CV_EXPORTS int LU(float* A, size_t astep, int m, float* b, size_t bstep, int n);
|
||||
CV_EXPORTS int LU(double* A, size_t astep, int m, double* b, size_t bstep, int n);
|
||||
CV_EXPORTS bool Cholesky(float* A, size_t astep, int m, float* b, size_t bstep, int n);
|
||||
CV_EXPORTS bool Cholesky(double* A, size_t astep, int m, double* b, size_t bstep, int n);
|
||||
|
||||
CV_EXPORTS void exp(const float* src, float* dst, int n);
|
||||
CV_EXPORTS void exp(const double* src, double* dst, int n);
|
||||
CV_EXPORTS void log(const float* src, float* dst, int n);
|
||||
CV_EXPORTS void log(const double* src, double* dst, int n);
|
||||
|
||||
CV_EXPORTS void magnitude(const float* x, const float* y, float* dst, int n);
|
||||
CV_EXPORTS void magnitude(const double* x, const double* y, double* dst, int n);
|
||||
CV_EXPORTS void sqrt(const float* src, float* dst, int len);
|
||||
CV_EXPORTS void sqrt(const double* src, double* dst, int len);
|
||||
CV_EXPORTS void invSqrt(const float* src, float* dst, int len);
|
||||
CV_EXPORTS void invSqrt(const double* src, double* dst, int len);
|
||||
|
||||
//! @endcond
|
||||
|
||||
}} //cv::hal
|
||||
|
||||
#endif //__OPENCV_HAL_HPP__
|
modules/core/include/opencv2/core/hal/interface.h (new file, 69 lines)
@@ -0,0 +1,69 @@
#ifndef _HAL_INTERFACE_HPP_INCLUDED_
#define _HAL_INTERFACE_HPP_INCLUDED_

//! @addtogroup core_hal_interface
//! @{

#define CV_HAL_ERROR_OK 0
#define CV_HAL_ERROR_NOT_IMPLEMENTED 1
#define CV_HAL_ERROR_UNKNOWN -1

#define CV_HAL_CMP_EQ 0
#define CV_HAL_CMP_GT 1
#define CV_HAL_CMP_GE 2
#define CV_HAL_CMP_LT 3
#define CV_HAL_CMP_LE 4
#define CV_HAL_CMP_NE 5
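From the implementer's side the error codes work the other way around: a replacement returns CV_HAL_ERROR_OK when it handled the call, CV_HAL_ERROR_NOT_IMPLEMENTED to let OpenCV's built-in code run, and any other value (e.g. CV_HAL_ERROR_UNKNOWN) to signal a hard failure. A hypothetical sketch (example_hal_cmp8u is an invented name, not a real HAL entry point):

#include "opencv2/core/hal/interface.h"

// Invented example: handles only equality comparison, defers everything else.
static int example_hal_cmp8u(const uchar* src1, const uchar* src2,
                             uchar* dst, int len, int cmpop)
{
    if (cmpop != CV_HAL_CMP_EQ)
        return CV_HAL_ERROR_NOT_IMPLEMENTED;  // fall back to the default path
    for (int i = 0; i < len; i++)
        dst[i] = (uchar)(src1[i] == src2[i] ? 255 : 0);
    return CV_HAL_ERROR_OK;
}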
#ifdef __cplusplus
#include <cstddef>
#else
#include <stddef.h>
#endif

/* primitive types */
/*
schar  - signed 1 byte integer
uchar  - unsigned 1 byte integer
short  - signed 2 byte integer
ushort - unsigned 2 byte integer
int    - signed 4 byte integer
uint   - unsigned 4 byte integer
int64  - signed 8 byte integer
uint64 - unsigned 8 byte integer
*/

#if !defined _MSC_VER && !defined __BORLANDC__
|
||||
# if defined __cplusplus && __cplusplus >= 201103L && !defined __APPLE__
|
||||
# include <cstdint>
|
||||
typedef std::uint32_t uint;
|
||||
# else
|
||||
# include <stdint.h>
|
||||
typedef uint32_t uint;
|
||||
# endif
|
||||
#else
|
||||
typedef unsigned uint;
|
||||
#endif
|
||||
|
||||
typedef signed char schar;
|
||||
|
||||
#ifndef __IPL_H__
|
||||
typedef unsigned char uchar;
|
||||
typedef unsigned short ushort;
|
||||
#endif
|
||||
|
||||
#if defined _MSC_VER || defined __BORLANDC__
|
||||
typedef __int64 int64;
|
||||
typedef unsigned __int64 uint64;
|
||||
# define CV_BIG_INT(n) n##I64
|
||||
# define CV_BIG_UINT(n) n##UI64
|
||||
#else
|
||||
typedef int64_t int64;
|
||||
typedef uint64_t uint64;
|
||||
# define CV_BIG_INT(n) n##LL
|
||||
# define CV_BIG_UINT(n) n##ULL
|
||||
#endif
|
||||
|
||||
//! @}
|
||||
|
||||
#endif
|
modules/core/include/opencv2/core/hal/intrin.hpp (new file, 320 lines)
@@ -0,0 +1,320 @@
|
||||
/*M///////////////////////////////////////////////////////////////////////////////////////
|
||||
//
|
||||
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
|
||||
//
|
||||
// By downloading, copying, installing or using the software you agree to this license.
|
||||
// If you do not agree to this license, do not download, install,
|
||||
// copy or use the software.
|
||||
//
|
||||
//
|
||||
// License Agreement
|
||||
// For Open Source Computer Vision Library
|
||||
//
|
||||
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
|
||||
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
|
||||
// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
|
||||
// Copyright (C) 2015, Itseez Inc., all rights reserved.
|
||||
// Third party copyrights are property of their respective owners.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without modification,
|
||||
// are permitted provided that the following conditions are met:
|
||||
//
|
||||
// * Redistribution's of source code must retain the above copyright notice,
|
||||
// this list of conditions and the following disclaimer.
|
||||
//
|
||||
// * Redistribution's in binary form must reproduce the above copyright notice,
|
||||
// this list of conditions and the following disclaimer in the documentation
|
||||
// and/or other materials provided with the distribution.
|
||||
//
|
||||
// * The name of the copyright holders may not be used to endorse or promote products
|
||||
// derived from this software without specific prior written permission.
|
||||
//
|
||||
// This software is provided by the copyright holders and contributors "as is" and
|
||||
// any express or implied warranties, including, but not limited to, the implied
|
||||
// warranties of merchantability and fitness for a particular purpose are disclaimed.
|
||||
// In no event shall the Intel Corporation or contributors be liable for any direct,
|
||||
// indirect, incidental, special, exemplary, or consequential damages
|
||||
// (including, but not limited to, procurement of substitute goods or services;
|
||||
// loss of use, data, or profits; or business interruption) however caused
|
||||
// and on any theory of liability, whether in contract, strict liability,
|
||||
// or tort (including negligence or otherwise) arising in any way out of
|
||||
// the use of this software, even if advised of the possibility of such damage.
|
||||
//
|
||||
//M*/
|
||||
|
||||
#ifndef __OPENCV_HAL_INTRIN_HPP__
|
||||
#define __OPENCV_HAL_INTRIN_HPP__
|
||||
|
||||
#include <cmath>
|
||||
#include <float.h>
|
||||
#include <stdlib.h>
|
||||
#include "opencv2/core/cvdef.h"
|
||||
|
||||
#define OPENCV_HAL_ADD(a, b) ((a) + (b))
|
||||
#define OPENCV_HAL_AND(a, b) ((a) & (b))
|
||||
#define OPENCV_HAL_NOP(a) (a)
|
||||
#define OPENCV_HAL_1ST(a, b) (a)
|
||||
|
||||
// unlike HAL API, which is in cv::hal,
|
||||
// we put intrinsics into cv namespace to make its
|
||||
// access from within opencv code more accessible
|
||||
namespace cv {
|
||||
|
||||
//! @addtogroup core_hal_intrin
|
||||
//! @{
|
||||
|
||||
//! @cond IGNORED
|
||||
template<typename _Tp> struct V_TypeTraits
|
||||
{
|
||||
typedef _Tp int_type;
|
||||
typedef _Tp uint_type;
|
||||
typedef _Tp abs_type;
|
||||
typedef _Tp sum_type;
|
||||
|
||||
enum { delta = 0, shift = 0 };
|
||||
|
||||
static int_type reinterpret_int(_Tp x) { return x; }
|
||||
static uint_type reinterpet_uint(_Tp x) { return x; }
|
||||
static _Tp reinterpret_from_int(int_type x) { return (_Tp)x; }
|
||||
};
|
||||
|
||||
template<> struct V_TypeTraits<uchar>
|
||||
{
|
||||
typedef uchar value_type;
|
||||
typedef schar int_type;
|
||||
typedef uchar uint_type;
|
||||
typedef uchar abs_type;
|
||||
typedef int sum_type;
|
||||
|
||||
typedef ushort w_type;
|
||||
typedef unsigned q_type;
|
||||
|
||||
enum { delta = 128, shift = 8 };
|
||||
|
||||
static int_type reinterpret_int(value_type x) { return (int_type)x; }
|
||||
static uint_type reinterpret_uint(value_type x) { return (uint_type)x; }
|
||||
static value_type reinterpret_from_int(int_type x) { return (value_type)x; }
|
||||
};
|
||||
|
||||
template<> struct V_TypeTraits<schar>
|
||||
{
|
||||
typedef schar value_type;
|
||||
typedef schar int_type;
|
||||
typedef uchar uint_type;
|
||||
typedef uchar abs_type;
|
||||
typedef int sum_type;
|
||||
|
||||
typedef short w_type;
|
||||
typedef int q_type;
|
||||
|
||||
enum { delta = 128, shift = 8 };
|
||||
|
||||
static int_type reinterpret_int(value_type x) { return (int_type)x; }
|
||||
static uint_type reinterpret_uint(value_type x) { return (uint_type)x; }
|
||||
static value_type reinterpret_from_int(int_type x) { return (value_type)x; }
|
||||
};
|
||||
|
||||
template<> struct V_TypeTraits<ushort>
|
||||
{
|
||||
typedef ushort value_type;
|
||||
typedef short int_type;
|
||||
typedef ushort uint_type;
|
||||
typedef ushort abs_type;
|
||||
typedef int sum_type;
|
||||
|
||||
typedef unsigned w_type;
|
||||
typedef uchar nu_type;
|
||||
|
||||
enum { delta = 32768, shift = 16 };
|
||||
|
||||
static int_type reinterpret_int(value_type x) { return (int_type)x; }
|
||||
static uint_type reinterpret_uint(value_type x) { return (uint_type)x; }
|
||||
static value_type reinterpret_from_int(int_type x) { return (value_type)x; }
|
||||
};
|
||||
|
||||
template<> struct V_TypeTraits<short>
|
||||
{
|
||||
typedef short value_type;
|
||||
typedef short int_type;
|
||||
typedef ushort uint_type;
|
||||
typedef ushort abs_type;
|
||||
typedef int sum_type;
|
||||
|
||||
typedef int w_type;
|
||||
typedef uchar nu_type;
|
||||
typedef schar n_type;
|
||||
|
||||
enum { delta = 128, shift = 8 };
|
||||
|
||||
static int_type reinterpret_int(value_type x) { return (int_type)x; }
|
||||
static uint_type reinterpret_uint(value_type x) { return (uint_type)x; }
|
||||
static value_type reinterpret_from_int(int_type x) { return (value_type)x; }
|
||||
};
|
||||
|
||||
template<> struct V_TypeTraits<unsigned>
|
||||
{
|
||||
typedef unsigned value_type;
|
||||
typedef int int_type;
|
||||
typedef unsigned uint_type;
|
||||
typedef unsigned abs_type;
|
||||
typedef unsigned sum_type;
|
||||
|
||||
typedef uint64 w_type;
|
||||
typedef ushort nu_type;
|
||||
|
||||
static int_type reinterpret_int(value_type x) { return (int_type)x; }
|
||||
static uint_type reinterpret_uint(value_type x) { return (uint_type)x; }
|
||||
static value_type reinterpret_from_int(int_type x) { return (value_type)x; }
|
||||
};
|
||||
|
||||
template<> struct V_TypeTraits<int>
|
||||
{
|
||||
typedef int value_type;
|
||||
typedef int int_type;
|
||||
typedef unsigned uint_type;
|
||||
typedef unsigned abs_type;
|
||||
typedef int sum_type;
|
||||
|
||||
typedef int64 w_type;
|
||||
typedef short n_type;
|
||||
typedef ushort nu_type;
|
||||
|
||||
static int_type reinterpret_int(value_type x) { return (int_type)x; }
|
||||
static uint_type reinterpret_uint(value_type x) { return (uint_type)x; }
|
||||
static value_type reinterpret_from_int(int_type x) { return (value_type)x; }
|
||||
};
|
||||
|
||||
template<> struct V_TypeTraits<uint64>
|
||||
{
|
||||
typedef uint64 value_type;
|
||||
typedef int64 int_type;
|
||||
typedef uint64 uint_type;
|
||||
typedef uint64 abs_type;
|
||||
typedef uint64 sum_type;
|
||||
|
||||
typedef unsigned nu_type;
|
||||
|
||||
static int_type reinterpret_int(value_type x) { return (int_type)x; }
|
||||
static uint_type reinterpret_uint(value_type x) { return (uint_type)x; }
|
||||
static value_type reinterpret_from_int(int_type x) { return (value_type)x; }
|
||||
};
|
||||
|
||||
template<> struct V_TypeTraits<int64>
|
||||
{
|
||||
typedef int64 value_type;
|
||||
typedef int64 int_type;
|
||||
typedef uint64 uint_type;
|
||||
typedef uint64 abs_type;
|
||||
typedef int64 sum_type;
|
||||
|
||||
typedef int nu_type;
|
||||
|
||||
static int_type reinterpret_int(value_type x) { return (int_type)x; }
|
||||
static uint_type reinterpret_uint(value_type x) { return (uint_type)x; }
|
||||
static value_type reinterpret_from_int(int_type x) { return (value_type)x; }
|
||||
};
|
||||
|
||||
|
||||
template<> struct V_TypeTraits<float>
|
||||
{
|
||||
typedef float value_type;
|
||||
typedef int int_type;
|
||||
typedef unsigned uint_type;
|
||||
typedef float abs_type;
|
||||
typedef float sum_type;
|
||||
|
||||
typedef double w_type;
|
||||
|
||||
static int_type reinterpret_int(value_type x)
|
||||
{
|
||||
Cv32suf u;
|
||||
u.f = x;
|
||||
return u.i;
|
||||
}
|
||||
static uint_type reinterpet_uint(value_type x)
|
||||
{
|
||||
Cv32suf u;
|
||||
u.f = x;
|
||||
return u.u;
|
||||
}
|
||||
static value_type reinterpret_from_int(int_type x)
|
||||
{
|
||||
Cv32suf u;
|
||||
u.i = x;
|
||||
return u.f;
|
||||
}
|
||||
};
|
||||
|
||||
template<> struct V_TypeTraits<double>
|
||||
{
|
||||
typedef double value_type;
|
||||
typedef int64 int_type;
|
||||
typedef uint64 uint_type;
|
||||
typedef double abs_type;
|
||||
typedef double sum_type;
|
||||
static int_type reinterpret_int(value_type x)
|
||||
{
|
||||
Cv64suf u;
|
||||
u.f = x;
|
||||
return u.i;
|
||||
}
|
||||
static uint_type reinterpet_uint(value_type x)
|
||||
{
|
||||
Cv64suf u;
|
||||
u.f = x;
|
||||
return u.u;
|
||||
}
|
||||
static value_type reinterpret_from_int(int_type x)
|
||||
{
|
||||
Cv64suf u;
|
||||
u.i = x;
|
||||
return u.f;
|
||||
}
|
||||
};
|
||||
|
||||
template <typename T> struct V_SIMD128Traits
|
||||
{
|
||||
enum { nlanes = 16 / sizeof(T) };
|
||||
};
//! @endcond

//! @}

}

#ifdef CV_DOXYGEN
# undef CV_SSE2
# undef CV_NEON
#endif

#if CV_SSE2

#include "opencv2/core/hal/intrin_sse.hpp"

#elif CV_NEON

#include "opencv2/core/hal/intrin_neon.hpp"

#else

#include "opencv2/core/hal/intrin_cpp.hpp"

#endif

//! @addtogroup core_hal_intrin
//! @{

#ifndef CV_SIMD128
//! Set to 1 if current compiler supports vector extensions (NEON or SSE is enabled)
#define CV_SIMD128 0
#endif

#ifndef CV_SIMD128_64F
//! Set to 1 if current intrinsics implementation supports 64-bit float vectors
#define CV_SIMD128_64F 0
#endif

//! @}

#endif
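With the dispatch above, the same source compiles against the SSE, NEON, or plain C++ implementation of the 128-bit vector types. A minimal sketch using v_float32x4, v_load and v_store from this new API (guarded by CV_SIMD128 so non-vector builds take only the scalar loop):

#include "opencv2/core/hal/intrin.hpp"

using namespace cv;

// Add two float arrays four lanes at a time, with a scalar tail.
static void add_f32(const float* a, const float* b, float* dst, int n)
{
    int i = 0;
#if CV_SIMD128
    for ( ; i <= n - v_float32x4::nlanes; i += v_float32x4::nlanes)
    {
        v_float32x4 x = v_load(a + i), y = v_load(b + i);
        v_store(dst + i, x + y);
    }
#endif
    for ( ; i < n; i++)
        dst[i] = a[i] + b[i];
}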
modules/core/include/opencv2/core/hal/intrin_cpp.hpp (new file, 1738 lines)
File diff suppressed because it is too large
modules/core/include/opencv2/core/hal/intrin_neon.hpp (new file, 864 lines)
@@ -0,0 +1,864 @@
|
||||
/*M///////////////////////////////////////////////////////////////////////////////////////
|
||||
//
|
||||
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
|
||||
//
|
||||
// By downloading, copying, installing or using the software you agree to this license.
|
||||
// If you do not agree to this license, do not download, install,
|
||||
// copy or use the software.
|
||||
//
|
||||
//
|
||||
// License Agreement
|
||||
// For Open Source Computer Vision Library
|
||||
//
|
||||
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
|
||||
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
|
||||
// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
|
||||
// Copyright (C) 2015, Itseez Inc., all rights reserved.
|
||||
// Third party copyrights are property of their respective owners.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without modification,
|
||||
// are permitted provided that the following conditions are met:
|
||||
//
|
||||
// * Redistribution's of source code must retain the above copyright notice,
|
||||
// this list of conditions and the following disclaimer.
|
||||
//
|
||||
// * Redistribution's in binary form must reproduce the above copyright notice,
|
||||
// this list of conditions and the following disclaimer in the documentation
|
||||
// and/or other materials provided with the distribution.
|
||||
//
|
||||
// * The name of the copyright holders may not be used to endorse or promote products
|
||||
// derived from this software without specific prior written permission.
|
||||
//
|
||||
// This software is provided by the copyright holders and contributors "as is" and
|
||||
// any express or implied warranties, including, but not limited to, the implied
|
||||
// warranties of merchantability and fitness for a particular purpose are disclaimed.
|
||||
// In no event shall the Intel Corporation or contributors be liable for any direct,
|
||||
// indirect, incidental, special, exemplary, or consequential damages
|
||||
// (including, but not limited to, procurement of substitute goods or services;
|
||||
// loss of use, data, or profits; or business interruption) however caused
|
||||
// and on any theory of liability, whether in contract, strict liability,
|
||||
// or tort (including negligence or otherwise) arising in any way out of
|
||||
// the use of this software, even if advised of the possibility of such damage.
|
||||
//
|
||||
//M*/
|
||||
|
||||
#ifndef __OPENCV_HAL_INTRIN_NEON_HPP__
|
||||
#define __OPENCV_HAL_INTRIN_NEON_HPP__
|
||||
|
||||
#include <algorithm>
|
||||
|
||||
namespace cv
|
||||
{
|
||||
|
||||
//! @cond IGNORED
|
||||
|
||||
#define CV_SIMD128 1
|
||||
|
||||
struct v_uint8x16
|
||||
{
|
||||
typedef uchar lane_type;
|
||||
enum { nlanes = 16 };
|
||||
|
||||
v_uint8x16() {}
|
||||
explicit v_uint8x16(uint8x16_t v) : val(v) {}
|
||||
v_uint8x16(uchar v0, uchar v1, uchar v2, uchar v3, uchar v4, uchar v5, uchar v6, uchar v7,
|
||||
uchar v8, uchar v9, uchar v10, uchar v11, uchar v12, uchar v13, uchar v14, uchar v15)
|
||||
{
|
||||
uchar v[] = {v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15};
|
||||
val = vld1q_u8(v);
|
||||
}
|
||||
uchar get0() const
|
||||
{
|
||||
return vgetq_lane_u8(val, 0);
|
||||
}
|
||||
|
||||
uint8x16_t val;
|
||||
};
|
||||
|
||||
struct v_int8x16
|
||||
{
|
||||
typedef schar lane_type;
|
||||
enum { nlanes = 16 };
|
||||
|
||||
v_int8x16() {}
|
||||
explicit v_int8x16(int8x16_t v) : val(v) {}
|
||||
v_int8x16(schar v0, schar v1, schar v2, schar v3, schar v4, schar v5, schar v6, schar v7,
|
||||
schar v8, schar v9, schar v10, schar v11, schar v12, schar v13, schar v14, schar v15)
|
||||
{
|
||||
schar v[] = {v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15};
|
||||
val = vld1q_s8(v);
|
||||
}
|
||||
schar get0() const
|
||||
{
|
||||
return vgetq_lane_s8(val, 0);
|
||||
}
|
||||
|
||||
int8x16_t val;
|
||||
};
|
||||
|
||||
struct v_uint16x8
|
||||
{
|
||||
typedef ushort lane_type;
|
||||
enum { nlanes = 8 };
|
||||
|
||||
v_uint16x8() {}
|
||||
explicit v_uint16x8(uint16x8_t v) : val(v) {}
|
||||
v_uint16x8(ushort v0, ushort v1, ushort v2, ushort v3, ushort v4, ushort v5, ushort v6, ushort v7)
|
||||
{
|
||||
ushort v[] = {v0, v1, v2, v3, v4, v5, v6, v7};
|
||||
val = vld1q_u16(v);
|
||||
}
|
||||
ushort get0() const
|
||||
{
|
||||
return vgetq_lane_u16(val, 0);
|
||||
}
|
||||
|
||||
uint16x8_t val;
|
||||
};
|
||||
|
||||
struct v_int16x8
|
||||
{
|
||||
typedef short lane_type;
|
||||
enum { nlanes = 8 };
|
||||
|
||||
v_int16x8() {}
|
||||
explicit v_int16x8(int16x8_t v) : val(v) {}
|
||||
v_int16x8(short v0, short v1, short v2, short v3, short v4, short v5, short v6, short v7)
|
||||
{
|
||||
short v[] = {v0, v1, v2, v3, v4, v5, v6, v7};
|
||||
val = vld1q_s16(v);
|
||||
}
|
||||
short get0() const
|
||||
{
|
||||
return vgetq_lane_s16(val, 0);
|
||||
}
|
||||
|
||||
int16x8_t val;
|
||||
};
|
||||
|
||||
struct v_uint32x4
|
||||
{
|
||||
typedef unsigned lane_type;
|
||||
enum { nlanes = 4 };
|
||||
|
||||
v_uint32x4() {}
|
||||
explicit v_uint32x4(uint32x4_t v) : val(v) {}
|
||||
v_uint32x4(unsigned v0, unsigned v1, unsigned v2, unsigned v3)
|
||||
{
|
||||
unsigned v[] = {v0, v1, v2, v3};
|
||||
val = vld1q_u32(v);
|
||||
}
|
||||
unsigned get0() const
|
||||
{
|
||||
return vgetq_lane_u32(val, 0);
|
||||
}
|
||||
|
||||
uint32x4_t val;
|
||||
};
|
||||
|
||||
struct v_int32x4
|
||||
{
|
||||
typedef int lane_type;
|
||||
enum { nlanes = 4 };
|
||||
|
||||
v_int32x4() {}
|
||||
explicit v_int32x4(int32x4_t v) : val(v) {}
|
||||
v_int32x4(int v0, int v1, int v2, int v3)
|
||||
{
|
||||
int v[] = {v0, v1, v2, v3};
|
||||
val = vld1q_s32(v);
|
||||
}
|
||||
int get0() const
|
||||
{
|
||||
return vgetq_lane_s32(val, 0);
|
||||
}
|
||||
int32x4_t val;
|
||||
};
|
||||
|
||||
struct v_float32x4
|
||||
{
|
||||
typedef float lane_type;
|
||||
enum { nlanes = 4 };
|
||||
|
||||
v_float32x4() {}
|
||||
explicit v_float32x4(float32x4_t v) : val(v) {}
|
||||
v_float32x4(float v0, float v1, float v2, float v3)
|
||||
{
|
||||
float v[] = {v0, v1, v2, v3};
|
||||
val = vld1q_f32(v);
|
||||
}
|
||||
float get0() const
|
||||
{
|
||||
return vgetq_lane_f32(val, 0);
|
||||
}
|
||||
float32x4_t val;
|
||||
};
|
||||
|
||||
struct v_uint64x2
|
||||
{
|
||||
typedef uint64 lane_type;
|
||||
enum { nlanes = 2 };
|
||||
|
||||
v_uint64x2() {}
|
||||
explicit v_uint64x2(uint64x2_t v) : val(v) {}
|
||||
v_uint64x2(uint64 v0, uint64 v1)
|
||||
{
|
||||
uint64 v[] = {v0, v1};
|
||||
val = vld1q_u64(v);
|
||||
}
|
||||
uint64 get0() const
|
||||
{
|
||||
return vgetq_lane_u64(val, 0);
|
||||
}
|
||||
uint64x2_t val;
|
||||
};
|
||||
|
||||
struct v_int64x2
|
||||
{
|
||||
typedef int64 lane_type;
|
||||
enum { nlanes = 2 };
|
||||
|
||||
v_int64x2() {}
|
||||
explicit v_int64x2(int64x2_t v) : val(v) {}
|
||||
v_int64x2(int64 v0, int64 v1)
|
||||
{
|
||||
int64 v[] = {v0, v1};
|
||||
val = vld1q_s64(v);
|
||||
}
|
||||
int64 get0() const
|
||||
{
|
||||
return vgetq_lane_s64(val, 0);
|
||||
}
|
||||
int64x2_t val;
|
||||
};
|
||||
|
||||
#define OPENCV_HAL_IMPL_NEON_INIT(_Tpv, _Tp, suffix) \
|
||||
inline v_##_Tpv v_setzero_##suffix() { return v_##_Tpv(vdupq_n_##suffix((_Tp)0)); } \
|
||||
inline v_##_Tpv v_setall_##suffix(_Tp v) { return v_##_Tpv(vdupq_n_##suffix(v)); } \
|
||||
inline _Tpv##_t vreinterpretq_##suffix##_##suffix(_Tpv##_t v) { return v; } \
|
||||
inline v_uint8x16 v_reinterpret_as_u8(const v_##_Tpv& v) { return v_uint8x16(vreinterpretq_u8_##suffix(v.val)); } \
|
||||
inline v_int8x16 v_reinterpret_as_s8(const v_##_Tpv& v) { return v_int8x16(vreinterpretq_s8_##suffix(v.val)); } \
|
||||
inline v_uint16x8 v_reinterpret_as_u16(const v_##_Tpv& v) { return v_uint16x8(vreinterpretq_u16_##suffix(v.val)); } \
|
||||
inline v_int16x8 v_reinterpret_as_s16(const v_##_Tpv& v) { return v_int16x8(vreinterpretq_s16_##suffix(v.val)); } \
|
||||
inline v_uint32x4 v_reinterpret_as_u32(const v_##_Tpv& v) { return v_uint32x4(vreinterpretq_u32_##suffix(v.val)); } \
|
||||
inline v_int32x4 v_reinterpret_as_s32(const v_##_Tpv& v) { return v_int32x4(vreinterpretq_s32_##suffix(v.val)); } \
|
||||
inline v_uint64x2 v_reinterpret_as_u64(const v_##_Tpv& v) { return v_uint64x2(vreinterpretq_u64_##suffix(v.val)); } \
|
||||
inline v_int64x2 v_reinterpret_as_s64(const v_##_Tpv& v) { return v_int64x2(vreinterpretq_s64_##suffix(v.val)); } \
|
||||
inline v_float32x4 v_reinterpret_as_f32(const v_##_Tpv& v) { return v_float32x4(vreinterpretq_f32_##suffix(v.val)); }
|
||||
|
||||
OPENCV_HAL_IMPL_NEON_INIT(uint8x16, uchar, u8)
|
||||
OPENCV_HAL_IMPL_NEON_INIT(int8x16, schar, s8)
|
||||
OPENCV_HAL_IMPL_NEON_INIT(uint16x8, ushort, u16)
|
||||
OPENCV_HAL_IMPL_NEON_INIT(int16x8, short, s16)
|
||||
OPENCV_HAL_IMPL_NEON_INIT(uint32x4, unsigned, u32)
|
||||
OPENCV_HAL_IMPL_NEON_INIT(int32x4, int, s32)
|
||||
OPENCV_HAL_IMPL_NEON_INIT(uint64x2, uint64, u64)
|
||||
OPENCV_HAL_IMPL_NEON_INIT(int64x2, int64, s64)
|
||||
OPENCV_HAL_IMPL_NEON_INIT(float32x4, float, f32)
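// Illustrative sketch (not part of the patch; helper name is hypothetical):
// the OPENCV_HAL_IMPL_NEON_INIT expansions above generate per-type initializers
// and reinterpret casts, e.g. for the float lanes:
inline void example_init_usage()
{
    v_float32x4 zeros = v_setzero_f32();            // (0, 0, 0, 0)
    v_float32x4 ones  = v_setall_f32(1.0f);         // (1, 1, 1, 1)
    v_uint32x4  bits  = v_reinterpret_as_u32(ones); // raw IEEE-754 bits of 1.0f
    (void)zeros; (void)bits;
}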
|
||||
|
||||
#define OPENCV_HAL_IMPL_NEON_PACK(_Tpvec, _Tp, hreg, suffix, _Tpwvec, wsuffix, pack, op) \
|
||||
inline _Tpvec v_##pack(const _Tpwvec& a, const _Tpwvec& b) \
|
||||
{ \
|
||||
hreg a1 = vqmov##op##_##wsuffix(a.val), b1 = vqmov##op##_##wsuffix(b.val); \
|
||||
return _Tpvec(vcombine_##suffix(a1, b1)); \
|
||||
} \
|
||||
inline void v_##pack##_store(_Tp* ptr, const _Tpwvec& a) \
|
||||
{ \
|
||||
hreg a1 = vqmov##op##_##wsuffix(a.val); \
|
||||
vst1_##suffix(ptr, a1); \
|
||||
} \
|
||||
template<int n> inline \
|
||||
_Tpvec v_rshr_##pack(const _Tpwvec& a, const _Tpwvec& b) \
|
||||
{ \
|
||||
hreg a1 = vqrshr##op##_n_##wsuffix(a.val, n); \
|
||||
hreg b1 = vqrshr##op##_n_##wsuffix(b.val, n); \
|
||||
return _Tpvec(vcombine_##suffix(a1, b1)); \
|
||||
} \
|
||||
template<int n> inline \
|
||||
void v_rshr_##pack##_store(_Tp* ptr, const _Tpwvec& a) \
|
||||
{ \
|
||||
hreg a1 = vqrshr##op##_n_##wsuffix(a.val, n); \
|
||||
vst1_##suffix(ptr, a1); \
|
||||
}
|
||||
|
||||
OPENCV_HAL_IMPL_NEON_PACK(v_uint8x16, uchar, uint8x8_t, u8, v_uint16x8, u16, pack, n)
|
||||
OPENCV_HAL_IMPL_NEON_PACK(v_int8x16, schar, int8x8_t, s8, v_int16x8, s16, pack, n)
|
||||
OPENCV_HAL_IMPL_NEON_PACK(v_uint16x8, ushort, uint16x4_t, u16, v_uint32x4, u32, pack, n)
|
||||
OPENCV_HAL_IMPL_NEON_PACK(v_int16x8, short, int16x4_t, s16, v_int32x4, s32, pack, n)
|
||||
OPENCV_HAL_IMPL_NEON_PACK(v_uint32x4, unsigned, uint32x2_t, u32, v_uint64x2, u64, pack, n)
|
||||
OPENCV_HAL_IMPL_NEON_PACK(v_int32x4, int, int32x2_t, s32, v_int64x2, s64, pack, n)
|
||||
|
||||
OPENCV_HAL_IMPL_NEON_PACK(v_uint8x16, uchar, uint8x8_t, u8, v_int16x8, s16, pack_u, un)
|
||||
OPENCV_HAL_IMPL_NEON_PACK(v_uint16x8, ushort, uint16x4_t, u16, v_int32x4, s32, pack_u, un)
|
||||
|
||||
inline v_float32x4 v_matmul(const v_float32x4& v, const v_float32x4& m0,
|
||||
const v_float32x4& m1, const v_float32x4& m2,
|
||||
const v_float32x4& m3)
|
||||
{
|
||||
float32x2_t vl = vget_low_f32(v.val), vh = vget_high_f32(v.val);
|
||||
float32x4_t res = vmulq_lane_f32(m0.val, vl, 0);
|
||||
res = vmlaq_lane_f32(res, m1.val, vl, 1);
|
||||
res = vmlaq_lane_f32(res, m2.val, vh, 0);
|
||||
res = vmlaq_lane_f32(res, m3.val, vh, 1);
|
||||
return v_float32x4(res);
|
||||
}
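// Illustrative sketch (not part of the patch; helper name is hypothetical):
// v_matmul returns m0*v[0] + m1*v[1] + m2*v[2] + m3*v[3], i.e. the product of a
// 4x4 matrix given by its columns m0..m3 with a 4-element vector.
inline v_float32x4 example_transform_point(const v_float32x4& p,
                                            const v_float32x4& col0, const v_float32x4& col1,
                                            const v_float32x4& col2, const v_float32x4& col3)
{
    return v_matmul(p, col0, col1, col2, col3); // e.g. applying a 4x4 transform to a point
}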
|
||||
|
||||
#define OPENCV_HAL_IMPL_NEON_BIN_OP(bin_op, _Tpvec, intrin) \
|
||||
inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \
|
||||
{ \
|
||||
return _Tpvec(intrin(a.val, b.val)); \
|
||||
} \
|
||||
inline _Tpvec& operator bin_op##= (_Tpvec& a, const _Tpvec& b) \
|
||||
{ \
|
||||
a.val = intrin(a.val, b.val); \
|
||||
return a; \
|
||||
}
|
||||
|
||||
OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_uint8x16, vqaddq_u8)
|
||||
OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_uint8x16, vqsubq_u8)
|
||||
OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_int8x16, vqaddq_s8)
|
||||
OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_int8x16, vqsubq_s8)
|
||||
OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_uint16x8, vqaddq_u16)
|
||||
OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_uint16x8, vqsubq_u16)
|
||||
OPENCV_HAL_IMPL_NEON_BIN_OP(*, v_uint16x8, vmulq_u16)
|
||||
OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_int16x8, vqaddq_s16)
|
||||
OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_int16x8, vqsubq_s16)
|
||||
OPENCV_HAL_IMPL_NEON_BIN_OP(*, v_int16x8, vmulq_s16)
|
||||
OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_int32x4, vaddq_s32)
|
||||
OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_int32x4, vsubq_s32)
|
||||
OPENCV_HAL_IMPL_NEON_BIN_OP(*, v_int32x4, vmulq_s32)
|
||||
OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_uint32x4, vaddq_u32)
|
||||
OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_uint32x4, vsubq_u32)
|
||||
OPENCV_HAL_IMPL_NEON_BIN_OP(*, v_uint32x4, vmulq_u32)
|
||||
OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_float32x4, vaddq_f32)
|
||||
OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_float32x4, vsubq_f32)
|
||||
OPENCV_HAL_IMPL_NEON_BIN_OP(*, v_float32x4, vmulq_f32)
|
||||
OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_int64x2, vaddq_s64)
|
||||
OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_int64x2, vsubq_s64)
|
||||
OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_uint64x2, vaddq_u64)
|
||||
OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_uint64x2, vsubq_u64)
|
||||
|
||||
inline v_float32x4 operator / (const v_float32x4& a, const v_float32x4& b)
|
||||
{
|
||||
float32x4_t reciprocal = vrecpeq_f32(b.val);
|
||||
reciprocal = vmulq_f32(vrecpsq_f32(b.val, reciprocal), reciprocal);
|
||||
reciprocal = vmulq_f32(vrecpsq_f32(b.val, reciprocal), reciprocal);
|
||||
return v_float32x4(vmulq_f32(a.val, reciprocal));
|
||||
}
|
||||
inline v_float32x4& operator /= (v_float32x4& a, const v_float32x4& b)
|
||||
{
|
||||
float32x4_t reciprocal = vrecpeq_f32(b.val);
|
||||
reciprocal = vmulq_f32(vrecpsq_f32(b.val, reciprocal), reciprocal);
|
||||
reciprocal = vmulq_f32(vrecpsq_f32(b.val, reciprocal), reciprocal);
|
||||
a.val = vmulq_f32(a.val, reciprocal);
|
||||
return a;
|
||||
}
|
||||
|
||||
inline void v_mul_expand(const v_int16x8& a, const v_int16x8& b,
|
||||
v_int32x4& c, v_int32x4& d)
|
||||
{
|
||||
c.val = vmull_s16(vget_low_s16(a.val), vget_low_s16(b.val));
|
||||
d.val = vmull_s16(vget_high_s16(a.val), vget_high_s16(b.val));
|
||||
}
|
||||
|
||||
inline void v_mul_expand(const v_uint16x8& a, const v_uint16x8& b,
|
||||
v_uint32x4& c, v_uint32x4& d)
|
||||
{
|
||||
c.val = vmull_u16(vget_low_u16(a.val), vget_low_u16(b.val));
|
||||
d.val = vmull_u16(vget_high_u16(a.val), vget_high_u16(b.val));
|
||||
}
|
||||
|
||||
inline void v_mul_expand(const v_uint32x4& a, const v_uint32x4& b,
|
||||
v_uint64x2& c, v_uint64x2& d)
|
||||
{
|
||||
c.val = vmull_u32(vget_low_u32(a.val), vget_low_u32(b.val));
|
||||
d.val = vmull_u32(vget_high_u32(a.val), vget_high_u32(b.val));
|
||||
}
|
||||
|
||||
inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b)
|
||||
{
|
||||
int32x4_t c = vmull_s16(vget_low_s16(a.val), vget_low_s16(b.val));
|
||||
int32x4_t d = vmull_s16(vget_high_s16(a.val), vget_high_s16(b.val));
|
||||
int32x4x2_t cd = vuzpq_s32(c, d);
|
||||
return v_int32x4(vaddq_s32(cd.val[0], cd.val[1]));
|
||||
}
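// Illustrative sketch (not part of the patch; helper name is hypothetical):
// v_dotprod multiplies adjacent pairs of 16-bit lanes and sums each pair,
// yielding result[i] = a[2i]*b[2i] + a[2i+1]*b[2i+1].
inline v_int32x4 example_dotprod()
{
    v_int16x8 a(1, 2, 3, 4, 5, 6, 7, 8);
    v_int16x8 b(1, 1, 1, 1, 2, 2, 2, 2);
    return v_dotprod(a, b); // (3, 7, 22, 30)
}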
|
||||
|
||||
#define OPENCV_HAL_IMPL_NEON_LOGIC_OP(_Tpvec, suffix) \
|
||||
OPENCV_HAL_IMPL_NEON_BIN_OP(&, _Tpvec, vandq_##suffix) \
|
||||
OPENCV_HAL_IMPL_NEON_BIN_OP(|, _Tpvec, vorrq_##suffix) \
|
||||
OPENCV_HAL_IMPL_NEON_BIN_OP(^, _Tpvec, veorq_##suffix) \
|
||||
inline _Tpvec operator ~ (const _Tpvec& a) \
|
||||
{ \
|
||||
return _Tpvec(vreinterpretq_##suffix##_u8(vmvnq_u8(vreinterpretq_u8_##suffix(a.val)))); \
|
||||
}
|
||||
|
||||
OPENCV_HAL_IMPL_NEON_LOGIC_OP(v_uint8x16, u8)
|
||||
OPENCV_HAL_IMPL_NEON_LOGIC_OP(v_int8x16, s8)
|
||||
OPENCV_HAL_IMPL_NEON_LOGIC_OP(v_uint16x8, u16)
|
||||
OPENCV_HAL_IMPL_NEON_LOGIC_OP(v_int16x8, s16)
|
||||
OPENCV_HAL_IMPL_NEON_LOGIC_OP(v_uint32x4, u32)
|
||||
OPENCV_HAL_IMPL_NEON_LOGIC_OP(v_int32x4, s32)
|
||||
OPENCV_HAL_IMPL_NEON_LOGIC_OP(v_uint64x2, u64)
|
||||
OPENCV_HAL_IMPL_NEON_LOGIC_OP(v_int64x2, s64)
|
||||
|
||||
#define OPENCV_HAL_IMPL_NEON_FLT_BIT_OP(bin_op, intrin) \
|
||||
inline v_float32x4 operator bin_op (const v_float32x4& a, const v_float32x4& b) \
|
||||
{ \
|
||||
return v_float32x4(vreinterpretq_f32_s32(intrin(vreinterpretq_s32_f32(a.val), vreinterpretq_s32_f32(b.val)))); \
|
||||
} \
|
||||
inline v_float32x4& operator bin_op##= (v_float32x4& a, const v_float32x4& b) \
|
||||
{ \
|
||||
a.val = vreinterpretq_f32_s32(intrin(vreinterpretq_s32_f32(a.val), vreinterpretq_s32_f32(b.val))); \
|
||||
return a; \
|
||||
}
|
||||
|
||||
OPENCV_HAL_IMPL_NEON_FLT_BIT_OP(&, vandq_s32)
|
||||
OPENCV_HAL_IMPL_NEON_FLT_BIT_OP(|, vorrq_s32)
|
||||
OPENCV_HAL_IMPL_NEON_FLT_BIT_OP(^, veorq_s32)
|
||||
|
||||
inline v_float32x4 operator ~ (const v_float32x4& a)
|
||||
{
|
||||
return v_float32x4(vreinterpretq_f32_s32(vmvnq_s32(vreinterpretq_s32_f32(a.val))));
|
||||
}
|
||||
|
||||
inline v_float32x4 v_sqrt(const v_float32x4& x)
|
||||
{
|
||||
float32x4_t x1 = vmaxq_f32(x.val, vdupq_n_f32(FLT_MIN));
|
||||
float32x4_t e = vrsqrteq_f32(x1);
|
||||
e = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x1, e), e), e);
|
||||
e = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x1, e), e), e);
|
||||
return v_float32x4(vmulq_f32(x.val, e));
|
||||
}
|
||||
|
||||
inline v_float32x4 v_invsqrt(const v_float32x4& x)
|
||||
{
|
||||
float32x4_t e = vrsqrteq_f32(x.val);
|
||||
e = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x.val, e), e), e);
|
||||
e = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x.val, e), e), e);
|
||||
return v_float32x4(e);
|
||||
}
|
||||
|
||||
inline v_float32x4 v_abs(v_float32x4 x)
|
||||
{ return v_float32x4(vabsq_f32(x.val)); }
|
||||
|
||||
// TODO: exp, log, sin, cos
|
||||
|
||||
#define OPENCV_HAL_IMPL_NEON_BIN_FUNC(_Tpvec, func, intrin) \
|
||||
inline _Tpvec func(const _Tpvec& a, const _Tpvec& b) \
|
||||
{ \
|
||||
return _Tpvec(intrin(a.val, b.val)); \
|
||||
}
|
||||
|
||||
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint8x16, v_min, vminq_u8)
|
||||
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint8x16, v_max, vmaxq_u8)
|
||||
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int8x16, v_min, vminq_s8)
|
||||
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int8x16, v_max, vmaxq_s8)
|
||||
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint16x8, v_min, vminq_u16)
|
||||
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint16x8, v_max, vmaxq_u16)
|
||||
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int16x8, v_min, vminq_s16)
|
||||
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int16x8, v_max, vmaxq_s16)
|
||||
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint32x4, v_min, vminq_u32)
|
||||
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint32x4, v_max, vmaxq_u32)
|
||||
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int32x4, v_min, vminq_s32)
|
||||
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int32x4, v_max, vmaxq_s32)
|
||||
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_float32x4, v_min, vminq_f32)
|
||||
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_float32x4, v_max, vmaxq_f32)
|
||||
|
||||
|
||||
#define OPENCV_HAL_IMPL_NEON_INT_CMP_OP(_Tpvec, cast, suffix, not_suffix) \
|
||||
inline _Tpvec operator == (const _Tpvec& a, const _Tpvec& b) \
|
||||
{ return _Tpvec(cast(vceqq_##suffix(a.val, b.val))); } \
|
||||
inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \
|
||||
{ return _Tpvec(cast(vmvnq_##not_suffix(vceqq_##suffix(a.val, b.val)))); } \
|
||||
inline _Tpvec operator < (const _Tpvec& a, const _Tpvec& b) \
|
||||
{ return _Tpvec(cast(vcltq_##suffix(a.val, b.val))); } \
|
||||
inline _Tpvec operator > (const _Tpvec& a, const _Tpvec& b) \
|
||||
{ return _Tpvec(cast(vcgtq_##suffix(a.val, b.val))); } \
|
||||
inline _Tpvec operator <= (const _Tpvec& a, const _Tpvec& b) \
|
||||
{ return _Tpvec(cast(vcleq_##suffix(a.val, b.val))); } \
|
||||
inline _Tpvec operator >= (const _Tpvec& a, const _Tpvec& b) \
|
||||
{ return _Tpvec(cast(vcgeq_##suffix(a.val, b.val))); }
|
||||
|
||||
OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_uint8x16, OPENCV_HAL_NOP, u8, u8)
|
||||
OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_int8x16, vreinterpretq_s8_u8, s8, u8)
|
||||
OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_uint16x8, OPENCV_HAL_NOP, u16, u16)
|
||||
OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_int16x8, vreinterpretq_s16_u16, s16, u16)
|
||||
OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_uint32x4, OPENCV_HAL_NOP, u32, u32)
|
||||
OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_int32x4, vreinterpretq_s32_u32, s32, u32)
|
||||
OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_float32x4, vreinterpretq_f32_u32, f32, u32)
|
||||
|
||||
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint8x16, v_add_wrap, vaddq_u8)
|
||||
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int8x16, v_add_wrap, vaddq_s8)
|
||||
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint16x8, v_add_wrap, vaddq_u16)
|
||||
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int16x8, v_add_wrap, vaddq_s16)
|
||||
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint8x16, v_sub_wrap, vsubq_u8)
|
||||
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int8x16, v_sub_wrap, vsubq_s8)
|
||||
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint16x8, v_sub_wrap, vsubq_u16)
|
||||
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int16x8, v_sub_wrap, vsubq_s16)
|
||||
|
||||
// TODO: absdiff for signed integers
|
||||
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint8x16, v_absdiff, vabdq_u8)
|
||||
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint16x8, v_absdiff, vabdq_u16)
|
||||
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint32x4, v_absdiff, vabdq_u32)
|
||||
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_float32x4, v_absdiff, vabdq_f32)
|
||||
|
||||
#define OPENCV_HAL_IMPL_NEON_BIN_FUNC2(_Tpvec, _Tpvec2, cast, func, intrin) \
|
||||
inline _Tpvec2 func(const _Tpvec& a, const _Tpvec& b) \
|
||||
{ \
|
||||
return _Tpvec2(cast(intrin(a.val, b.val))); \
|
||||
}
|
||||
|
||||
OPENCV_HAL_IMPL_NEON_BIN_FUNC2(v_int8x16, v_uint8x16, vreinterpretq_u8_s8, v_absdiff, vabdq_s8)
|
||||
OPENCV_HAL_IMPL_NEON_BIN_FUNC2(v_int16x8, v_uint16x8, vreinterpretq_u16_s16, v_absdiff, vabdq_s16)
|
||||
OPENCV_HAL_IMPL_NEON_BIN_FUNC2(v_int32x4, v_uint32x4, vreinterpretq_u32_s32, v_absdiff, vabdq_s32)
|
||||
|
||||
inline v_float32x4 v_magnitude(const v_float32x4& a, const v_float32x4& b)
|
||||
{
|
||||
v_float32x4 x(vmlaq_f32(vmulq_f32(a.val, a.val), b.val, b.val));
|
||||
return v_sqrt(x);
|
||||
}
|
||||
|
||||
inline v_float32x4 v_sqr_magnitude(const v_float32x4& a, const v_float32x4& b)
|
||||
{
|
||||
return v_float32x4(vmlaq_f32(vmulq_f32(a.val, a.val), b.val, b.val));
|
||||
}
|
||||
|
||||
inline v_float32x4 v_muladd(const v_float32x4& a, const v_float32x4& b, const v_float32x4& c)
|
||||
{
|
||||
return v_float32x4(vmlaq_f32(c.val, a.val, b.val));
|
||||
}
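// Illustrative sketch (not part of the patch; helper name is hypothetical):
// v_muladd(a, b, c) computes a*b + c per lane with a fused multiply-accumulate.
inline v_float32x4 example_axpy(const v_float32x4& x, const v_float32x4& y, float alpha)
{
    return v_muladd(v_setall_f32(alpha), x, y); // alpha*x + y, lane-wise
}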
|
||||
|
||||
// trade efficiency for convenience
|
||||
#define OPENCV_HAL_IMPL_NEON_SHIFT_OP(_Tpvec, suffix, _Tps, ssuffix) \
|
||||
inline _Tpvec operator << (const _Tpvec& a, int n) \
|
||||
{ return _Tpvec(vshlq_##suffix(a.val, vdupq_n_##ssuffix((_Tps)n))); } \
|
||||
inline _Tpvec operator >> (const _Tpvec& a, int n) \
|
||||
{ return _Tpvec(vshlq_##suffix(a.val, vdupq_n_##ssuffix((_Tps)-n))); } \
|
||||
template<int n> inline _Tpvec v_shl(const _Tpvec& a) \
|
||||
{ return _Tpvec(vshlq_n_##suffix(a.val, n)); } \
|
||||
template<int n> inline _Tpvec v_shr(const _Tpvec& a) \
|
||||
{ return _Tpvec(vshrq_n_##suffix(a.val, n)); } \
|
||||
template<int n> inline _Tpvec v_rshr(const _Tpvec& a) \
|
||||
{ return _Tpvec(vrshrq_n_##suffix(a.val, n)); }
|
||||
|
||||
OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_uint8x16, u8, schar, s8)
|
||||
OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_int8x16, s8, schar, s8)
|
||||
OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_uint16x8, u16, short, s16)
|
||||
OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_int16x8, s16, short, s16)
|
||||
OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_uint32x4, u32, int, s32)
|
||||
OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_int32x4, s32, int, s32)
|
||||
OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_uint64x2, u64, int64, s64)
|
||||
OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_int64x2, s64, int64, s64)
|
||||
|
||||
#define OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(_Tpvec, _Tp, suffix) \
|
||||
inline _Tpvec v_load(const _Tp* ptr) \
|
||||
{ return _Tpvec(vld1q_##suffix(ptr)); } \
|
||||
inline _Tpvec v_load_aligned(const _Tp* ptr) \
|
||||
{ return _Tpvec(vld1q_##suffix(ptr)); } \
|
||||
inline _Tpvec v_load_halves(const _Tp* ptr0, const _Tp* ptr1) \
|
||||
{ return _Tpvec(vcombine_##suffix(vld1_##suffix(ptr0), vld1_##suffix(ptr1))); } \
|
||||
inline void v_store(_Tp* ptr, const _Tpvec& a) \
|
||||
{ vst1q_##suffix(ptr, a.val); } \
|
||||
inline void v_store_aligned(_Tp* ptr, const _Tpvec& a) \
|
||||
{ vst1q_##suffix(ptr, a.val); } \
|
||||
inline void v_store_low(_Tp* ptr, const _Tpvec& a) \
|
||||
{ vst1_##suffix(ptr, vget_low_##suffix(a.val)); } \
|
||||
inline void v_store_high(_Tp* ptr, const _Tpvec& a) \
|
||||
{ vst1_##suffix(ptr, vget_high_##suffix(a.val)); }
|
||||
|
||||
OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_uint8x16, uchar, u8)
|
||||
OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_int8x16, schar, s8)
|
||||
OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_uint16x8, ushort, u16)
|
||||
OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_int16x8, short, s16)
|
||||
OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_uint32x4, unsigned, u32)
|
||||
OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_int32x4, int, s32)
|
||||
OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_uint64x2, uint64, u64)
|
||||
OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_int64x2, int64, s64)
|
||||
OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_float32x4, float, f32)
|
||||
|
||||
#define OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(_Tpvec, scalartype, func, scalar_func) \
|
||||
inline scalartype v_reduce_##func(const _Tpvec& a) \
|
||||
{ \
|
||||
scalartype CV_DECL_ALIGNED(16) buf[4]; \
|
||||
v_store_aligned(buf, a); \
|
||||
scalartype s0 = scalar_func(buf[0], buf[1]); \
|
||||
scalartype s1 = scalar_func(buf[2], buf[3]); \
|
||||
return scalar_func(s0, s1); \
|
||||
}
|
||||
|
||||
OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_uint32x4, unsigned, sum, OPENCV_HAL_ADD)
|
||||
OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_uint32x4, unsigned, max, std::max)
|
||||
OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_uint32x4, unsigned, min, std::min)
|
||||
OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_int32x4, int, sum, OPENCV_HAL_ADD)
|
||||
OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_int32x4, int, max, std::max)
|
||||
OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_int32x4, int, min, std::min)
|
||||
OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_float32x4, float, sum, OPENCV_HAL_ADD)
|
||||
OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_float32x4, float, max, std::max)
|
||||
OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_float32x4, float, min, std::min)
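// Illustrative sketch (not part of the patch; helper name is hypothetical):
// the reduce helpers fold the four lanes of a 32-bit vector into one scalar.
inline float example_sum4(const float* ptr) // ptr must point to at least 4 floats
{
    v_float32x4 v = v_load(ptr);
    return v_reduce_sum(v); // ptr[0] + ptr[1] + ptr[2] + ptr[3]
}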
|
||||
|
||||
inline int v_signmask(const v_uint8x16& a)
|
||||
{
|
||||
int8x8_t m0 = vcreate_s8(CV_BIG_UINT(0x0706050403020100));
|
||||
uint8x16_t v0 = vshlq_u8(vshrq_n_u8(a.val, 7), vcombine_s8(m0, m0));
|
||||
uint64x2_t v1 = vpaddlq_u32(vpaddlq_u16(vpaddlq_u8(v0)));
|
||||
return (int)vgetq_lane_u64(v1, 0) + ((int)vgetq_lane_u64(v1, 1) << 8);
|
||||
}
|
||||
inline int v_signmask(const v_int8x16& a)
|
||||
{ return v_signmask(v_reinterpret_as_u8(a)); }
|
||||
|
||||
inline int v_signmask(const v_uint16x8& a)
|
||||
{
|
||||
int16x4_t m0 = vcreate_s16(CV_BIG_UINT(0x0003000200010000));
|
||||
uint16x8_t v0 = vshlq_u16(vshrq_n_u16(a.val, 15), vcombine_s16(m0, m0));
|
||||
uint64x2_t v1 = vpaddlq_u32(vpaddlq_u16(v0));
|
||||
return (int)vgetq_lane_u64(v1, 0) + ((int)vgetq_lane_u64(v1, 1) << 4);
|
||||
}
|
||||
inline int v_signmask(const v_int16x8& a)
|
||||
{ return v_signmask(v_reinterpret_as_u16(a)); }
|
||||
|
||||
inline int v_signmask(const v_uint32x4& a)
|
||||
{
|
||||
int32x2_t m0 = vcreate_s32(CV_BIG_UINT(0x0000000100000000));
|
||||
uint32x4_t v0 = vshlq_u32(vshrq_n_u32(a.val, 31), vcombine_s32(m0, m0));
|
||||
uint64x2_t v1 = vpaddlq_u32(v0);
|
||||
return (int)vgetq_lane_u64(v1, 0) + ((int)vgetq_lane_u64(v1, 1) << 2);
|
||||
}
|
||||
inline int v_signmask(const v_int32x4& a)
|
||||
{ return v_signmask(v_reinterpret_as_u32(a)); }
|
||||
inline int v_signmask(const v_float32x4& a)
|
||||
{ return v_signmask(v_reinterpret_as_u32(a)); }
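// Illustrative sketch (not part of the patch; helper name is hypothetical):
// v_signmask packs the top bit of every lane into an integer, so combined with
// a comparison it reports which lanes satisfy a predicate.
inline int example_lanes_below_threshold(const v_float32x4& v, float thr)
{
    v_float32x4 mask = v < v_setall_f32(thr); // all-ones lanes where v[i] < thr
    return v_signmask(mask);                  // bit i set <=> v[i] < thr
}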
|
||||
|
||||
#define OPENCV_HAL_IMPL_NEON_CHECK_ALLANY(_Tpvec, suffix, shift) \
|
||||
inline bool v_check_all(const v_##_Tpvec& a) \
|
||||
{ \
|
||||
_Tpvec##_t v0 = vshrq_n_##suffix(vmvnq_##suffix(a.val), shift); \
|
||||
uint64x2_t v1 = vreinterpretq_u64_##suffix(v0); \
|
||||
return (vgetq_lane_u64(v1, 0) | vgetq_lane_u64(v1, 1)) == 0; \
|
||||
} \
|
||||
inline bool v_check_any(const v_##_Tpvec& a) \
|
||||
{ \
|
||||
_Tpvec##_t v0 = vshrq_n_##suffix(a.val, shift); \
|
||||
uint64x2_t v1 = vreinterpretq_u64_##suffix(v0); \
|
||||
return (vgetq_lane_u64(v1, 0) | vgetq_lane_u64(v1, 1)) != 0; \
|
||||
}
|
||||
|
||||
OPENCV_HAL_IMPL_NEON_CHECK_ALLANY(uint8x16, u8, 7)
|
||||
OPENCV_HAL_IMPL_NEON_CHECK_ALLANY(uint16x8, u16, 15)
|
||||
OPENCV_HAL_IMPL_NEON_CHECK_ALLANY(uint32x4, u32, 31)
|
||||
|
||||
inline bool v_check_all(const v_int8x16& a)
|
||||
{ return v_check_all(v_reinterpret_as_u8(a)); }
|
||||
inline bool v_check_all(const v_int16x8& a)
|
||||
{ return v_check_all(v_reinterpret_as_u16(a)); }
|
||||
inline bool v_check_all(const v_int32x4& a)
|
||||
{ return v_check_all(v_reinterpret_as_u32(a)); }
|
||||
inline bool v_check_all(const v_float32x4& a)
|
||||
{ return v_check_all(v_reinterpret_as_u32(a)); }
|
||||
|
||||
inline bool v_check_any(const v_int8x16& a)
|
||||
{ return v_check_any(v_reinterpret_as_u8(a)); }
|
||||
inline bool v_check_any(const v_int16x8& a)
|
||||
{ return v_check_any(v_reinterpret_as_u16(a)); }
|
||||
inline bool v_check_any(const v_int32x4& a)
|
||||
{ return v_check_any(v_reinterpret_as_u32(a)); }
|
||||
inline bool v_check_any(const v_float32x4& a)
|
||||
{ return v_check_any(v_reinterpret_as_u32(a)); }
|
||||
|
||||
#define OPENCV_HAL_IMPL_NEON_SELECT(_Tpvec, suffix, usuffix) \
|
||||
inline _Tpvec v_select(const _Tpvec& mask, const _Tpvec& a, const _Tpvec& b) \
|
||||
{ \
|
||||
return _Tpvec(vbslq_##suffix(vreinterpretq_##usuffix##_##suffix(mask.val), a.val, b.val)); \
|
||||
}
|
||||
|
||||
OPENCV_HAL_IMPL_NEON_SELECT(v_uint8x16, u8, u8)
|
||||
OPENCV_HAL_IMPL_NEON_SELECT(v_int8x16, s8, u8)
|
||||
OPENCV_HAL_IMPL_NEON_SELECT(v_uint16x8, u16, u16)
|
||||
OPENCV_HAL_IMPL_NEON_SELECT(v_int16x8, s16, u16)
|
||||
OPENCV_HAL_IMPL_NEON_SELECT(v_uint32x4, u32, u32)
|
||||
OPENCV_HAL_IMPL_NEON_SELECT(v_int32x4, s32, u32)
|
||||
OPENCV_HAL_IMPL_NEON_SELECT(v_float32x4, f32, u32)
|
||||
|
||||
#define OPENCV_HAL_IMPL_NEON_EXPAND(_Tpvec, _Tpwvec, _Tp, suffix) \
|
||||
inline void v_expand(const _Tpvec& a, _Tpwvec& b0, _Tpwvec& b1) \
|
||||
{ \
|
||||
b0.val = vmovl_##suffix(vget_low_##suffix(a.val)); \
|
||||
b1.val = vmovl_##suffix(vget_high_##suffix(a.val)); \
|
||||
} \
|
||||
inline _Tpwvec v_load_expand(const _Tp* ptr) \
|
||||
{ \
|
||||
return _Tpwvec(vmovl_##suffix(vld1_##suffix(ptr))); \
|
||||
}
|
||||
|
||||
OPENCV_HAL_IMPL_NEON_EXPAND(v_uint8x16, v_uint16x8, uchar, u8)
|
||||
OPENCV_HAL_IMPL_NEON_EXPAND(v_int8x16, v_int16x8, schar, s8)
|
||||
OPENCV_HAL_IMPL_NEON_EXPAND(v_uint16x8, v_uint32x4, ushort, u16)
|
||||
OPENCV_HAL_IMPL_NEON_EXPAND(v_int16x8, v_int32x4, short, s16)
|
||||
OPENCV_HAL_IMPL_NEON_EXPAND(v_uint32x4, v_uint64x2, uint, u32)
|
||||
OPENCV_HAL_IMPL_NEON_EXPAND(v_int32x4, v_int64x2, int, s32)
|
||||
|
||||
inline v_uint32x4 v_load_expand_q(const uchar* ptr)
|
||||
{
|
||||
uint8x8_t v0 = vcreate_u8(*(unsigned*)ptr);
|
||||
uint16x4_t v1 = vget_low_u16(vmovl_u8(v0));
|
||||
return v_uint32x4(vmovl_u16(v1));
|
||||
}
|
||||
|
||||
inline v_int32x4 v_load_expand_q(const schar* ptr)
|
||||
{
|
||||
int8x8_t v0 = vcreate_s8(*(unsigned*)ptr);
|
||||
int16x4_t v1 = vget_low_s16(vmovl_s8(v0));
|
||||
return v_int32x4(vmovl_s16(v1));
|
||||
}
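// Illustrative sketch (not part of the patch; helper name is hypothetical):
// the expanding loads widen packed integer data on load; v_load_expand_q goes
// straight from 4 bytes to four 32-bit lanes.
inline v_uint32x4 example_widen_pixels(const uchar* ptr) // reads 4 bytes
{
    return v_load_expand_q(ptr); // (ptr[0], ptr[1], ptr[2], ptr[3]) as 32-bit lanes
}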
|
||||
|
||||
#define OPENCV_HAL_IMPL_NEON_UNPACKS(_Tpvec, suffix) \
|
||||
inline void v_zip(const v_##_Tpvec& a0, const v_##_Tpvec& a1, v_##_Tpvec& b0, v_##_Tpvec& b1) \
|
||||
{ \
|
||||
_Tpvec##x2_t p = vzipq_##suffix(a0.val, a1.val); \
|
||||
b0.val = p.val[0]; \
|
||||
b1.val = p.val[1]; \
|
||||
} \
|
||||
inline v_##_Tpvec v_combine_low(const v_##_Tpvec& a, const v_##_Tpvec& b) \
|
||||
{ \
|
||||
return v_##_Tpvec(vcombine_##suffix(vget_low_##suffix(a.val), vget_low_##suffix(b.val))); \
|
||||
} \
|
||||
inline v_##_Tpvec v_combine_high(const v_##_Tpvec& a, const v_##_Tpvec& b) \
|
||||
{ \
|
||||
return v_##_Tpvec(vcombine_##suffix(vget_high_##suffix(a.val), vget_high_##suffix(b.val))); \
|
||||
} \
|
||||
inline void v_recombine(const v_##_Tpvec& a, const v_##_Tpvec& b, v_##_Tpvec& c, v_##_Tpvec& d) \
|
||||
{ \
|
||||
c.val = vcombine_##suffix(vget_low_##suffix(a.val), vget_low_##suffix(b.val)); \
|
||||
d.val = vcombine_##suffix(vget_high_##suffix(a.val), vget_high_##suffix(b.val)); \
|
||||
}
|
||||
|
||||
OPENCV_HAL_IMPL_NEON_UNPACKS(uint8x16, u8)
|
||||
OPENCV_HAL_IMPL_NEON_UNPACKS(int8x16, s8)
|
||||
OPENCV_HAL_IMPL_NEON_UNPACKS(uint16x8, u16)
|
||||
OPENCV_HAL_IMPL_NEON_UNPACKS(int16x8, s16)
|
||||
OPENCV_HAL_IMPL_NEON_UNPACKS(uint32x4, u32)
|
||||
OPENCV_HAL_IMPL_NEON_UNPACKS(int32x4, s32)
|
||||
OPENCV_HAL_IMPL_NEON_UNPACKS(float32x4, f32)
|
||||
|
||||
#define OPENCV_HAL_IMPL_NEON_EXTRACT(_Tpvec, suffix) \
|
||||
template <int s> \
|
||||
inline v_##_Tpvec v_extract(const v_##_Tpvec& a, const v_##_Tpvec& b) \
|
||||
{ \
|
||||
return v_##_Tpvec(vextq_##suffix(a.val, b.val, s)); \
|
||||
}
|
||||
|
||||
OPENCV_HAL_IMPL_NEON_EXTRACT(uint8x16, u8)
|
||||
OPENCV_HAL_IMPL_NEON_EXTRACT(int8x16, s8)
|
||||
OPENCV_HAL_IMPL_NEON_EXTRACT(uint16x8, u16)
|
||||
OPENCV_HAL_IMPL_NEON_EXTRACT(int16x8, s16)
|
||||
OPENCV_HAL_IMPL_NEON_EXTRACT(uint32x4, u32)
|
||||
OPENCV_HAL_IMPL_NEON_EXTRACT(int32x4, s32)
|
||||
OPENCV_HAL_IMPL_NEON_EXTRACT(uint64x2, u64)
|
||||
OPENCV_HAL_IMPL_NEON_EXTRACT(int64x2, s64)
|
||||
OPENCV_HAL_IMPL_NEON_EXTRACT(float32x4, f32)
|
||||
|
||||
inline v_int32x4 v_round(const v_float32x4& a)
|
||||
{
|
||||
static const int32x4_t v_sign = vdupq_n_s32(1 << 31),
|
||||
v_05 = vreinterpretq_s32_f32(vdupq_n_f32(0.5f));
|
||||
|
||||
int32x4_t v_addition = vorrq_s32(v_05, vandq_s32(v_sign, vreinterpretq_s32_f32(a.val)));
|
||||
return v_int32x4(vcvtq_s32_f32(vaddq_f32(a.val, vreinterpretq_f32_s32(v_addition))));
|
||||
}
|
||||
|
||||
inline v_int32x4 v_floor(const v_float32x4& a)
|
||||
{
|
||||
int32x4_t a1 = vcvtq_s32_f32(a.val);
|
||||
uint32x4_t mask = vcgtq_f32(vcvtq_f32_s32(a1), a.val);
|
||||
return v_int32x4(vaddq_s32(a1, vreinterpretq_s32_u32(mask)));
|
||||
}
|
||||
|
||||
inline v_int32x4 v_ceil(const v_float32x4& a)
|
||||
{
|
||||
int32x4_t a1 = vcvtq_s32_f32(a.val);
|
||||
uint32x4_t mask = vcgtq_f32(a.val, vcvtq_f32_s32(a1));
|
||||
return v_int32x4(vsubq_s32(a1, vreinterpretq_s32_u32(mask)));
|
||||
}
|
||||
|
||||
inline v_int32x4 v_trunc(const v_float32x4& a)
|
||||
{ return v_int32x4(vcvtq_s32_f32(a.val)); }
|
||||
|
||||
#define OPENCV_HAL_IMPL_NEON_TRANSPOSE4x4(_Tpvec, suffix) \
|
||||
inline void v_transpose4x4(const v_##_Tpvec& a0, const v_##_Tpvec& a1, \
|
||||
const v_##_Tpvec& a2, const v_##_Tpvec& a3, \
|
||||
v_##_Tpvec& b0, v_##_Tpvec& b1, \
|
||||
v_##_Tpvec& b2, v_##_Tpvec& b3) \
|
||||
{ \
|
||||
/* m00 m01 m02 m03 */ \
|
||||
/* m10 m11 m12 m13 */ \
|
||||
/* m20 m21 m22 m23 */ \
|
||||
/* m30 m31 m32 m33 */ \
|
||||
_Tpvec##x2_t t0 = vtrnq_##suffix(a0.val, a1.val); \
|
||||
_Tpvec##x2_t t1 = vtrnq_##suffix(a2.val, a3.val); \
|
||||
/* m00 m10 m02 m12 */ \
|
||||
/* m01 m11 m03 m13 */ \
|
||||
/* m20 m30 m22 m32 */ \
|
||||
/* m21 m31 m23 m33 */ \
|
||||
b0.val = vcombine_##suffix(vget_low_##suffix(t0.val[0]), vget_low_##suffix(t1.val[0])); \
|
||||
b1.val = vcombine_##suffix(vget_low_##suffix(t0.val[1]), vget_low_##suffix(t1.val[1])); \
|
||||
b2.val = vcombine_##suffix(vget_high_##suffix(t0.val[0]), vget_high_##suffix(t1.val[0])); \
|
||||
b3.val = vcombine_##suffix(vget_high_##suffix(t0.val[1]), vget_high_##suffix(t1.val[1])); \
|
||||
}
|
||||
|
||||
OPENCV_HAL_IMPL_NEON_TRANSPOSE4x4(uint32x4, u32)
|
||||
OPENCV_HAL_IMPL_NEON_TRANSPOSE4x4(int32x4, s32)
|
||||
OPENCV_HAL_IMPL_NEON_TRANSPOSE4x4(float32x4, f32)
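// Illustrative sketch (not part of the patch; helper name is hypothetical):
// v_transpose4x4 transposes a 4x4 matrix held as four row vectors.
inline void example_transpose(const float* src, float* dst) // both hold 16 floats
{
    v_float32x4 r0 = v_load(src),     r1 = v_load(src + 4),
                r2 = v_load(src + 8), r3 = v_load(src + 12);
    v_float32x4 c0, c1, c2, c3;
    v_transpose4x4(r0, r1, r2, r3, c0, c1, c2, c3);
    v_store(dst, c0);     v_store(dst + 4, c1);
    v_store(dst + 8, c2); v_store(dst + 12, c3);
}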
|
||||
|
||||
#define OPENCV_HAL_IMPL_NEON_INTERLEAVED(_Tpvec, _Tp, suffix) \
|
||||
inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec& a, v_##_Tpvec& b, v_##_Tpvec& c) \
|
||||
{ \
|
||||
_Tpvec##x3_t v = vld3q_##suffix(ptr); \
|
||||
a.val = v.val[0]; \
|
||||
b.val = v.val[1]; \
|
||||
c.val = v.val[2]; \
|
||||
} \
|
||||
inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec& a, v_##_Tpvec& b, \
|
||||
v_##_Tpvec& c, v_##_Tpvec& d) \
|
||||
{ \
|
||||
_Tpvec##x4_t v = vld4q_##suffix(ptr); \
|
||||
a.val = v.val[0]; \
|
||||
b.val = v.val[1]; \
|
||||
c.val = v.val[2]; \
|
||||
d.val = v.val[3]; \
|
||||
} \
|
||||
inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec& a, const v_##_Tpvec& b, const v_##_Tpvec& c) \
|
||||
{ \
|
||||
_Tpvec##x3_t v; \
|
||||
v.val[0] = a.val; \
|
||||
v.val[1] = b.val; \
|
||||
v.val[2] = c.val; \
|
||||
vst3q_##suffix(ptr, v); \
|
||||
} \
|
||||
inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec& a, const v_##_Tpvec& b, \
|
||||
const v_##_Tpvec& c, const v_##_Tpvec& d) \
|
||||
{ \
|
||||
_Tpvec##x4_t v; \
|
||||
v.val[0] = a.val; \
|
||||
v.val[1] = b.val; \
|
||||
v.val[2] = c.val; \
|
||||
v.val[3] = d.val; \
|
||||
vst4q_##suffix(ptr, v); \
|
||||
}
|
||||
|
||||
OPENCV_HAL_IMPL_NEON_INTERLEAVED(uint8x16, uchar, u8)
|
||||
OPENCV_HAL_IMPL_NEON_INTERLEAVED(int8x16, schar, s8)
|
||||
OPENCV_HAL_IMPL_NEON_INTERLEAVED(uint16x8, ushort, u16)
|
||||
OPENCV_HAL_IMPL_NEON_INTERLEAVED(int16x8, short, s16)
|
||||
OPENCV_HAL_IMPL_NEON_INTERLEAVED(uint32x4, unsigned, u32)
|
||||
OPENCV_HAL_IMPL_NEON_INTERLEAVED(int32x4, int, s32)
|
||||
OPENCV_HAL_IMPL_NEON_INTERLEAVED(float32x4, float, f32)
|
||||
|
||||
inline v_float32x4 v_cvt_f32(const v_int32x4& a)
|
||||
{
|
||||
return v_float32x4(vcvtq_f32_s32(a.val));
|
||||
}
|
||||
|
||||
//! @endcond
|
||||
|
||||
}
|
||||
|
||||
#endif
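// Illustrative usage sketch (not part of the patch; names are hypothetical):
// code written against these v_* types is portable between the NEON and SSE
// backends. Assumes the universal-intrinsics header is included (e.g. via the
// core headers) and that the buffers hold at least n elements.
static void example_add_arrays(const float* a, const float* b, float* dst, int n)
{
    using namespace cv;
    int i = 0;
    for( ; i <= n - v_float32x4::nlanes; i += v_float32x4::nlanes )
        v_store(dst + i, v_load(a + i) + v_load(b + i)); // 4 floats per iteration
    for( ; i < n; i++ )                                  // scalar tail
        dst[i] = a[i] + b[i];
}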
modules/core/include/opencv2/core/hal/intrin_sse.hpp (new file, 1599 lines)
File diff suppressed because it is too large
@@ -51,6 +51,7 @@
|
||||
#include "opencv2/core/cvdef.h"
|
||||
#include "opencv2/core/base.hpp"
|
||||
#include "opencv2/core/traits.hpp"
|
||||
#include "opencv2/core/saturate.hpp"
|
||||
|
||||
namespace cv
|
||||
{
|
||||
|
modules/core/include/opencv2/core/neon_utils.hpp (new file, 128 lines)
@@ -0,0 +1,128 @@
|
||||
/*M///////////////////////////////////////////////////////////////////////////////////////
|
||||
//
|
||||
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
|
||||
//
|
||||
// By downloading, copying, installing or using the software you agree to this license.
|
||||
// If you do not agree to this license, do not download, install,
|
||||
// copy or use the software.
|
||||
//
|
||||
//
|
||||
// License Agreement
|
||||
// For Open Source Computer Vision Library
|
||||
//
|
||||
// Copyright (C) 2015, Itseez Inc., all rights reserved.
|
||||
// Third party copyrights are property of their respective owners.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without modification,
|
||||
// are permitted provided that the following conditions are met:
|
||||
//
|
||||
// * Redistribution's of source code must retain the above copyright notice,
|
||||
// this list of conditions and the following disclaimer.
|
||||
//
|
||||
// * Redistribution's in binary form must reproduce the above copyright notice,
|
||||
// this list of conditions and the following disclaimer in the documentation
|
||||
// and/or other materials provided with the distribution.
|
||||
//
|
||||
// * The name of the copyright holders may not be used to endorse or promote products
|
||||
// derived from this software without specific prior written permission.
|
||||
//
|
||||
// This software is provided by the copyright holders and contributors "as is" and
|
||||
// any express or implied warranties, including, but not limited to, the implied
|
||||
// warranties of merchantability and fitness for a particular purpose are disclaimed.
|
||||
// In no event shall the Intel Corporation or contributors be liable for any direct,
|
||||
// indirect, incidental, special, exemplary, or consequential damages
|
||||
// (including, but not limited to, procurement of substitute goods or services;
|
||||
// loss of use, data, or profits; or business interruption) however caused
|
||||
// and on any theory of liability, whether in contract, strict liability,
|
||||
// or tort (including negligence or otherwise) arising in any way out of
|
||||
// the use of this software, even if advised of the possibility of such damage.
|
||||
//
|
||||
//M*/
|
||||
|
||||
#ifndef __OPENCV_HAL_NEON_UTILS_HPP__
|
||||
#define __OPENCV_HAL_NEON_UTILS_HPP__
|
||||
|
||||
#include "opencv2/core/cvdef.h"
|
||||
|
||||
//! @addtogroup core_utils_neon
|
||||
//! @{
|
||||
|
||||
#if CV_NEON
|
||||
|
||||
inline int32x2_t cv_vrnd_s32_f32(float32x2_t v)
|
||||
{
|
||||
static int32x2_t v_sign = vdup_n_s32(1 << 31),
|
||||
v_05 = vreinterpret_s32_f32(vdup_n_f32(0.5f));
|
||||
|
||||
int32x2_t v_addition = vorr_s32(v_05, vand_s32(v_sign, vreinterpret_s32_f32(v)));
|
||||
return vcvt_s32_f32(vadd_f32(v, vreinterpret_f32_s32(v_addition)));
|
||||
}
|
||||
|
||||
inline int32x4_t cv_vrndq_s32_f32(float32x4_t v)
|
||||
{
|
||||
static int32x4_t v_sign = vdupq_n_s32(1 << 31),
|
||||
v_05 = vreinterpretq_s32_f32(vdupq_n_f32(0.5f));
|
||||
|
||||
int32x4_t v_addition = vorrq_s32(v_05, vandq_s32(v_sign, vreinterpretq_s32_f32(v)));
|
||||
return vcvtq_s32_f32(vaddq_f32(v, vreinterpretq_f32_s32(v_addition)));
|
||||
}
|
||||
|
||||
inline uint32x2_t cv_vrnd_u32_f32(float32x2_t v)
|
||||
{
|
||||
static float32x2_t v_05 = vdup_n_f32(0.5f);
|
||||
return vcvt_u32_f32(vadd_f32(v, v_05));
|
||||
}
|
||||
|
||||
inline uint32x4_t cv_vrndq_u32_f32(float32x4_t v)
|
||||
{
|
||||
static float32x4_t v_05 = vdupq_n_f32(0.5f);
|
||||
return vcvtq_u32_f32(vaddq_f32(v, v_05));
|
||||
}
|
||||
|
||||
inline float32x4_t cv_vrecpq_f32(float32x4_t val)
|
||||
{
|
||||
float32x4_t reciprocal = vrecpeq_f32(val);
|
||||
reciprocal = vmulq_f32(vrecpsq_f32(val, reciprocal), reciprocal);
|
||||
reciprocal = vmulq_f32(vrecpsq_f32(val, reciprocal), reciprocal);
|
||||
return reciprocal;
|
||||
}
|
||||
|
||||
inline float32x2_t cv_vrecp_f32(float32x2_t val)
|
||||
{
|
||||
float32x2_t reciprocal = vrecpe_f32(val);
|
||||
reciprocal = vmul_f32(vrecps_f32(val, reciprocal), reciprocal);
|
||||
reciprocal = vmul_f32(vrecps_f32(val, reciprocal), reciprocal);
|
||||
return reciprocal;
|
||||
}
|
||||
|
||||
inline float32x4_t cv_vrsqrtq_f32(float32x4_t val)
|
||||
{
|
||||
float32x4_t e = vrsqrteq_f32(val);
|
||||
e = vmulq_f32(vrsqrtsq_f32(vmulq_f32(e, e), val), e);
|
||||
e = vmulq_f32(vrsqrtsq_f32(vmulq_f32(e, e), val), e);
|
||||
return e;
|
||||
}
|
||||
|
||||
inline float32x2_t cv_vrsqrt_f32(float32x2_t val)
|
||||
{
|
||||
float32x2_t e = vrsqrte_f32(val);
|
||||
e = vmul_f32(vrsqrts_f32(vmul_f32(e, e), val), e);
|
||||
e = vmul_f32(vrsqrts_f32(vmul_f32(e, e), val), e);
|
||||
return e;
|
||||
}
|
||||
|
||||
inline float32x4_t cv_vsqrtq_f32(float32x4_t val)
|
||||
{
|
||||
return cv_vrecpq_f32(cv_vrsqrtq_f32(val));
|
||||
}
|
||||
|
||||
inline float32x2_t cv_vsqrt_f32(float32x2_t val)
|
||||
{
|
||||
return cv_vrecp_f32(cv_vrsqrt_f32(val));
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
//! @}
|
||||
|
||||
#endif // __OPENCV_HAL_NEON_UTILS_HPP__
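// Illustrative sketch (not part of this file; helper name is hypothetical):
// these helpers wrap the raw NEON estimate instructions with Newton-Raphson
// refinement; they are only available when CV_NEON is non-zero.
#if CV_NEON
inline float32x4_t example_normalize(float32x4_t v, float32x4_t len2)
{
    // v / sqrt(len2), using the refined reciprocal square root
    return vmulq_f32(v, cv_vrsqrtq_f32(len2));
}
#endif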
modules/core/include/opencv2/core/saturate.hpp (new file, 150 lines)
@@ -0,0 +1,150 @@
|
||||
/*M///////////////////////////////////////////////////////////////////////////////////////
|
||||
//
|
||||
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
|
||||
//
|
||||
// By downloading, copying, installing or using the software you agree to this license.
|
||||
// If you do not agree to this license, do not download, install,
|
||||
// copy or use the software.
|
||||
//
|
||||
//
|
||||
// License Agreement
|
||||
// For Open Source Computer Vision Library
|
||||
//
|
||||
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
|
||||
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
|
||||
// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
|
||||
// Copyright (C) 2014, Itseez Inc., all rights reserved.
|
||||
// Third party copyrights are property of their respective owners.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without modification,
|
||||
// are permitted provided that the following conditions are met:
|
||||
//
|
||||
// * Redistribution's of source code must retain the above copyright notice,
|
||||
// this list of conditions and the following disclaimer.
|
||||
//
|
||||
// * Redistribution's in binary form must reproduce the above copyright notice,
|
||||
// this list of conditions and the following disclaimer in the documentation
|
||||
// and/or other materials provided with the distribution.
|
||||
//
|
||||
// * The name of the copyright holders may not be used to endorse or promote products
|
||||
// derived from this software without specific prior written permission.
|
||||
//
|
||||
// This software is provided by the copyright holders and contributors "as is" and
|
||||
// any express or implied warranties, including, but not limited to, the implied
|
||||
// warranties of merchantability and fitness for a particular purpose are disclaimed.
|
||||
// In no event shall the Intel Corporation or contributors be liable for any direct,
|
||||
// indirect, incidental, special, exemplary, or consequential damages
|
||||
// (including, but not limited to, procurement of substitute goods or services;
|
||||
// loss of use, data, or profits; or business interruption) however caused
|
||||
// and on any theory of liability, whether in contract, strict liability,
|
||||
// or tort (including negligence or otherwise) arising in any way out of
|
||||
// the use of this software, even if advised of the possibility of such damage.
|
||||
//
|
||||
//M*/
|
||||
|
||||
#ifndef __OPENCV_CORE_SATURATE_HPP__
|
||||
#define __OPENCV_CORE_SATURATE_HPP__
|
||||
|
||||
#include "opencv2/core/cvdef.h"
|
||||
#include "opencv2/core/fast_math.hpp"
|
||||
|
||||
namespace cv
|
||||
{
|
||||
|
||||
//! @addtogroup core_utils
|
||||
//! @{
|
||||
|
||||
/////////////// saturate_cast (used in image & signal processing) ///////////////////
|
||||
|
||||
/** @brief Template function for accurate conversion from one primitive type to another.
|
||||
|
||||
The functions saturate_cast resemble the standard C++ cast operations, such as static_cast\<T\>()
|
||||
and others. They perform an efficient and accurate conversion from one primitive type to another
|
||||
(see the introduction chapter). saturate in the name means that when the input value v is out of the
|
||||
range of the target type, the result is not formed just by taking low bits of the input, but instead
|
||||
the value is clipped. For example:
|
||||
@code
|
||||
uchar a = saturate_cast<uchar>(-100); // a = 0 (UCHAR_MIN)
|
||||
short b = saturate_cast<short>(33333.33333); // b = 32767 (SHRT_MAX)
|
||||
@endcode
|
||||
Such clipping is done when the target type is unsigned char , signed char , unsigned short or
|
||||
signed short . For 32-bit integers, no clipping is done.
|
||||
|
||||
When the parameter is a floating-point value and the target type is an integer (8-, 16- or 32-bit),
|
||||
the floating-point value is first rounded to the nearest integer and then clipped if needed (when
|
||||
the target type is 8- or 16-bit).
|
||||
|
||||
This operation is used in the simplest or most complex image processing functions in OpenCV.
|
||||
|
||||
@param v Function parameter.
|
||||
@sa add, subtract, multiply, divide, Mat::convertTo
|
||||
*/
|
||||
template<typename _Tp> static inline _Tp saturate_cast(uchar v) { return _Tp(v); }
|
||||
/** @overload */
|
||||
template<typename _Tp> static inline _Tp saturate_cast(schar v) { return _Tp(v); }
|
||||
/** @overload */
|
||||
template<typename _Tp> static inline _Tp saturate_cast(ushort v) { return _Tp(v); }
|
||||
/** @overload */
|
||||
template<typename _Tp> static inline _Tp saturate_cast(short v) { return _Tp(v); }
|
||||
/** @overload */
|
||||
template<typename _Tp> static inline _Tp saturate_cast(unsigned v) { return _Tp(v); }
|
||||
/** @overload */
|
||||
template<typename _Tp> static inline _Tp saturate_cast(int v) { return _Tp(v); }
|
||||
/** @overload */
|
||||
template<typename _Tp> static inline _Tp saturate_cast(float v) { return _Tp(v); }
|
||||
/** @overload */
|
||||
template<typename _Tp> static inline _Tp saturate_cast(double v) { return _Tp(v); }
|
||||
/** @overload */
|
||||
template<typename _Tp> static inline _Tp saturate_cast(int64 v) { return _Tp(v); }
|
||||
/** @overload */
|
||||
template<typename _Tp> static inline _Tp saturate_cast(uint64 v) { return _Tp(v); }
|
||||
|
||||
template<> inline uchar saturate_cast<uchar>(schar v) { return (uchar)std::max((int)v, 0); }
|
||||
template<> inline uchar saturate_cast<uchar>(ushort v) { return (uchar)std::min((unsigned)v, (unsigned)UCHAR_MAX); }
|
||||
template<> inline uchar saturate_cast<uchar>(int v) { return (uchar)((unsigned)v <= UCHAR_MAX ? v : v > 0 ? UCHAR_MAX : 0); }
|
||||
template<> inline uchar saturate_cast<uchar>(short v) { return saturate_cast<uchar>((int)v); }
|
||||
template<> inline uchar saturate_cast<uchar>(unsigned v) { return (uchar)std::min(v, (unsigned)UCHAR_MAX); }
|
||||
template<> inline uchar saturate_cast<uchar>(float v) { int iv = cvRound(v); return saturate_cast<uchar>(iv); }
|
||||
template<> inline uchar saturate_cast<uchar>(double v) { int iv = cvRound(v); return saturate_cast<uchar>(iv); }
|
||||
template<> inline uchar saturate_cast<uchar>(int64 v) { return (uchar)((uint64)v <= (uint64)UCHAR_MAX ? v : v > 0 ? UCHAR_MAX : 0); }
|
||||
template<> inline uchar saturate_cast<uchar>(uint64 v) { return (uchar)std::min(v, (uint64)UCHAR_MAX); }
|
||||
|
||||
template<> inline schar saturate_cast<schar>(uchar v) { return (schar)std::min((int)v, SCHAR_MAX); }
|
||||
template<> inline schar saturate_cast<schar>(ushort v) { return (schar)std::min((unsigned)v, (unsigned)SCHAR_MAX); }
|
||||
template<> inline schar saturate_cast<schar>(int v) { return (schar)((unsigned)(v-SCHAR_MIN) <= (unsigned)UCHAR_MAX ? v : v > 0 ? SCHAR_MAX : SCHAR_MIN); }
|
||||
template<> inline schar saturate_cast<schar>(short v) { return saturate_cast<schar>((int)v); }
|
||||
template<> inline schar saturate_cast<schar>(unsigned v) { return (schar)std::min(v, (unsigned)SCHAR_MAX); }
|
||||
template<> inline schar saturate_cast<schar>(float v) { int iv = cvRound(v); return saturate_cast<schar>(iv); }
|
||||
template<> inline schar saturate_cast<schar>(double v) { int iv = cvRound(v); return saturate_cast<schar>(iv); }
|
||||
template<> inline schar saturate_cast<schar>(int64 v) { return (schar)((uint64)((int64)v-SCHAR_MIN) <= (uint64)UCHAR_MAX ? v : v > 0 ? SCHAR_MAX : SCHAR_MIN); }
|
||||
template<> inline schar saturate_cast<schar>(uint64 v) { return (schar)std::min(v, (uint64)SCHAR_MAX); }
|
||||
|
||||
template<> inline ushort saturate_cast<ushort>(schar v) { return (ushort)std::max((int)v, 0); }
|
||||
template<> inline ushort saturate_cast<ushort>(short v) { return (ushort)std::max((int)v, 0); }
|
||||
template<> inline ushort saturate_cast<ushort>(int v) { return (ushort)((unsigned)v <= (unsigned)USHRT_MAX ? v : v > 0 ? USHRT_MAX : 0); }
|
||||
template<> inline ushort saturate_cast<ushort>(unsigned v) { return (ushort)std::min(v, (unsigned)USHRT_MAX); }
|
||||
template<> inline ushort saturate_cast<ushort>(float v) { int iv = cvRound(v); return saturate_cast<ushort>(iv); }
|
||||
template<> inline ushort saturate_cast<ushort>(double v) { int iv = cvRound(v); return saturate_cast<ushort>(iv); }
|
||||
template<> inline ushort saturate_cast<ushort>(int64 v) { return (ushort)((uint64)v <= (uint64)USHRT_MAX ? v : v > 0 ? USHRT_MAX : 0); }
|
||||
template<> inline ushort saturate_cast<ushort>(uint64 v) { return (ushort)std::min(v, (uint64)USHRT_MAX); }
|
||||
|
||||
template<> inline short saturate_cast<short>(ushort v) { return (short)std::min((int)v, SHRT_MAX); }
|
||||
template<> inline short saturate_cast<short>(int v) { return (short)((unsigned)(v - SHRT_MIN) <= (unsigned)USHRT_MAX ? v : v > 0 ? SHRT_MAX : SHRT_MIN); }
|
||||
template<> inline short saturate_cast<short>(unsigned v) { return (short)std::min(v, (unsigned)SHRT_MAX); }
|
||||
template<> inline short saturate_cast<short>(float v) { int iv = cvRound(v); return saturate_cast<short>(iv); }
|
||||
template<> inline short saturate_cast<short>(double v) { int iv = cvRound(v); return saturate_cast<short>(iv); }
|
||||
template<> inline short saturate_cast<short>(int64 v) { return (short)((uint64)((int64)v - SHRT_MIN) <= (uint64)USHRT_MAX ? v : v > 0 ? SHRT_MAX : SHRT_MIN); }
|
||||
template<> inline short saturate_cast<short>(uint64 v) { return (short)std::min(v, (uint64)SHRT_MAX); }
|
||||
|
||||
template<> inline int saturate_cast<int>(float v) { return cvRound(v); }
|
||||
template<> inline int saturate_cast<int>(double v) { return cvRound(v); }
|
||||
|
||||
// we intentionally do not clip negative numbers, to make -1 become 0xffffffff etc.
|
||||
template<> inline unsigned saturate_cast<unsigned>(float v) { return cvRound(v); }
|
||||
template<> inline unsigned saturate_cast<unsigned>(double v) { return cvRound(v); }
|
||||
|
||||
//! @}
|
||||
|
||||
} // cv
|
||||
|
||||
#endif // __OPENCV_CORE_SATURATE_HPP__
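// Illustrative sketch (not part of the patch; helper name is hypothetical):
// saturate_cast clips to the target range and rounds floating-point input,
// unlike a plain C-style cast.
inline void example_saturate_cast()
{
    uchar  a = cv::saturate_cast<uchar>(300);    // 255 (clipped to UCHAR_MAX)
    uchar  b = cv::saturate_cast<uchar>(-5.4);   // 0 (rounded, then clipped)
    short  c = cv::saturate_cast<short>(70000);  // 32767 (SHRT_MAX)
    ushort d = cv::saturate_cast<ushort>(-1);    // 0
    (void)a; (void)b; (void)c; (void)d;
}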
modules/core/include/opencv2/core/sse_utils.hpp (new file, 652 lines)
@@ -0,0 +1,652 @@
|
||||
/*M///////////////////////////////////////////////////////////////////////////////////////
|
||||
//
|
||||
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
|
||||
//
|
||||
// By downloading, copying, installing or using the software you agree to this license.
|
||||
// If you do not agree to this license, do not download, install,
|
||||
// copy or use the software.
|
||||
//
|
||||
//
|
||||
// License Agreement
|
||||
// For Open Source Computer Vision Library
|
||||
//
|
||||
// Copyright (C) 2015, Itseez Inc., all rights reserved.
|
||||
// Third party copyrights are property of their respective owners.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without modification,
|
||||
// are permitted provided that the following conditions are met:
|
||||
//
|
||||
// * Redistribution's of source code must retain the above copyright notice,
|
||||
// this list of conditions and the following disclaimer.
|
||||
//
|
||||
// * Redistribution's in binary form must reproduce the above copyright notice,
|
||||
// this list of conditions and the following disclaimer in the documentation
|
||||
// and/or other materials provided with the distribution.
|
||||
//
|
||||
// * The name of the copyright holders may not be used to endorse or promote products
|
||||
// derived from this software without specific prior written permission.
|
||||
//
|
||||
// This software is provided by the copyright holders and contributors "as is" and
|
||||
// any express or implied warranties, including, but not limited to, the implied
|
||||
// warranties of merchantability and fitness for a particular purpose are disclaimed.
|
||||
// In no event shall the Intel Corporation or contributors be liable for any direct,
|
||||
// indirect, incidental, special, exemplary, or consequential damages
|
||||
// (including, but not limited to, procurement of substitute goods or services;
|
||||
// loss of use, data, or profits; or business interruption) however caused
|
||||
// and on any theory of liability, whether in contract, strict liability,
|
||||
// or tort (including negligence or otherwise) arising in any way out of
|
||||
// the use of this software, even if advised of the possibility of such damage.
|
||||
//
|
||||
//M*/
|
||||
|
||||
#ifndef __OPENCV_CORE_SSE_UTILS_HPP__
|
||||
#define __OPENCV_CORE_SSE_UTILS_HPP__
|
||||
|
||||
#ifndef __cplusplus
|
||||
# error sse_utils.hpp header must be compiled as C++
|
||||
#endif
|
||||
|
||||
#include "opencv2/core/cvdef.h"
|
||||
|
||||
//! @addtogroup core_utils_sse
|
||||
//! @{
|
||||
|
||||
#if CV_SSE2
|
||||
|
||||
inline void _mm_deinterleave_epi8(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0, __m128i & v_g1)
|
||||
{
|
||||
__m128i layer1_chunk0 = _mm_unpacklo_epi8(v_r0, v_g0);
|
||||
__m128i layer1_chunk1 = _mm_unpackhi_epi8(v_r0, v_g0);
|
||||
__m128i layer1_chunk2 = _mm_unpacklo_epi8(v_r1, v_g1);
|
||||
__m128i layer1_chunk3 = _mm_unpackhi_epi8(v_r1, v_g1);
|
||||
|
||||
__m128i layer2_chunk0 = _mm_unpacklo_epi8(layer1_chunk0, layer1_chunk2);
|
||||
__m128i layer2_chunk1 = _mm_unpackhi_epi8(layer1_chunk0, layer1_chunk2);
|
||||
__m128i layer2_chunk2 = _mm_unpacklo_epi8(layer1_chunk1, layer1_chunk3);
|
||||
__m128i layer2_chunk3 = _mm_unpackhi_epi8(layer1_chunk1, layer1_chunk3);
|
||||
|
||||
    __m128i layer3_chunk0 = _mm_unpacklo_epi8(layer2_chunk0, layer2_chunk2);
    __m128i layer3_chunk1 = _mm_unpackhi_epi8(layer2_chunk0, layer2_chunk2);
    __m128i layer3_chunk2 = _mm_unpacklo_epi8(layer2_chunk1, layer2_chunk3);
    __m128i layer3_chunk3 = _mm_unpackhi_epi8(layer2_chunk1, layer2_chunk3);

    __m128i layer4_chunk0 = _mm_unpacklo_epi8(layer3_chunk0, layer3_chunk2);
    __m128i layer4_chunk1 = _mm_unpackhi_epi8(layer3_chunk0, layer3_chunk2);
    __m128i layer4_chunk2 = _mm_unpacklo_epi8(layer3_chunk1, layer3_chunk3);
    __m128i layer4_chunk3 = _mm_unpackhi_epi8(layer3_chunk1, layer3_chunk3);

    v_r0 = _mm_unpacklo_epi8(layer4_chunk0, layer4_chunk2);
    v_r1 = _mm_unpackhi_epi8(layer4_chunk0, layer4_chunk2);
    v_g0 = _mm_unpacklo_epi8(layer4_chunk1, layer4_chunk3);
    v_g1 = _mm_unpackhi_epi8(layer4_chunk1, layer4_chunk3);
}

inline void _mm_deinterleave_epi8(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0,
                                  __m128i & v_g1, __m128i & v_b0, __m128i & v_b1)
{
    __m128i layer1_chunk0 = _mm_unpacklo_epi8(v_r0, v_g1);
    __m128i layer1_chunk1 = _mm_unpackhi_epi8(v_r0, v_g1);
    __m128i layer1_chunk2 = _mm_unpacklo_epi8(v_r1, v_b0);
    __m128i layer1_chunk3 = _mm_unpackhi_epi8(v_r1, v_b0);
    __m128i layer1_chunk4 = _mm_unpacklo_epi8(v_g0, v_b1);
    __m128i layer1_chunk5 = _mm_unpackhi_epi8(v_g0, v_b1);

    __m128i layer2_chunk0 = _mm_unpacklo_epi8(layer1_chunk0, layer1_chunk3);
    __m128i layer2_chunk1 = _mm_unpackhi_epi8(layer1_chunk0, layer1_chunk3);
    __m128i layer2_chunk2 = _mm_unpacklo_epi8(layer1_chunk1, layer1_chunk4);
    __m128i layer2_chunk3 = _mm_unpackhi_epi8(layer1_chunk1, layer1_chunk4);
    __m128i layer2_chunk4 = _mm_unpacklo_epi8(layer1_chunk2, layer1_chunk5);
    __m128i layer2_chunk5 = _mm_unpackhi_epi8(layer1_chunk2, layer1_chunk5);

    __m128i layer3_chunk0 = _mm_unpacklo_epi8(layer2_chunk0, layer2_chunk3);
    __m128i layer3_chunk1 = _mm_unpackhi_epi8(layer2_chunk0, layer2_chunk3);
    __m128i layer3_chunk2 = _mm_unpacklo_epi8(layer2_chunk1, layer2_chunk4);
    __m128i layer3_chunk3 = _mm_unpackhi_epi8(layer2_chunk1, layer2_chunk4);
    __m128i layer3_chunk4 = _mm_unpacklo_epi8(layer2_chunk2, layer2_chunk5);
    __m128i layer3_chunk5 = _mm_unpackhi_epi8(layer2_chunk2, layer2_chunk5);

    __m128i layer4_chunk0 = _mm_unpacklo_epi8(layer3_chunk0, layer3_chunk3);
    __m128i layer4_chunk1 = _mm_unpackhi_epi8(layer3_chunk0, layer3_chunk3);
    __m128i layer4_chunk2 = _mm_unpacklo_epi8(layer3_chunk1, layer3_chunk4);
    __m128i layer4_chunk3 = _mm_unpackhi_epi8(layer3_chunk1, layer3_chunk4);
    __m128i layer4_chunk4 = _mm_unpacklo_epi8(layer3_chunk2, layer3_chunk5);
    __m128i layer4_chunk5 = _mm_unpackhi_epi8(layer3_chunk2, layer3_chunk5);

    v_r0 = _mm_unpacklo_epi8(layer4_chunk0, layer4_chunk3);
    v_r1 = _mm_unpackhi_epi8(layer4_chunk0, layer4_chunk3);
    v_g0 = _mm_unpacklo_epi8(layer4_chunk1, layer4_chunk4);
    v_g1 = _mm_unpackhi_epi8(layer4_chunk1, layer4_chunk4);
    v_b0 = _mm_unpacklo_epi8(layer4_chunk2, layer4_chunk5);
    v_b1 = _mm_unpackhi_epi8(layer4_chunk2, layer4_chunk5);
}

inline void _mm_deinterleave_epi8(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0, __m128i & v_g1,
                                  __m128i & v_b0, __m128i & v_b1, __m128i & v_a0, __m128i & v_a1)
{
    __m128i layer1_chunk0 = _mm_unpacklo_epi8(v_r0, v_b0);
    __m128i layer1_chunk1 = _mm_unpackhi_epi8(v_r0, v_b0);
    __m128i layer1_chunk2 = _mm_unpacklo_epi8(v_r1, v_b1);
    __m128i layer1_chunk3 = _mm_unpackhi_epi8(v_r1, v_b1);
    __m128i layer1_chunk4 = _mm_unpacklo_epi8(v_g0, v_a0);
    __m128i layer1_chunk5 = _mm_unpackhi_epi8(v_g0, v_a0);
    __m128i layer1_chunk6 = _mm_unpacklo_epi8(v_g1, v_a1);
    __m128i layer1_chunk7 = _mm_unpackhi_epi8(v_g1, v_a1);

    __m128i layer2_chunk0 = _mm_unpacklo_epi8(layer1_chunk0, layer1_chunk4);
    __m128i layer2_chunk1 = _mm_unpackhi_epi8(layer1_chunk0, layer1_chunk4);
    __m128i layer2_chunk2 = _mm_unpacklo_epi8(layer1_chunk1, layer1_chunk5);
    __m128i layer2_chunk3 = _mm_unpackhi_epi8(layer1_chunk1, layer1_chunk5);
    __m128i layer2_chunk4 = _mm_unpacklo_epi8(layer1_chunk2, layer1_chunk6);
    __m128i layer2_chunk5 = _mm_unpackhi_epi8(layer1_chunk2, layer1_chunk6);
    __m128i layer2_chunk6 = _mm_unpacklo_epi8(layer1_chunk3, layer1_chunk7);
    __m128i layer2_chunk7 = _mm_unpackhi_epi8(layer1_chunk3, layer1_chunk7);

    __m128i layer3_chunk0 = _mm_unpacklo_epi8(layer2_chunk0, layer2_chunk4);
    __m128i layer3_chunk1 = _mm_unpackhi_epi8(layer2_chunk0, layer2_chunk4);
    __m128i layer3_chunk2 = _mm_unpacklo_epi8(layer2_chunk1, layer2_chunk5);
    __m128i layer3_chunk3 = _mm_unpackhi_epi8(layer2_chunk1, layer2_chunk5);
    __m128i layer3_chunk4 = _mm_unpacklo_epi8(layer2_chunk2, layer2_chunk6);
    __m128i layer3_chunk5 = _mm_unpackhi_epi8(layer2_chunk2, layer2_chunk6);
    __m128i layer3_chunk6 = _mm_unpacklo_epi8(layer2_chunk3, layer2_chunk7);
    __m128i layer3_chunk7 = _mm_unpackhi_epi8(layer2_chunk3, layer2_chunk7);

    __m128i layer4_chunk0 = _mm_unpacklo_epi8(layer3_chunk0, layer3_chunk4);
    __m128i layer4_chunk1 = _mm_unpackhi_epi8(layer3_chunk0, layer3_chunk4);
    __m128i layer4_chunk2 = _mm_unpacklo_epi8(layer3_chunk1, layer3_chunk5);
    __m128i layer4_chunk3 = _mm_unpackhi_epi8(layer3_chunk1, layer3_chunk5);
    __m128i layer4_chunk4 = _mm_unpacklo_epi8(layer3_chunk2, layer3_chunk6);
    __m128i layer4_chunk5 = _mm_unpackhi_epi8(layer3_chunk2, layer3_chunk6);
    __m128i layer4_chunk6 = _mm_unpacklo_epi8(layer3_chunk3, layer3_chunk7);
    __m128i layer4_chunk7 = _mm_unpackhi_epi8(layer3_chunk3, layer3_chunk7);

    v_r0 = _mm_unpacklo_epi8(layer4_chunk0, layer4_chunk4);
    v_r1 = _mm_unpackhi_epi8(layer4_chunk0, layer4_chunk4);
    v_g0 = _mm_unpacklo_epi8(layer4_chunk1, layer4_chunk5);
    v_g1 = _mm_unpackhi_epi8(layer4_chunk1, layer4_chunk5);
    v_b0 = _mm_unpacklo_epi8(layer4_chunk2, layer4_chunk6);
    v_b1 = _mm_unpackhi_epi8(layer4_chunk2, layer4_chunk6);
    v_a0 = _mm_unpacklo_epi8(layer4_chunk3, layer4_chunk7);
    v_a1 = _mm_unpackhi_epi8(layer4_chunk3, layer4_chunk7);
}
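A minimal usage sketch (illustrative only, not part of this patch): the deinterleave helpers work in place on registers that have been filled with consecutive interleaved data in argument order. The pointer name, the unaligned loads and the exact channel-to-register mapping below are assumptions made for the example and should be verified against the caller's data layout.

// Illustrative sketch, assuming SSE2 and 96 bytes of interleaved B,G,R data (32 pixels).
inline void example_split_bgr32(const unsigned char* src, __m128i& b0, __m128i& b1,
                                __m128i& g0, __m128i& g1, __m128i& r0, __m128i& r1)
{
    // Load six consecutive 16-byte blocks of the interleaved stream.
    b0 = _mm_loadu_si128((const __m128i*)(src));
    b1 = _mm_loadu_si128((const __m128i*)(src + 16));
    g0 = _mm_loadu_si128((const __m128i*)(src + 32));
    g1 = _mm_loadu_si128((const __m128i*)(src + 48));
    r0 = _mm_loadu_si128((const __m128i*)(src + 64));
    r1 = _mm_loadu_si128((const __m128i*)(src + 80));
    // In-place shuffle: each register pair is expected to end up holding one
    // complete channel plane, in the helper's argument order.
    _mm_deinterleave_epi8(b0, b1, g0, g1, r0, r1);
}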
inline void _mm_interleave_epi8(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0, __m128i & v_g1)
|
||||
{
|
||||
__m128i v_mask = _mm_set1_epi16(0x00ff);
|
||||
|
||||
__m128i layer4_chunk0 = _mm_packus_epi16(_mm_and_si128(v_r0, v_mask), _mm_and_si128(v_r1, v_mask));
|
||||
__m128i layer4_chunk2 = _mm_packus_epi16(_mm_srli_epi16(v_r0, 8), _mm_srli_epi16(v_r1, 8));
|
||||
__m128i layer4_chunk1 = _mm_packus_epi16(_mm_and_si128(v_g0, v_mask), _mm_and_si128(v_g1, v_mask));
|
||||
__m128i layer4_chunk3 = _mm_packus_epi16(_mm_srli_epi16(v_g0, 8), _mm_srli_epi16(v_g1, 8));
|
||||
|
||||
__m128i layer3_chunk0 = _mm_packus_epi16(_mm_and_si128(layer4_chunk0, v_mask), _mm_and_si128(layer4_chunk1, v_mask));
|
||||
__m128i layer3_chunk2 = _mm_packus_epi16(_mm_srli_epi16(layer4_chunk0, 8), _mm_srli_epi16(layer4_chunk1, 8));
|
||||
__m128i layer3_chunk1 = _mm_packus_epi16(_mm_and_si128(layer4_chunk2, v_mask), _mm_and_si128(layer4_chunk3, v_mask));
|
||||
__m128i layer3_chunk3 = _mm_packus_epi16(_mm_srli_epi16(layer4_chunk2, 8), _mm_srli_epi16(layer4_chunk3, 8));
|
||||
|
||||
__m128i layer2_chunk0 = _mm_packus_epi16(_mm_and_si128(layer3_chunk0, v_mask), _mm_and_si128(layer3_chunk1, v_mask));
|
||||
__m128i layer2_chunk2 = _mm_packus_epi16(_mm_srli_epi16(layer3_chunk0, 8), _mm_srli_epi16(layer3_chunk1, 8));
|
||||
__m128i layer2_chunk1 = _mm_packus_epi16(_mm_and_si128(layer3_chunk2, v_mask), _mm_and_si128(layer3_chunk3, v_mask));
|
||||
__m128i layer2_chunk3 = _mm_packus_epi16(_mm_srli_epi16(layer3_chunk2, 8), _mm_srli_epi16(layer3_chunk3, 8));
|
||||
|
||||
__m128i layer1_chunk0 = _mm_packus_epi16(_mm_and_si128(layer2_chunk0, v_mask), _mm_and_si128(layer2_chunk1, v_mask));
|
||||
__m128i layer1_chunk2 = _mm_packus_epi16(_mm_srli_epi16(layer2_chunk0, 8), _mm_srli_epi16(layer2_chunk1, 8));
|
||||
__m128i layer1_chunk1 = _mm_packus_epi16(_mm_and_si128(layer2_chunk2, v_mask), _mm_and_si128(layer2_chunk3, v_mask));
|
||||
__m128i layer1_chunk3 = _mm_packus_epi16(_mm_srli_epi16(layer2_chunk2, 8), _mm_srli_epi16(layer2_chunk3, 8));
|
||||
|
||||
v_r0 = _mm_packus_epi16(_mm_and_si128(layer1_chunk0, v_mask), _mm_and_si128(layer1_chunk1, v_mask));
|
||||
v_g0 = _mm_packus_epi16(_mm_srli_epi16(layer1_chunk0, 8), _mm_srli_epi16(layer1_chunk1, 8));
|
||||
v_r1 = _mm_packus_epi16(_mm_and_si128(layer1_chunk2, v_mask), _mm_and_si128(layer1_chunk3, v_mask));
|
||||
v_g1 = _mm_packus_epi16(_mm_srli_epi16(layer1_chunk2, 8), _mm_srli_epi16(layer1_chunk3, 8));
|
||||
}
|
||||
|
||||
inline void _mm_interleave_epi8(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0,
|
||||
__m128i & v_g1, __m128i & v_b0, __m128i & v_b1)
|
||||
{
|
||||
__m128i v_mask = _mm_set1_epi16(0x00ff);
|
||||
|
||||
__m128i layer4_chunk0 = _mm_packus_epi16(_mm_and_si128(v_r0, v_mask), _mm_and_si128(v_r1, v_mask));
|
||||
__m128i layer4_chunk3 = _mm_packus_epi16(_mm_srli_epi16(v_r0, 8), _mm_srli_epi16(v_r1, 8));
|
||||
__m128i layer4_chunk1 = _mm_packus_epi16(_mm_and_si128(v_g0, v_mask), _mm_and_si128(v_g1, v_mask));
|
||||
__m128i layer4_chunk4 = _mm_packus_epi16(_mm_srli_epi16(v_g0, 8), _mm_srli_epi16(v_g1, 8));
|
||||
__m128i layer4_chunk2 = _mm_packus_epi16(_mm_and_si128(v_b0, v_mask), _mm_and_si128(v_b1, v_mask));
|
||||
__m128i layer4_chunk5 = _mm_packus_epi16(_mm_srli_epi16(v_b0, 8), _mm_srli_epi16(v_b1, 8));
|
||||
|
||||
__m128i layer3_chunk0 = _mm_packus_epi16(_mm_and_si128(layer4_chunk0, v_mask), _mm_and_si128(layer4_chunk1, v_mask));
|
||||
__m128i layer3_chunk3 = _mm_packus_epi16(_mm_srli_epi16(layer4_chunk0, 8), _mm_srli_epi16(layer4_chunk1, 8));
|
||||
__m128i layer3_chunk1 = _mm_packus_epi16(_mm_and_si128(layer4_chunk2, v_mask), _mm_and_si128(layer4_chunk3, v_mask));
|
||||
__m128i layer3_chunk4 = _mm_packus_epi16(_mm_srli_epi16(layer4_chunk2, 8), _mm_srli_epi16(layer4_chunk3, 8));
|
||||
__m128i layer3_chunk2 = _mm_packus_epi16(_mm_and_si128(layer4_chunk4, v_mask), _mm_and_si128(layer4_chunk5, v_mask));
|
||||
__m128i layer3_chunk5 = _mm_packus_epi16(_mm_srli_epi16(layer4_chunk4, 8), _mm_srli_epi16(layer4_chunk5, 8));
|
||||
|
||||
__m128i layer2_chunk0 = _mm_packus_epi16(_mm_and_si128(layer3_chunk0, v_mask), _mm_and_si128(layer3_chunk1, v_mask));
|
||||
__m128i layer2_chunk3 = _mm_packus_epi16(_mm_srli_epi16(layer3_chunk0, 8), _mm_srli_epi16(layer3_chunk1, 8));
|
||||
__m128i layer2_chunk1 = _mm_packus_epi16(_mm_and_si128(layer3_chunk2, v_mask), _mm_and_si128(layer3_chunk3, v_mask));
|
||||
__m128i layer2_chunk4 = _mm_packus_epi16(_mm_srli_epi16(layer3_chunk2, 8), _mm_srli_epi16(layer3_chunk3, 8));
|
||||
__m128i layer2_chunk2 = _mm_packus_epi16(_mm_and_si128(layer3_chunk4, v_mask), _mm_and_si128(layer3_chunk5, v_mask));
|
||||
__m128i layer2_chunk5 = _mm_packus_epi16(_mm_srli_epi16(layer3_chunk4, 8), _mm_srli_epi16(layer3_chunk5, 8));
|
||||
|
||||
__m128i layer1_chunk0 = _mm_packus_epi16(_mm_and_si128(layer2_chunk0, v_mask), _mm_and_si128(layer2_chunk1, v_mask));
|
||||
__m128i layer1_chunk3 = _mm_packus_epi16(_mm_srli_epi16(layer2_chunk0, 8), _mm_srli_epi16(layer2_chunk1, 8));
|
||||
__m128i layer1_chunk1 = _mm_packus_epi16(_mm_and_si128(layer2_chunk2, v_mask), _mm_and_si128(layer2_chunk3, v_mask));
|
||||
__m128i layer1_chunk4 = _mm_packus_epi16(_mm_srli_epi16(layer2_chunk2, 8), _mm_srli_epi16(layer2_chunk3, 8));
|
||||
__m128i layer1_chunk2 = _mm_packus_epi16(_mm_and_si128(layer2_chunk4, v_mask), _mm_and_si128(layer2_chunk5, v_mask));
|
||||
__m128i layer1_chunk5 = _mm_packus_epi16(_mm_srli_epi16(layer2_chunk4, 8), _mm_srli_epi16(layer2_chunk5, 8));
|
||||
|
||||
v_r0 = _mm_packus_epi16(_mm_and_si128(layer1_chunk0, v_mask), _mm_and_si128(layer1_chunk1, v_mask));
|
||||
v_g1 = _mm_packus_epi16(_mm_srli_epi16(layer1_chunk0, 8), _mm_srli_epi16(layer1_chunk1, 8));
|
||||
v_r1 = _mm_packus_epi16(_mm_and_si128(layer1_chunk2, v_mask), _mm_and_si128(layer1_chunk3, v_mask));
|
||||
v_b0 = _mm_packus_epi16(_mm_srli_epi16(layer1_chunk2, 8), _mm_srli_epi16(layer1_chunk3, 8));
|
||||
v_g0 = _mm_packus_epi16(_mm_and_si128(layer1_chunk4, v_mask), _mm_and_si128(layer1_chunk5, v_mask));
|
||||
v_b1 = _mm_packus_epi16(_mm_srli_epi16(layer1_chunk4, 8), _mm_srli_epi16(layer1_chunk5, 8));
|
||||
}
|
||||
|
||||
inline void _mm_interleave_epi8(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0, __m128i & v_g1,
|
||||
__m128i & v_b0, __m128i & v_b1, __m128i & v_a0, __m128i & v_a1)
|
||||
{
|
||||
__m128i v_mask = _mm_set1_epi16(0x00ff);
|
||||
|
||||
__m128i layer4_chunk0 = _mm_packus_epi16(_mm_and_si128(v_r0, v_mask), _mm_and_si128(v_r1, v_mask));
|
||||
__m128i layer4_chunk4 = _mm_packus_epi16(_mm_srli_epi16(v_r0, 8), _mm_srli_epi16(v_r1, 8));
|
||||
__m128i layer4_chunk1 = _mm_packus_epi16(_mm_and_si128(v_g0, v_mask), _mm_and_si128(v_g1, v_mask));
|
||||
__m128i layer4_chunk5 = _mm_packus_epi16(_mm_srli_epi16(v_g0, 8), _mm_srli_epi16(v_g1, 8));
|
||||
__m128i layer4_chunk2 = _mm_packus_epi16(_mm_and_si128(v_b0, v_mask), _mm_and_si128(v_b1, v_mask));
|
||||
__m128i layer4_chunk6 = _mm_packus_epi16(_mm_srli_epi16(v_b0, 8), _mm_srli_epi16(v_b1, 8));
|
||||
__m128i layer4_chunk3 = _mm_packus_epi16(_mm_and_si128(v_a0, v_mask), _mm_and_si128(v_a1, v_mask));
|
||||
__m128i layer4_chunk7 = _mm_packus_epi16(_mm_srli_epi16(v_a0, 8), _mm_srli_epi16(v_a1, 8));
|
||||
|
||||
__m128i layer3_chunk0 = _mm_packus_epi16(_mm_and_si128(layer4_chunk0, v_mask), _mm_and_si128(layer4_chunk1, v_mask));
|
||||
__m128i layer3_chunk4 = _mm_packus_epi16(_mm_srli_epi16(layer4_chunk0, 8), _mm_srli_epi16(layer4_chunk1, 8));
|
||||
__m128i layer3_chunk1 = _mm_packus_epi16(_mm_and_si128(layer4_chunk2, v_mask), _mm_and_si128(layer4_chunk3, v_mask));
|
||||
__m128i layer3_chunk5 = _mm_packus_epi16(_mm_srli_epi16(layer4_chunk2, 8), _mm_srli_epi16(layer4_chunk3, 8));
|
||||
__m128i layer3_chunk2 = _mm_packus_epi16(_mm_and_si128(layer4_chunk4, v_mask), _mm_and_si128(layer4_chunk5, v_mask));
|
||||
__m128i layer3_chunk6 = _mm_packus_epi16(_mm_srli_epi16(layer4_chunk4, 8), _mm_srli_epi16(layer4_chunk5, 8));
|
||||
__m128i layer3_chunk3 = _mm_packus_epi16(_mm_and_si128(layer4_chunk6, v_mask), _mm_and_si128(layer4_chunk7, v_mask));
|
||||
__m128i layer3_chunk7 = _mm_packus_epi16(_mm_srli_epi16(layer4_chunk6, 8), _mm_srli_epi16(layer4_chunk7, 8));
|
||||
|
||||
__m128i layer2_chunk0 = _mm_packus_epi16(_mm_and_si128(layer3_chunk0, v_mask), _mm_and_si128(layer3_chunk1, v_mask));
|
||||
__m128i layer2_chunk4 = _mm_packus_epi16(_mm_srli_epi16(layer3_chunk0, 8), _mm_srli_epi16(layer3_chunk1, 8));
|
||||
__m128i layer2_chunk1 = _mm_packus_epi16(_mm_and_si128(layer3_chunk2, v_mask), _mm_and_si128(layer3_chunk3, v_mask));
|
||||
__m128i layer2_chunk5 = _mm_packus_epi16(_mm_srli_epi16(layer3_chunk2, 8), _mm_srli_epi16(layer3_chunk3, 8));
|
||||
__m128i layer2_chunk2 = _mm_packus_epi16(_mm_and_si128(layer3_chunk4, v_mask), _mm_and_si128(layer3_chunk5, v_mask));
|
||||
__m128i layer2_chunk6 = _mm_packus_epi16(_mm_srli_epi16(layer3_chunk4, 8), _mm_srli_epi16(layer3_chunk5, 8));
|
||||
__m128i layer2_chunk3 = _mm_packus_epi16(_mm_and_si128(layer3_chunk6, v_mask), _mm_and_si128(layer3_chunk7, v_mask));
|
||||
__m128i layer2_chunk7 = _mm_packus_epi16(_mm_srli_epi16(layer3_chunk6, 8), _mm_srli_epi16(layer3_chunk7, 8));
|
||||
|
||||
__m128i layer1_chunk0 = _mm_packus_epi16(_mm_and_si128(layer2_chunk0, v_mask), _mm_and_si128(layer2_chunk1, v_mask));
|
||||
__m128i layer1_chunk4 = _mm_packus_epi16(_mm_srli_epi16(layer2_chunk0, 8), _mm_srli_epi16(layer2_chunk1, 8));
|
||||
__m128i layer1_chunk1 = _mm_packus_epi16(_mm_and_si128(layer2_chunk2, v_mask), _mm_and_si128(layer2_chunk3, v_mask));
|
||||
__m128i layer1_chunk5 = _mm_packus_epi16(_mm_srli_epi16(layer2_chunk2, 8), _mm_srli_epi16(layer2_chunk3, 8));
|
||||
__m128i layer1_chunk2 = _mm_packus_epi16(_mm_and_si128(layer2_chunk4, v_mask), _mm_and_si128(layer2_chunk5, v_mask));
|
||||
__m128i layer1_chunk6 = _mm_packus_epi16(_mm_srli_epi16(layer2_chunk4, 8), _mm_srli_epi16(layer2_chunk5, 8));
|
||||
__m128i layer1_chunk3 = _mm_packus_epi16(_mm_and_si128(layer2_chunk6, v_mask), _mm_and_si128(layer2_chunk7, v_mask));
|
||||
__m128i layer1_chunk7 = _mm_packus_epi16(_mm_srli_epi16(layer2_chunk6, 8), _mm_srli_epi16(layer2_chunk7, 8));
|
||||
|
||||
v_r0 = _mm_packus_epi16(_mm_and_si128(layer1_chunk0, v_mask), _mm_and_si128(layer1_chunk1, v_mask));
|
||||
v_b0 = _mm_packus_epi16(_mm_srli_epi16(layer1_chunk0, 8), _mm_srli_epi16(layer1_chunk1, 8));
|
||||
v_r1 = _mm_packus_epi16(_mm_and_si128(layer1_chunk2, v_mask), _mm_and_si128(layer1_chunk3, v_mask));
|
||||
v_b1 = _mm_packus_epi16(_mm_srli_epi16(layer1_chunk2, 8), _mm_srli_epi16(layer1_chunk3, 8));
|
||||
v_g0 = _mm_packus_epi16(_mm_and_si128(layer1_chunk4, v_mask), _mm_and_si128(layer1_chunk5, v_mask));
|
||||
v_a0 = _mm_packus_epi16(_mm_srli_epi16(layer1_chunk4, 8), _mm_srli_epi16(layer1_chunk5, 8));
|
||||
v_g1 = _mm_packus_epi16(_mm_and_si128(layer1_chunk6, v_mask), _mm_and_si128(layer1_chunk7, v_mask));
|
||||
v_a1 = _mm_packus_epi16(_mm_srli_epi16(layer1_chunk6, 8), _mm_srli_epi16(layer1_chunk7, 8));
|
||||
}
|
||||
|
||||
inline void _mm_deinterleave_epi16(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0, __m128i & v_g1)
|
||||
{
|
||||
__m128i layer1_chunk0 = _mm_unpacklo_epi16(v_r0, v_g0);
|
||||
__m128i layer1_chunk1 = _mm_unpackhi_epi16(v_r0, v_g0);
|
||||
__m128i layer1_chunk2 = _mm_unpacklo_epi16(v_r1, v_g1);
|
||||
__m128i layer1_chunk3 = _mm_unpackhi_epi16(v_r1, v_g1);
|
||||
|
||||
__m128i layer2_chunk0 = _mm_unpacklo_epi16(layer1_chunk0, layer1_chunk2);
|
||||
__m128i layer2_chunk1 = _mm_unpackhi_epi16(layer1_chunk0, layer1_chunk2);
|
||||
__m128i layer2_chunk2 = _mm_unpacklo_epi16(layer1_chunk1, layer1_chunk3);
|
||||
__m128i layer2_chunk3 = _mm_unpackhi_epi16(layer1_chunk1, layer1_chunk3);
|
||||
|
||||
__m128i layer3_chunk0 = _mm_unpacklo_epi16(layer2_chunk0, layer2_chunk2);
|
||||
__m128i layer3_chunk1 = _mm_unpackhi_epi16(layer2_chunk0, layer2_chunk2);
|
||||
__m128i layer3_chunk2 = _mm_unpacklo_epi16(layer2_chunk1, layer2_chunk3);
|
||||
__m128i layer3_chunk3 = _mm_unpackhi_epi16(layer2_chunk1, layer2_chunk3);
|
||||
|
||||
v_r0 = _mm_unpacklo_epi16(layer3_chunk0, layer3_chunk2);
|
||||
v_r1 = _mm_unpackhi_epi16(layer3_chunk0, layer3_chunk2);
|
||||
v_g0 = _mm_unpacklo_epi16(layer3_chunk1, layer3_chunk3);
|
||||
v_g1 = _mm_unpackhi_epi16(layer3_chunk1, layer3_chunk3);
|
||||
}
|
||||
|
||||
inline void _mm_deinterleave_epi16(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0,
|
||||
__m128i & v_g1, __m128i & v_b0, __m128i & v_b1)
|
||||
{
|
||||
__m128i layer1_chunk0 = _mm_unpacklo_epi16(v_r0, v_g1);
|
||||
__m128i layer1_chunk1 = _mm_unpackhi_epi16(v_r0, v_g1);
|
||||
__m128i layer1_chunk2 = _mm_unpacklo_epi16(v_r1, v_b0);
|
||||
__m128i layer1_chunk3 = _mm_unpackhi_epi16(v_r1, v_b0);
|
||||
__m128i layer1_chunk4 = _mm_unpacklo_epi16(v_g0, v_b1);
|
||||
__m128i layer1_chunk5 = _mm_unpackhi_epi16(v_g0, v_b1);
|
||||
|
||||
__m128i layer2_chunk0 = _mm_unpacklo_epi16(layer1_chunk0, layer1_chunk3);
|
||||
__m128i layer2_chunk1 = _mm_unpackhi_epi16(layer1_chunk0, layer1_chunk3);
|
||||
__m128i layer2_chunk2 = _mm_unpacklo_epi16(layer1_chunk1, layer1_chunk4);
|
||||
__m128i layer2_chunk3 = _mm_unpackhi_epi16(layer1_chunk1, layer1_chunk4);
|
||||
__m128i layer2_chunk4 = _mm_unpacklo_epi16(layer1_chunk2, layer1_chunk5);
|
||||
__m128i layer2_chunk5 = _mm_unpackhi_epi16(layer1_chunk2, layer1_chunk5);
|
||||
|
||||
__m128i layer3_chunk0 = _mm_unpacklo_epi16(layer2_chunk0, layer2_chunk3);
|
||||
__m128i layer3_chunk1 = _mm_unpackhi_epi16(layer2_chunk0, layer2_chunk3);
|
||||
__m128i layer3_chunk2 = _mm_unpacklo_epi16(layer2_chunk1, layer2_chunk4);
|
||||
__m128i layer3_chunk3 = _mm_unpackhi_epi16(layer2_chunk1, layer2_chunk4);
|
||||
__m128i layer3_chunk4 = _mm_unpacklo_epi16(layer2_chunk2, layer2_chunk5);
|
||||
__m128i layer3_chunk5 = _mm_unpackhi_epi16(layer2_chunk2, layer2_chunk5);
|
||||
|
||||
v_r0 = _mm_unpacklo_epi16(layer3_chunk0, layer3_chunk3);
|
||||
v_r1 = _mm_unpackhi_epi16(layer3_chunk0, layer3_chunk3);
|
||||
v_g0 = _mm_unpacklo_epi16(layer3_chunk1, layer3_chunk4);
|
||||
v_g1 = _mm_unpackhi_epi16(layer3_chunk1, layer3_chunk4);
|
||||
v_b0 = _mm_unpacklo_epi16(layer3_chunk2, layer3_chunk5);
|
||||
v_b1 = _mm_unpackhi_epi16(layer3_chunk2, layer3_chunk5);
|
||||
}
|
||||
|
||||
inline void _mm_deinterleave_epi16(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0, __m128i & v_g1,
|
||||
__m128i & v_b0, __m128i & v_b1, __m128i & v_a0, __m128i & v_a1)
|
||||
{
|
||||
__m128i layer1_chunk0 = _mm_unpacklo_epi16(v_r0, v_b0);
|
||||
__m128i layer1_chunk1 = _mm_unpackhi_epi16(v_r0, v_b0);
|
||||
__m128i layer1_chunk2 = _mm_unpacklo_epi16(v_r1, v_b1);
|
||||
__m128i layer1_chunk3 = _mm_unpackhi_epi16(v_r1, v_b1);
|
||||
__m128i layer1_chunk4 = _mm_unpacklo_epi16(v_g0, v_a0);
|
||||
__m128i layer1_chunk5 = _mm_unpackhi_epi16(v_g0, v_a0);
|
||||
__m128i layer1_chunk6 = _mm_unpacklo_epi16(v_g1, v_a1);
|
||||
__m128i layer1_chunk7 = _mm_unpackhi_epi16(v_g1, v_a1);
|
||||
|
||||
__m128i layer2_chunk0 = _mm_unpacklo_epi16(layer1_chunk0, layer1_chunk4);
|
||||
__m128i layer2_chunk1 = _mm_unpackhi_epi16(layer1_chunk0, layer1_chunk4);
|
||||
__m128i layer2_chunk2 = _mm_unpacklo_epi16(layer1_chunk1, layer1_chunk5);
|
||||
__m128i layer2_chunk3 = _mm_unpackhi_epi16(layer1_chunk1, layer1_chunk5);
|
||||
__m128i layer2_chunk4 = _mm_unpacklo_epi16(layer1_chunk2, layer1_chunk6);
|
||||
__m128i layer2_chunk5 = _mm_unpackhi_epi16(layer1_chunk2, layer1_chunk6);
|
||||
__m128i layer2_chunk6 = _mm_unpacklo_epi16(layer1_chunk3, layer1_chunk7);
|
||||
__m128i layer2_chunk7 = _mm_unpackhi_epi16(layer1_chunk3, layer1_chunk7);
|
||||
|
||||
__m128i layer3_chunk0 = _mm_unpacklo_epi16(layer2_chunk0, layer2_chunk4);
|
||||
__m128i layer3_chunk1 = _mm_unpackhi_epi16(layer2_chunk0, layer2_chunk4);
|
||||
__m128i layer3_chunk2 = _mm_unpacklo_epi16(layer2_chunk1, layer2_chunk5);
|
||||
__m128i layer3_chunk3 = _mm_unpackhi_epi16(layer2_chunk1, layer2_chunk5);
|
||||
__m128i layer3_chunk4 = _mm_unpacklo_epi16(layer2_chunk2, layer2_chunk6);
|
||||
__m128i layer3_chunk5 = _mm_unpackhi_epi16(layer2_chunk2, layer2_chunk6);
|
||||
__m128i layer3_chunk6 = _mm_unpacklo_epi16(layer2_chunk3, layer2_chunk7);
|
||||
__m128i layer3_chunk7 = _mm_unpackhi_epi16(layer2_chunk3, layer2_chunk7);
|
||||
|
||||
v_r0 = _mm_unpacklo_epi16(layer3_chunk0, layer3_chunk4);
|
||||
v_r1 = _mm_unpackhi_epi16(layer3_chunk0, layer3_chunk4);
|
||||
v_g0 = _mm_unpacklo_epi16(layer3_chunk1, layer3_chunk5);
|
||||
v_g1 = _mm_unpackhi_epi16(layer3_chunk1, layer3_chunk5);
|
||||
v_b0 = _mm_unpacklo_epi16(layer3_chunk2, layer3_chunk6);
|
||||
v_b1 = _mm_unpackhi_epi16(layer3_chunk2, layer3_chunk6);
|
||||
v_a0 = _mm_unpacklo_epi16(layer3_chunk3, layer3_chunk7);
|
||||
v_a1 = _mm_unpackhi_epi16(layer3_chunk3, layer3_chunk7);
|
||||
}
|
||||
|
||||
#if CV_SSE4_1
|
||||
|
||||
inline void _mm_interleave_epi16(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0, __m128i & v_g1)
|
||||
{
|
||||
__m128i v_mask = _mm_set1_epi32(0x0000ffff);
|
||||
|
||||
__m128i layer3_chunk0 = _mm_packus_epi32(_mm_and_si128(v_r0, v_mask), _mm_and_si128(v_r1, v_mask));
|
||||
__m128i layer3_chunk2 = _mm_packus_epi32(_mm_srli_epi32(v_r0, 16), _mm_srli_epi32(v_r1, 16));
|
||||
__m128i layer3_chunk1 = _mm_packus_epi32(_mm_and_si128(v_g0, v_mask), _mm_and_si128(v_g1, v_mask));
|
||||
__m128i layer3_chunk3 = _mm_packus_epi32(_mm_srli_epi32(v_g0, 16), _mm_srli_epi32(v_g1, 16));
|
||||
|
||||
__m128i layer2_chunk0 = _mm_packus_epi32(_mm_and_si128(layer3_chunk0, v_mask), _mm_and_si128(layer3_chunk1, v_mask));
|
||||
__m128i layer2_chunk2 = _mm_packus_epi32(_mm_srli_epi32(layer3_chunk0, 16), _mm_srli_epi32(layer3_chunk1, 16));
|
||||
__m128i layer2_chunk1 = _mm_packus_epi32(_mm_and_si128(layer3_chunk2, v_mask), _mm_and_si128(layer3_chunk3, v_mask));
|
||||
__m128i layer2_chunk3 = _mm_packus_epi32(_mm_srli_epi32(layer3_chunk2, 16), _mm_srli_epi32(layer3_chunk3, 16));
|
||||
|
||||
__m128i layer1_chunk0 = _mm_packus_epi32(_mm_and_si128(layer2_chunk0, v_mask), _mm_and_si128(layer2_chunk1, v_mask));
|
||||
__m128i layer1_chunk2 = _mm_packus_epi32(_mm_srli_epi32(layer2_chunk0, 16), _mm_srli_epi32(layer2_chunk1, 16));
|
||||
__m128i layer1_chunk1 = _mm_packus_epi32(_mm_and_si128(layer2_chunk2, v_mask), _mm_and_si128(layer2_chunk3, v_mask));
|
||||
__m128i layer1_chunk3 = _mm_packus_epi32(_mm_srli_epi32(layer2_chunk2, 16), _mm_srli_epi32(layer2_chunk3, 16));
|
||||
|
||||
v_r0 = _mm_packus_epi32(_mm_and_si128(layer1_chunk0, v_mask), _mm_and_si128(layer1_chunk1, v_mask));
|
||||
v_g0 = _mm_packus_epi32(_mm_srli_epi32(layer1_chunk0, 16), _mm_srli_epi32(layer1_chunk1, 16));
|
||||
v_r1 = _mm_packus_epi32(_mm_and_si128(layer1_chunk2, v_mask), _mm_and_si128(layer1_chunk3, v_mask));
|
||||
v_g1 = _mm_packus_epi32(_mm_srli_epi32(layer1_chunk2, 16), _mm_srli_epi32(layer1_chunk3, 16));
|
||||
}
|
||||
|
||||
inline void _mm_interleave_epi16(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0,
|
||||
__m128i & v_g1, __m128i & v_b0, __m128i & v_b1)
|
||||
{
|
||||
__m128i v_mask = _mm_set1_epi32(0x0000ffff);
|
||||
|
||||
__m128i layer3_chunk0 = _mm_packus_epi32(_mm_and_si128(v_r0, v_mask), _mm_and_si128(v_r1, v_mask));
|
||||
__m128i layer3_chunk3 = _mm_packus_epi32(_mm_srli_epi32(v_r0, 16), _mm_srli_epi32(v_r1, 16));
|
||||
__m128i layer3_chunk1 = _mm_packus_epi32(_mm_and_si128(v_g0, v_mask), _mm_and_si128(v_g1, v_mask));
|
||||
__m128i layer3_chunk4 = _mm_packus_epi32(_mm_srli_epi32(v_g0, 16), _mm_srli_epi32(v_g1, 16));
|
||||
__m128i layer3_chunk2 = _mm_packus_epi32(_mm_and_si128(v_b0, v_mask), _mm_and_si128(v_b1, v_mask));
|
||||
__m128i layer3_chunk5 = _mm_packus_epi32(_mm_srli_epi32(v_b0, 16), _mm_srli_epi32(v_b1, 16));
|
||||
|
||||
__m128i layer2_chunk0 = _mm_packus_epi32(_mm_and_si128(layer3_chunk0, v_mask), _mm_and_si128(layer3_chunk1, v_mask));
|
||||
__m128i layer2_chunk3 = _mm_packus_epi32(_mm_srli_epi32(layer3_chunk0, 16), _mm_srli_epi32(layer3_chunk1, 16));
|
||||
__m128i layer2_chunk1 = _mm_packus_epi32(_mm_and_si128(layer3_chunk2, v_mask), _mm_and_si128(layer3_chunk3, v_mask));
|
||||
__m128i layer2_chunk4 = _mm_packus_epi32(_mm_srli_epi32(layer3_chunk2, 16), _mm_srli_epi32(layer3_chunk3, 16));
|
||||
__m128i layer2_chunk2 = _mm_packus_epi32(_mm_and_si128(layer3_chunk4, v_mask), _mm_and_si128(layer3_chunk5, v_mask));
|
||||
__m128i layer2_chunk5 = _mm_packus_epi32(_mm_srli_epi32(layer3_chunk4, 16), _mm_srli_epi32(layer3_chunk5, 16));
|
||||
|
||||
__m128i layer1_chunk0 = _mm_packus_epi32(_mm_and_si128(layer2_chunk0, v_mask), _mm_and_si128(layer2_chunk1, v_mask));
|
||||
__m128i layer1_chunk3 = _mm_packus_epi32(_mm_srli_epi32(layer2_chunk0, 16), _mm_srli_epi32(layer2_chunk1, 16));
|
||||
__m128i layer1_chunk1 = _mm_packus_epi32(_mm_and_si128(layer2_chunk2, v_mask), _mm_and_si128(layer2_chunk3, v_mask));
|
||||
__m128i layer1_chunk4 = _mm_packus_epi32(_mm_srli_epi32(layer2_chunk2, 16), _mm_srli_epi32(layer2_chunk3, 16));
|
||||
__m128i layer1_chunk2 = _mm_packus_epi32(_mm_and_si128(layer2_chunk4, v_mask), _mm_and_si128(layer2_chunk5, v_mask));
|
||||
__m128i layer1_chunk5 = _mm_packus_epi32(_mm_srli_epi32(layer2_chunk4, 16), _mm_srli_epi32(layer2_chunk5, 16));
|
||||
|
||||
v_r0 = _mm_packus_epi32(_mm_and_si128(layer1_chunk0, v_mask), _mm_and_si128(layer1_chunk1, v_mask));
|
||||
v_g1 = _mm_packus_epi32(_mm_srli_epi32(layer1_chunk0, 16), _mm_srli_epi32(layer1_chunk1, 16));
|
||||
v_r1 = _mm_packus_epi32(_mm_and_si128(layer1_chunk2, v_mask), _mm_and_si128(layer1_chunk3, v_mask));
|
||||
v_b0 = _mm_packus_epi32(_mm_srli_epi32(layer1_chunk2, 16), _mm_srli_epi32(layer1_chunk3, 16));
|
||||
v_g0 = _mm_packus_epi32(_mm_and_si128(layer1_chunk4, v_mask), _mm_and_si128(layer1_chunk5, v_mask));
|
||||
v_b1 = _mm_packus_epi32(_mm_srli_epi32(layer1_chunk4, 16), _mm_srli_epi32(layer1_chunk5, 16));
|
||||
}
|
||||
|
||||
inline void _mm_interleave_epi16(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0, __m128i & v_g1,
|
||||
__m128i & v_b0, __m128i & v_b1, __m128i & v_a0, __m128i & v_a1)
|
||||
{
|
||||
__m128i v_mask = _mm_set1_epi32(0x0000ffff);
|
||||
|
||||
__m128i layer3_chunk0 = _mm_packus_epi32(_mm_and_si128(v_r0, v_mask), _mm_and_si128(v_r1, v_mask));
|
||||
__m128i layer3_chunk4 = _mm_packus_epi32(_mm_srli_epi32(v_r0, 16), _mm_srli_epi32(v_r1, 16));
|
||||
__m128i layer3_chunk1 = _mm_packus_epi32(_mm_and_si128(v_g0, v_mask), _mm_and_si128(v_g1, v_mask));
|
||||
__m128i layer3_chunk5 = _mm_packus_epi32(_mm_srli_epi32(v_g0, 16), _mm_srli_epi32(v_g1, 16));
|
||||
__m128i layer3_chunk2 = _mm_packus_epi32(_mm_and_si128(v_b0, v_mask), _mm_and_si128(v_b1, v_mask));
|
||||
__m128i layer3_chunk6 = _mm_packus_epi32(_mm_srli_epi32(v_b0, 16), _mm_srli_epi32(v_b1, 16));
|
||||
__m128i layer3_chunk3 = _mm_packus_epi32(_mm_and_si128(v_a0, v_mask), _mm_and_si128(v_a1, v_mask));
|
||||
__m128i layer3_chunk7 = _mm_packus_epi32(_mm_srli_epi32(v_a0, 16), _mm_srli_epi32(v_a1, 16));
|
||||
|
||||
__m128i layer2_chunk0 = _mm_packus_epi32(_mm_and_si128(layer3_chunk0, v_mask), _mm_and_si128(layer3_chunk1, v_mask));
|
||||
__m128i layer2_chunk4 = _mm_packus_epi32(_mm_srli_epi32(layer3_chunk0, 16), _mm_srli_epi32(layer3_chunk1, 16));
|
||||
__m128i layer2_chunk1 = _mm_packus_epi32(_mm_and_si128(layer3_chunk2, v_mask), _mm_and_si128(layer3_chunk3, v_mask));
|
||||
__m128i layer2_chunk5 = _mm_packus_epi32(_mm_srli_epi32(layer3_chunk2, 16), _mm_srli_epi32(layer3_chunk3, 16));
|
||||
__m128i layer2_chunk2 = _mm_packus_epi32(_mm_and_si128(layer3_chunk4, v_mask), _mm_and_si128(layer3_chunk5, v_mask));
|
||||
__m128i layer2_chunk6 = _mm_packus_epi32(_mm_srli_epi32(layer3_chunk4, 16), _mm_srli_epi32(layer3_chunk5, 16));
|
||||
__m128i layer2_chunk3 = _mm_packus_epi32(_mm_and_si128(layer3_chunk6, v_mask), _mm_and_si128(layer3_chunk7, v_mask));
|
||||
__m128i layer2_chunk7 = _mm_packus_epi32(_mm_srli_epi32(layer3_chunk6, 16), _mm_srli_epi32(layer3_chunk7, 16));
|
||||
|
||||
__m128i layer1_chunk0 = _mm_packus_epi32(_mm_and_si128(layer2_chunk0, v_mask), _mm_and_si128(layer2_chunk1, v_mask));
|
||||
__m128i layer1_chunk4 = _mm_packus_epi32(_mm_srli_epi32(layer2_chunk0, 16), _mm_srli_epi32(layer2_chunk1, 16));
|
||||
__m128i layer1_chunk1 = _mm_packus_epi32(_mm_and_si128(layer2_chunk2, v_mask), _mm_and_si128(layer2_chunk3, v_mask));
|
||||
__m128i layer1_chunk5 = _mm_packus_epi32(_mm_srli_epi32(layer2_chunk2, 16), _mm_srli_epi32(layer2_chunk3, 16));
|
||||
__m128i layer1_chunk2 = _mm_packus_epi32(_mm_and_si128(layer2_chunk4, v_mask), _mm_and_si128(layer2_chunk5, v_mask));
|
||||
__m128i layer1_chunk6 = _mm_packus_epi32(_mm_srli_epi32(layer2_chunk4, 16), _mm_srli_epi32(layer2_chunk5, 16));
|
||||
__m128i layer1_chunk3 = _mm_packus_epi32(_mm_and_si128(layer2_chunk6, v_mask), _mm_and_si128(layer2_chunk7, v_mask));
|
||||
__m128i layer1_chunk7 = _mm_packus_epi32(_mm_srli_epi32(layer2_chunk6, 16), _mm_srli_epi32(layer2_chunk7, 16));
|
||||
|
||||
v_r0 = _mm_packus_epi32(_mm_and_si128(layer1_chunk0, v_mask), _mm_and_si128(layer1_chunk1, v_mask));
|
||||
v_b0 = _mm_packus_epi32(_mm_srli_epi32(layer1_chunk0, 16), _mm_srli_epi32(layer1_chunk1, 16));
|
||||
v_r1 = _mm_packus_epi32(_mm_and_si128(layer1_chunk2, v_mask), _mm_and_si128(layer1_chunk3, v_mask));
|
||||
v_b1 = _mm_packus_epi32(_mm_srli_epi32(layer1_chunk2, 16), _mm_srli_epi32(layer1_chunk3, 16));
|
||||
v_g0 = _mm_packus_epi32(_mm_and_si128(layer1_chunk4, v_mask), _mm_and_si128(layer1_chunk5, v_mask));
|
||||
v_a0 = _mm_packus_epi32(_mm_srli_epi32(layer1_chunk4, 16), _mm_srli_epi32(layer1_chunk5, 16));
|
||||
v_g1 = _mm_packus_epi32(_mm_and_si128(layer1_chunk6, v_mask), _mm_and_si128(layer1_chunk7, v_mask));
|
||||
v_a1 = _mm_packus_epi32(_mm_srli_epi32(layer1_chunk6, 16), _mm_srli_epi32(layer1_chunk7, 16));
|
||||
}
|
||||
|
||||
#endif // CV_SSE4_1
|
||||
|
||||
inline void _mm_deinterleave_ps(__m128 & v_r0, __m128 & v_r1, __m128 & v_g0, __m128 & v_g1)
|
||||
{
|
||||
__m128 layer1_chunk0 = _mm_unpacklo_ps(v_r0, v_g0);
|
||||
__m128 layer1_chunk1 = _mm_unpackhi_ps(v_r0, v_g0);
|
||||
__m128 layer1_chunk2 = _mm_unpacklo_ps(v_r1, v_g1);
|
||||
__m128 layer1_chunk3 = _mm_unpackhi_ps(v_r1, v_g1);
|
||||
|
||||
__m128 layer2_chunk0 = _mm_unpacklo_ps(layer1_chunk0, layer1_chunk2);
|
||||
__m128 layer2_chunk1 = _mm_unpackhi_ps(layer1_chunk0, layer1_chunk2);
|
||||
__m128 layer2_chunk2 = _mm_unpacklo_ps(layer1_chunk1, layer1_chunk3);
|
||||
__m128 layer2_chunk3 = _mm_unpackhi_ps(layer1_chunk1, layer1_chunk3);
|
||||
|
||||
v_r0 = _mm_unpacklo_ps(layer2_chunk0, layer2_chunk2);
|
||||
v_r1 = _mm_unpackhi_ps(layer2_chunk0, layer2_chunk2);
|
||||
v_g0 = _mm_unpacklo_ps(layer2_chunk1, layer2_chunk3);
|
||||
v_g1 = _mm_unpackhi_ps(layer2_chunk1, layer2_chunk3);
|
||||
}
|
||||
|
||||
inline void _mm_deinterleave_ps(__m128 & v_r0, __m128 & v_r1, __m128 & v_g0,
|
||||
__m128 & v_g1, __m128 & v_b0, __m128 & v_b1)
|
||||
{
|
||||
__m128 layer1_chunk0 = _mm_unpacklo_ps(v_r0, v_g1);
|
||||
__m128 layer1_chunk1 = _mm_unpackhi_ps(v_r0, v_g1);
|
||||
__m128 layer1_chunk2 = _mm_unpacklo_ps(v_r1, v_b0);
|
||||
__m128 layer1_chunk3 = _mm_unpackhi_ps(v_r1, v_b0);
|
||||
__m128 layer1_chunk4 = _mm_unpacklo_ps(v_g0, v_b1);
|
||||
__m128 layer1_chunk5 = _mm_unpackhi_ps(v_g0, v_b1);
|
||||
|
||||
__m128 layer2_chunk0 = _mm_unpacklo_ps(layer1_chunk0, layer1_chunk3);
|
||||
__m128 layer2_chunk1 = _mm_unpackhi_ps(layer1_chunk0, layer1_chunk3);
|
||||
__m128 layer2_chunk2 = _mm_unpacklo_ps(layer1_chunk1, layer1_chunk4);
|
||||
__m128 layer2_chunk3 = _mm_unpackhi_ps(layer1_chunk1, layer1_chunk4);
|
||||
__m128 layer2_chunk4 = _mm_unpacklo_ps(layer1_chunk2, layer1_chunk5);
|
||||
__m128 layer2_chunk5 = _mm_unpackhi_ps(layer1_chunk2, layer1_chunk5);
|
||||
|
||||
v_r0 = _mm_unpacklo_ps(layer2_chunk0, layer2_chunk3);
|
||||
v_r1 = _mm_unpackhi_ps(layer2_chunk0, layer2_chunk3);
|
||||
v_g0 = _mm_unpacklo_ps(layer2_chunk1, layer2_chunk4);
|
||||
v_g1 = _mm_unpackhi_ps(layer2_chunk1, layer2_chunk4);
|
||||
v_b0 = _mm_unpacklo_ps(layer2_chunk2, layer2_chunk5);
|
||||
v_b1 = _mm_unpackhi_ps(layer2_chunk2, layer2_chunk5);
|
||||
}
|
||||
|
||||
inline void _mm_deinterleave_ps(__m128 & v_r0, __m128 & v_r1, __m128 & v_g0, __m128 & v_g1,
|
||||
__m128 & v_b0, __m128 & v_b1, __m128 & v_a0, __m128 & v_a1)
|
||||
{
|
||||
__m128 layer1_chunk0 = _mm_unpacklo_ps(v_r0, v_b0);
|
||||
__m128 layer1_chunk1 = _mm_unpackhi_ps(v_r0, v_b0);
|
||||
__m128 layer1_chunk2 = _mm_unpacklo_ps(v_r1, v_b1);
|
||||
__m128 layer1_chunk3 = _mm_unpackhi_ps(v_r1, v_b1);
|
||||
__m128 layer1_chunk4 = _mm_unpacklo_ps(v_g0, v_a0);
|
||||
__m128 layer1_chunk5 = _mm_unpackhi_ps(v_g0, v_a0);
|
||||
__m128 layer1_chunk6 = _mm_unpacklo_ps(v_g1, v_a1);
|
||||
__m128 layer1_chunk7 = _mm_unpackhi_ps(v_g1, v_a1);
|
||||
|
||||
__m128 layer2_chunk0 = _mm_unpacklo_ps(layer1_chunk0, layer1_chunk4);
|
||||
__m128 layer2_chunk1 = _mm_unpackhi_ps(layer1_chunk0, layer1_chunk4);
|
||||
__m128 layer2_chunk2 = _mm_unpacklo_ps(layer1_chunk1, layer1_chunk5);
|
||||
__m128 layer2_chunk3 = _mm_unpackhi_ps(layer1_chunk1, layer1_chunk5);
|
||||
__m128 layer2_chunk4 = _mm_unpacklo_ps(layer1_chunk2, layer1_chunk6);
|
||||
__m128 layer2_chunk5 = _mm_unpackhi_ps(layer1_chunk2, layer1_chunk6);
|
||||
__m128 layer2_chunk6 = _mm_unpacklo_ps(layer1_chunk3, layer1_chunk7);
|
||||
__m128 layer2_chunk7 = _mm_unpackhi_ps(layer1_chunk3, layer1_chunk7);
|
||||
|
||||
v_r0 = _mm_unpacklo_ps(layer2_chunk0, layer2_chunk4);
|
||||
v_r1 = _mm_unpackhi_ps(layer2_chunk0, layer2_chunk4);
|
||||
v_g0 = _mm_unpacklo_ps(layer2_chunk1, layer2_chunk5);
|
||||
v_g1 = _mm_unpackhi_ps(layer2_chunk1, layer2_chunk5);
|
||||
v_b0 = _mm_unpacklo_ps(layer2_chunk2, layer2_chunk6);
|
||||
v_b1 = _mm_unpackhi_ps(layer2_chunk2, layer2_chunk6);
|
||||
v_a0 = _mm_unpacklo_ps(layer2_chunk3, layer2_chunk7);
|
||||
v_a1 = _mm_unpackhi_ps(layer2_chunk3, layer2_chunk7);
|
||||
}
|
||||
|
||||
inline void _mm_interleave_ps(__m128 & v_r0, __m128 & v_r1, __m128 & v_g0, __m128 & v_g1)
|
||||
{
|
||||
const int mask_lo = _MM_SHUFFLE(2, 0, 2, 0), mask_hi = _MM_SHUFFLE(3, 1, 3, 1);
|
||||
|
||||
__m128 layer2_chunk0 = _mm_shuffle_ps(v_r0, v_r1, mask_lo);
|
||||
__m128 layer2_chunk2 = _mm_shuffle_ps(v_r0, v_r1, mask_hi);
|
||||
__m128 layer2_chunk1 = _mm_shuffle_ps(v_g0, v_g1, mask_lo);
|
||||
__m128 layer2_chunk3 = _mm_shuffle_ps(v_g0, v_g1, mask_hi);
|
||||
|
||||
__m128 layer1_chunk0 = _mm_shuffle_ps(layer2_chunk0, layer2_chunk1, mask_lo);
|
||||
__m128 layer1_chunk2 = _mm_shuffle_ps(layer2_chunk0, layer2_chunk1, mask_hi);
|
||||
__m128 layer1_chunk1 = _mm_shuffle_ps(layer2_chunk2, layer2_chunk3, mask_lo);
|
||||
__m128 layer1_chunk3 = _mm_shuffle_ps(layer2_chunk2, layer2_chunk3, mask_hi);
|
||||
|
||||
v_r0 = _mm_shuffle_ps(layer1_chunk0, layer1_chunk1, mask_lo);
|
||||
v_g0 = _mm_shuffle_ps(layer1_chunk0, layer1_chunk1, mask_hi);
|
||||
v_r1 = _mm_shuffle_ps(layer1_chunk2, layer1_chunk3, mask_lo);
|
||||
v_g1 = _mm_shuffle_ps(layer1_chunk2, layer1_chunk3, mask_hi);
|
||||
}
|
||||
|
||||
inline void _mm_interleave_ps(__m128 & v_r0, __m128 & v_r1, __m128 & v_g0,
|
||||
__m128 & v_g1, __m128 & v_b0, __m128 & v_b1)
|
||||
{
|
||||
const int mask_lo = _MM_SHUFFLE(2, 0, 2, 0), mask_hi = _MM_SHUFFLE(3, 1, 3, 1);
|
||||
|
||||
__m128 layer2_chunk0 = _mm_shuffle_ps(v_r0, v_r1, mask_lo);
|
||||
__m128 layer2_chunk3 = _mm_shuffle_ps(v_r0, v_r1, mask_hi);
|
||||
__m128 layer2_chunk1 = _mm_shuffle_ps(v_g0, v_g1, mask_lo);
|
||||
__m128 layer2_chunk4 = _mm_shuffle_ps(v_g0, v_g1, mask_hi);
|
||||
__m128 layer2_chunk2 = _mm_shuffle_ps(v_b0, v_b1, mask_lo);
|
||||
__m128 layer2_chunk5 = _mm_shuffle_ps(v_b0, v_b1, mask_hi);
|
||||
|
||||
__m128 layer1_chunk0 = _mm_shuffle_ps(layer2_chunk0, layer2_chunk1, mask_lo);
|
||||
__m128 layer1_chunk3 = _mm_shuffle_ps(layer2_chunk0, layer2_chunk1, mask_hi);
|
||||
__m128 layer1_chunk1 = _mm_shuffle_ps(layer2_chunk2, layer2_chunk3, mask_lo);
|
||||
__m128 layer1_chunk4 = _mm_shuffle_ps(layer2_chunk2, layer2_chunk3, mask_hi);
|
||||
__m128 layer1_chunk2 = _mm_shuffle_ps(layer2_chunk4, layer2_chunk5, mask_lo);
|
||||
__m128 layer1_chunk5 = _mm_shuffle_ps(layer2_chunk4, layer2_chunk5, mask_hi);
|
||||
|
||||
v_r0 = _mm_shuffle_ps(layer1_chunk0, layer1_chunk1, mask_lo);
|
||||
v_g1 = _mm_shuffle_ps(layer1_chunk0, layer1_chunk1, mask_hi);
|
||||
v_r1 = _mm_shuffle_ps(layer1_chunk2, layer1_chunk3, mask_lo);
|
||||
v_b0 = _mm_shuffle_ps(layer1_chunk2, layer1_chunk3, mask_hi);
|
||||
v_g0 = _mm_shuffle_ps(layer1_chunk4, layer1_chunk5, mask_lo);
|
||||
v_b1 = _mm_shuffle_ps(layer1_chunk4, layer1_chunk5, mask_hi);
|
||||
}
|
||||
|
||||
inline void _mm_interleave_ps(__m128 & v_r0, __m128 & v_r1, __m128 & v_g0, __m128 & v_g1,
|
||||
__m128 & v_b0, __m128 & v_b1, __m128 & v_a0, __m128 & v_a1)
|
||||
{
|
||||
const int mask_lo = _MM_SHUFFLE(2, 0, 2, 0), mask_hi = _MM_SHUFFLE(3, 1, 3, 1);
|
||||
|
||||
__m128 layer2_chunk0 = _mm_shuffle_ps(v_r0, v_r1, mask_lo);
|
||||
__m128 layer2_chunk4 = _mm_shuffle_ps(v_r0, v_r1, mask_hi);
|
||||
__m128 layer2_chunk1 = _mm_shuffle_ps(v_g0, v_g1, mask_lo);
|
||||
__m128 layer2_chunk5 = _mm_shuffle_ps(v_g0, v_g1, mask_hi);
|
||||
__m128 layer2_chunk2 = _mm_shuffle_ps(v_b0, v_b1, mask_lo);
|
||||
__m128 layer2_chunk6 = _mm_shuffle_ps(v_b0, v_b1, mask_hi);
|
||||
__m128 layer2_chunk3 = _mm_shuffle_ps(v_a0, v_a1, mask_lo);
|
||||
__m128 layer2_chunk7 = _mm_shuffle_ps(v_a0, v_a1, mask_hi);
|
||||
|
||||
__m128 layer1_chunk0 = _mm_shuffle_ps(layer2_chunk0, layer2_chunk1, mask_lo);
|
||||
__m128 layer1_chunk4 = _mm_shuffle_ps(layer2_chunk0, layer2_chunk1, mask_hi);
|
||||
__m128 layer1_chunk1 = _mm_shuffle_ps(layer2_chunk2, layer2_chunk3, mask_lo);
|
||||
__m128 layer1_chunk5 = _mm_shuffle_ps(layer2_chunk2, layer2_chunk3, mask_hi);
|
||||
__m128 layer1_chunk2 = _mm_shuffle_ps(layer2_chunk4, layer2_chunk5, mask_lo);
|
||||
__m128 layer1_chunk6 = _mm_shuffle_ps(layer2_chunk4, layer2_chunk5, mask_hi);
|
||||
__m128 layer1_chunk3 = _mm_shuffle_ps(layer2_chunk6, layer2_chunk7, mask_lo);
|
||||
__m128 layer1_chunk7 = _mm_shuffle_ps(layer2_chunk6, layer2_chunk7, mask_hi);
|
||||
|
||||
v_r0 = _mm_shuffle_ps(layer1_chunk0, layer1_chunk1, mask_lo);
|
||||
v_b0 = _mm_shuffle_ps(layer1_chunk0, layer1_chunk1, mask_hi);
|
||||
v_r1 = _mm_shuffle_ps(layer1_chunk2, layer1_chunk3, mask_lo);
|
||||
v_b1 = _mm_shuffle_ps(layer1_chunk2, layer1_chunk3, mask_hi);
|
||||
v_g0 = _mm_shuffle_ps(layer1_chunk4, layer1_chunk5, mask_lo);
|
||||
v_a0 = _mm_shuffle_ps(layer1_chunk4, layer1_chunk5, mask_hi);
|
||||
v_g1 = _mm_shuffle_ps(layer1_chunk6, layer1_chunk7, mask_lo);
|
||||
v_a1 = _mm_shuffle_ps(layer1_chunk6, layer1_chunk7, mask_hi);
|
||||
}
|
||||
|
||||
#endif // CV_SSE2

//! @}

#endif //__OPENCV_CORE_SSE_UTILS_HPP__
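A short calling sketch for the float variants (illustrative only, not part of this patch): _mm_interleave_ps shuffles two planar channels into interleaved order in place, and the registers are then stored back in argument order. The buffer names and unaligned load/store calls are assumptions for the example.

// Illustrative sketch, assuming SSE2 and 8 floats per input plane.
inline void example_interleave_xy8(const float* xs, const float* ys, float* xy)
{
    __m128 x0 = _mm_loadu_ps(xs), x1 = _mm_loadu_ps(xs + 4);
    __m128 y0 = _mm_loadu_ps(ys), y1 = _mm_loadu_ps(ys + 4);
    _mm_interleave_ps(x0, x1, y0, y1);   // registers now hold x0,y0,x1,y1,...
    _mm_storeu_ps(xy,      x0);          // store in argument order
    _mm_storeu_ps(xy + 4,  x1);
    _mm_storeu_ps(xy + 8,  y0);
    _mm_storeu_ps(xy + 12, y1);
}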
File diff suppressed because it is too large
607  modules/core/src/arithm_core.hpp  Normal file
@@ -0,0 +1,607 @@
|
||||
/*M///////////////////////////////////////////////////////////////////////////////////////
|
||||
//
|
||||
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
|
||||
//
|
||||
// By downloading, copying, installing or using the software you agree to this license.
|
||||
// If you do not agree to this license, do not download, install,
|
||||
// copy or use the software.
|
||||
//
|
||||
//
|
||||
// License Agreement
|
||||
// For Open Source Computer Vision Library
|
||||
//
|
||||
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
|
||||
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
|
||||
// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
|
||||
// Copyright (C) 2015, Itseez Inc., all rights reserved.
|
||||
// Third party copyrights are property of their respective owners.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without modification,
|
||||
// are permitted provided that the following conditions are met:
|
||||
//
|
||||
// * Redistribution's of source code must retain the above copyright notice,
|
||||
// this list of conditions and the following disclaimer.
|
||||
//
|
||||
// * Redistribution's in binary form must reproduce the above copyright notice,
|
||||
// this list of conditions and the following disclaimer in the documentation
|
||||
// and/or other materials provided with the distribution.
|
||||
//
|
||||
// * The name of the copyright holders may not be used to endorse or promote products
|
||||
// derived from this software without specific prior written permission.
|
||||
//
|
||||
// This software is provided by the copyright holders and contributors "as is" and
|
||||
// any express or implied warranties, including, but not limited to, the implied
|
||||
// warranties of merchantability and fitness for a particular purpose are disclaimed.
|
||||
// In no event shall the Intel Corporation or contributors be liable for any direct,
|
||||
// indirect, incidental, special, exemplary, or consequential damages
|
||||
// (including, but not limited to, procurement of substitute goods or services;
|
||||
// loss of use, data, or profits; or business interruption) however caused
|
||||
// and on any theory of liability, whether in contract, strict liability,
|
||||
// or tort (including negligence or otherwise) arising in any way out of
|
||||
// the use of this software, even if advised of the possibility of such damage.
|
||||
//
|
||||
//M*/
|
||||
|
||||
#ifndef __OPENCV_ARITHM_CORE_HPP__
#define __OPENCV_ARITHM_CORE_HPP__

#include "arithm_simd.hpp"

namespace cv {

template<typename T1, typename T2=T1, typename T3=T1> struct OpAdd
{
    typedef T1 type1;
    typedef T2 type2;
    typedef T3 rtype;
    T3 operator ()(const T1 a, const T2 b) const { return saturate_cast<T3>(a + b); }
};

template<typename T1, typename T2=T1, typename T3=T1> struct OpSub
{
    typedef T1 type1;
    typedef T2 type2;
    typedef T3 rtype;
    T3 operator ()(const T1 a, const T2 b) const { return saturate_cast<T3>(a - b); }
};

template<typename T1, typename T2=T1, typename T3=T1> struct OpRSub
{
    typedef T1 type1;
    typedef T2 type2;
    typedef T3 rtype;
    T3 operator ()(const T1 a, const T2 b) const { return saturate_cast<T3>(b - a); }
};

template<typename T> struct OpMin
{
    typedef T type1;
    typedef T type2;
    typedef T rtype;
    T operator ()(const T a, const T b) const { return std::min(a, b); }
};

template<typename T> struct OpMax
{
    typedef T type1;
    typedef T type2;
    typedef T rtype;
    T operator ()(const T a, const T b) const { return std::max(a, b); }
};

template<typename T> struct OpAbsDiff
{
    typedef T type1;
    typedef T type2;
    typedef T rtype;
    T operator()(T a, T b) const { return a > b ? a - b : b - a; }
};

template<typename T> struct OpAnd
{
    typedef T type1;
    typedef T type2;
    typedef T rtype;
    T operator()( T a, T b ) const { return a & b; }
};

template<typename T> struct OpOr
{
    typedef T type1;
    typedef T type2;
    typedef T rtype;
    T operator()( T a, T b ) const { return a | b; }
};

template<typename T> struct OpXor
{
    typedef T type1;
    typedef T type2;
    typedef T rtype;
    T operator()( T a, T b ) const { return a ^ b; }
};

template<typename T> struct OpNot
{
    typedef T type1;
    typedef T type2;
    typedef T rtype;
    T operator()( T a, T ) const { return ~a; }
};
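These functors only define the per-element operation; the loops below supply the traversal. A tiny scalar sketch (illustrative only, not part of the patch) of the saturating behaviour:

// Illustrative only: OpAdd<uchar> goes through saturate_cast, so 8-bit sums
// clamp to [0, 255] instead of wrapping.
inline void example_add_saturate(const uchar* a, const uchar* b, uchar* out, int n)
{
    cv::OpAdd<uchar> op;
    for( int i = 0; i < n; i++ )
        out[i] = op(a[i], b[i]);   // e.g. op(200, 100) yields 255, not 44
}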
//=============================================================================
|
||||
|
||||
template<typename T, class Op, class VOp>
|
||||
void vBinOp(const T* src1, size_t step1, const T* src2, size_t step2, T* dst, size_t step, int width, int height)
|
||||
{
|
||||
#if CV_SSE2 || CV_NEON
|
||||
VOp vop;
|
||||
#endif
|
||||
Op op;
|
||||
|
||||
for( ; height--; src1 = (const T *)((const uchar *)src1 + step1),
|
||||
src2 = (const T *)((const uchar *)src2 + step2),
|
||||
dst = (T *)((uchar *)dst + step) )
|
||||
{
|
||||
int x = 0;
|
||||
|
||||
#if CV_NEON || CV_SSE2
|
||||
#if CV_AVX2
|
||||
if( USE_AVX2 )
|
||||
{
|
||||
for( ; x <= width - 32/(int)sizeof(T); x += 32/sizeof(T) )
|
||||
{
|
||||
typename VLoadStore256<T>::reg_type r0 = VLoadStore256<T>::load(src1 + x);
|
||||
r0 = vop(r0, VLoadStore256<T>::load(src2 + x));
|
||||
VLoadStore256<T>::store(dst + x, r0);
|
||||
}
|
||||
}
|
||||
#else
|
||||
#if CV_SSE2
|
||||
if( USE_SSE2 )
|
||||
{
|
||||
#endif // CV_SSE2
|
||||
for( ; x <= width - 32/(int)sizeof(T); x += 32/sizeof(T) )
|
||||
{
|
||||
typename VLoadStore128<T>::reg_type r0 = VLoadStore128<T>::load(src1 + x );
|
||||
typename VLoadStore128<T>::reg_type r1 = VLoadStore128<T>::load(src1 + x + 16/sizeof(T));
|
||||
r0 = vop(r0, VLoadStore128<T>::load(src2 + x ));
|
||||
r1 = vop(r1, VLoadStore128<T>::load(src2 + x + 16/sizeof(T)));
|
||||
VLoadStore128<T>::store(dst + x , r0);
|
||||
VLoadStore128<T>::store(dst + x + 16/sizeof(T), r1);
|
||||
}
|
||||
#if CV_SSE2
|
||||
}
|
||||
#endif // CV_SSE2
|
||||
#endif // CV_AVX2
|
||||
#endif // CV_NEON || CV_SSE2
|
||||
|
||||
#if CV_AVX2
|
||||
// nothing
|
||||
#elif CV_SSE2
|
||||
if( USE_SSE2 )
|
||||
{
|
||||
for( ; x <= width - 8/(int)sizeof(T); x += 8/sizeof(T) )
|
||||
{
|
||||
typename VLoadStore64<T>::reg_type r = VLoadStore64<T>::load(src1 + x);
|
||||
r = vop(r, VLoadStore64<T>::load(src2 + x));
|
||||
VLoadStore64<T>::store(dst + x, r);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
#if CV_ENABLE_UNROLLED
|
||||
for( ; x <= width - 4; x += 4 )
|
||||
{
|
||||
T v0 = op(src1[x], src2[x]);
|
||||
T v1 = op(src1[x+1], src2[x+1]);
|
||||
dst[x] = v0; dst[x+1] = v1;
|
||||
v0 = op(src1[x+2], src2[x+2]);
|
||||
v1 = op(src1[x+3], src2[x+3]);
|
||||
dst[x+2] = v0; dst[x+3] = v1;
|
||||
}
|
||||
#endif
|
||||
|
||||
for( ; x < width; x++ )
|
||||
dst[x] = op(src1[x], src2[x]);
|
||||
}
|
||||
}
|
||||
|
||||
template<typename T, class Op, class Op32>
|
||||
void vBinOp32(const T* src1, size_t step1, const T* src2, size_t step2,
|
||||
T* dst, size_t step, int width, int height)
|
||||
{
|
||||
#if CV_SSE2 || CV_NEON
|
||||
Op32 op32;
|
||||
#endif
|
||||
Op op;
|
||||
|
||||
for( ; height--; src1 = (const T *)((const uchar *)src1 + step1),
|
||||
src2 = (const T *)((const uchar *)src2 + step2),
|
||||
dst = (T *)((uchar *)dst + step) )
|
||||
{
|
||||
int x = 0;
|
||||
|
||||
#if CV_AVX2
|
||||
if( USE_AVX2 )
|
||||
{
|
||||
if( (((size_t)src1|(size_t)src2|(size_t)dst)&31) == 0 )
|
||||
{
|
||||
for( ; x <= width - 8; x += 8 )
|
||||
{
|
||||
typename VLoadStore256Aligned<T>::reg_type r0 = VLoadStore256Aligned<T>::load(src1 + x);
|
||||
r0 = op32(r0, VLoadStore256Aligned<T>::load(src2 + x));
|
||||
VLoadStore256Aligned<T>::store(dst + x, r0);
|
||||
}
|
||||
}
|
||||
}
|
||||
#elif CV_SSE2
|
||||
if( USE_SSE2 )
|
||||
{
|
||||
if( (((size_t)src1|(size_t)src2|(size_t)dst)&15) == 0 )
|
||||
{
|
||||
for( ; x <= width - 8; x += 8 )
|
||||
{
|
||||
typename VLoadStore128Aligned<T>::reg_type r0 = VLoadStore128Aligned<T>::load(src1 + x );
|
||||
typename VLoadStore128Aligned<T>::reg_type r1 = VLoadStore128Aligned<T>::load(src1 + x + 4);
|
||||
r0 = op32(r0, VLoadStore128Aligned<T>::load(src2 + x ));
|
||||
r1 = op32(r1, VLoadStore128Aligned<T>::load(src2 + x + 4));
|
||||
VLoadStore128Aligned<T>::store(dst + x , r0);
|
||||
VLoadStore128Aligned<T>::store(dst + x + 4, r1);
|
||||
}
|
||||
}
|
||||
}
|
||||
#endif // CV_AVX2
|
||||
|
||||
#if CV_NEON || CV_SSE2
|
||||
#if CV_AVX2
|
||||
if( USE_AVX2 )
|
||||
{
|
||||
for( ; x <= width - 8; x += 8 )
|
||||
{
|
||||
typename VLoadStore256<T>::reg_type r0 = VLoadStore256<T>::load(src1 + x);
|
||||
r0 = op32(r0, VLoadStore256<T>::load(src2 + x));
|
||||
VLoadStore256<T>::store(dst + x, r0);
|
||||
}
|
||||
}
|
||||
#else
|
||||
#if CV_SSE2
|
||||
if( USE_SSE2 )
|
||||
{
|
||||
#endif // CV_SSE2
|
||||
for( ; x <= width - 8; x += 8 )
|
||||
{
|
||||
typename VLoadStore128<T>::reg_type r0 = VLoadStore128<T>::load(src1 + x );
|
||||
typename VLoadStore128<T>::reg_type r1 = VLoadStore128<T>::load(src1 + x + 4);
|
||||
r0 = op32(r0, VLoadStore128<T>::load(src2 + x ));
|
||||
r1 = op32(r1, VLoadStore128<T>::load(src2 + x + 4));
|
||||
VLoadStore128<T>::store(dst + x , r0);
|
||||
VLoadStore128<T>::store(dst + x + 4, r1);
|
||||
}
|
||||
#if CV_SSE2
|
||||
}
|
||||
#endif // CV_SSE2
|
||||
#endif // CV_AVX2
|
||||
#endif // CV_NEON || CV_SSE2
|
||||
|
||||
#if CV_ENABLE_UNROLLED
|
||||
for( ; x <= width - 4; x += 4 )
|
||||
{
|
||||
T v0 = op(src1[x], src2[x]);
|
||||
T v1 = op(src1[x+1], src2[x+1]);
|
||||
dst[x] = v0; dst[x+1] = v1;
|
||||
v0 = op(src1[x+2], src2[x+2]);
|
||||
v1 = op(src1[x+3], src2[x+3]);
|
||||
dst[x+2] = v0; dst[x+3] = v1;
|
||||
}
|
||||
#endif
|
||||
|
||||
for( ; x < width; x++ )
|
||||
dst[x] = op(src1[x], src2[x]);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
template<typename T, class Op, class Op64>
|
||||
void vBinOp64(const T* src1, size_t step1, const T* src2, size_t step2,
|
||||
T* dst, size_t step, int width, int height)
|
||||
{
|
||||
#if CV_SSE2
|
||||
Op64 op64;
|
||||
#endif
|
||||
Op op;
|
||||
|
||||
for( ; height--; src1 = (const T *)((const uchar *)src1 + step1),
|
||||
src2 = (const T *)((const uchar *)src2 + step2),
|
||||
dst = (T *)((uchar *)dst + step) )
|
||||
{
|
||||
int x = 0;
|
||||
|
||||
#if CV_AVX2
|
||||
if( USE_AVX2 )
|
||||
{
|
||||
if( (((size_t)src1|(size_t)src2|(size_t)dst)&31) == 0 )
|
||||
{
|
||||
for( ; x <= width - 4; x += 4 )
|
||||
{
|
||||
typename VLoadStore256Aligned<T>::reg_type r0 = VLoadStore256Aligned<T>::load(src1 + x);
|
||||
r0 = op64(r0, VLoadStore256Aligned<T>::load(src2 + x));
|
||||
VLoadStore256Aligned<T>::store(dst + x, r0);
|
||||
}
|
||||
}
|
||||
}
|
||||
#elif CV_SSE2
|
||||
if( USE_SSE2 )
|
||||
{
|
||||
if( (((size_t)src1|(size_t)src2|(size_t)dst)&15) == 0 )
|
||||
{
|
||||
for( ; x <= width - 4; x += 4 )
|
||||
{
|
||||
typename VLoadStore128Aligned<T>::reg_type r0 = VLoadStore128Aligned<T>::load(src1 + x );
|
||||
typename VLoadStore128Aligned<T>::reg_type r1 = VLoadStore128Aligned<T>::load(src1 + x + 2);
|
||||
r0 = op64(r0, VLoadStore128Aligned<T>::load(src2 + x ));
|
||||
r1 = op64(r1, VLoadStore128Aligned<T>::load(src2 + x + 2));
|
||||
VLoadStore128Aligned<T>::store(dst + x , r0);
|
||||
VLoadStore128Aligned<T>::store(dst + x + 2, r1);
|
||||
}
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
for( ; x <= width - 4; x += 4 )
|
||||
{
|
||||
T v0 = op(src1[x], src2[x]);
|
||||
T v1 = op(src1[x+1], src2[x+1]);
|
||||
dst[x] = v0; dst[x+1] = v1;
|
||||
v0 = op(src1[x+2], src2[x+2]);
|
||||
v1 = op(src1[x+3], src2[x+3]);
|
||||
dst[x+2] = v0; dst[x+3] = v1;
|
||||
}
|
||||
|
||||
for( ; x < width; x++ )
|
||||
dst[x] = op(src1[x], src2[x]);
|
||||
}
|
||||
}
|
||||
|
||||
template<typename T> static void
cmp_(const T* src1, size_t step1, const T* src2, size_t step2,
     uchar* dst, size_t step, int width, int height, int code)
{
    step1 /= sizeof(src1[0]);
    step2 /= sizeof(src2[0]);
    if( code == CMP_GE || code == CMP_LT )
    {
        std::swap(src1, src2);
        std::swap(step1, step2);
        code = code == CMP_GE ? CMP_LE : CMP_GT;
    }

    Cmp_SIMD<T> vop(code);

    if( code == CMP_GT || code == CMP_LE )
    {
        int m = code == CMP_GT ? 0 : 255;
        for( ; height--; src1 += step1, src2 += step2, dst += step )
        {
            int x = vop(src1, src2, dst, width);
#if CV_ENABLE_UNROLLED
            for( ; x <= width - 4; x += 4 )
            {
                int t0, t1;
                t0 = -(src1[x] > src2[x]) ^ m;
                t1 = -(src1[x+1] > src2[x+1]) ^ m;
                dst[x] = (uchar)t0; dst[x+1] = (uchar)t1;
                t0 = -(src1[x+2] > src2[x+2]) ^ m;
                t1 = -(src1[x+3] > src2[x+3]) ^ m;
                dst[x+2] = (uchar)t0; dst[x+3] = (uchar)t1;
            }
#endif
            for( ; x < width; x++ )
                dst[x] = (uchar)(-(src1[x] > src2[x]) ^ m);
        }
    }
    else if( code == CMP_EQ || code == CMP_NE )
    {
        int m = code == CMP_EQ ? 0 : 255;
        for( ; height--; src1 += step1, src2 += step2, dst += step )
        {
            int x = 0;
#if CV_ENABLE_UNROLLED
            for( ; x <= width - 4; x += 4 )
            {
                int t0, t1;
                t0 = -(src1[x] == src2[x]) ^ m;
                t1 = -(src1[x+1] == src2[x+1]) ^ m;
                dst[x] = (uchar)t0; dst[x+1] = (uchar)t1;
                t0 = -(src1[x+2] == src2[x+2]) ^ m;
                t1 = -(src1[x+3] == src2[x+3]) ^ m;
                dst[x+2] = (uchar)t0; dst[x+3] = (uchar)t1;
            }
#endif
            for( ; x < width; x++ )
                dst[x] = (uchar)(-(src1[x] == src2[x]) ^ m);
        }
    }
}
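A brief calling sketch (illustrative only, not part of the patch): cmp_ writes a 0/255 mask per element, and the step arguments are byte strides between rows. The buffers and values below are assumptions for the example.

// Illustrative only: compare one row of 4 floats with CMP_GT.
float a[4]    = { 1.f, 5.f, 3.f,  0.f };
float b[4]    = { 2.f, 2.f, 3.f, -1.f };
uchar mask[4];
cv::cmp_<float>(a, sizeof(a), b, sizeof(b), mask, sizeof(mask), 4, 1, cv::CMP_GT);
// mask is now { 0, 255, 0, 255 }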
template<typename T, typename WT> static void
|
||||
mul_( const T* src1, size_t step1, const T* src2, size_t step2,
|
||||
T* dst, size_t step, int width, int height, WT scale )
|
||||
{
|
||||
step1 /= sizeof(src1[0]);
|
||||
step2 /= sizeof(src2[0]);
|
||||
step /= sizeof(dst[0]);
|
||||
|
||||
Mul_SIMD<T, WT> vop;
|
||||
|
||||
if( scale == (WT)1. )
|
||||
{
|
||||
for( ; height--; src1 += step1, src2 += step2, dst += step )
|
||||
{
|
||||
int i = vop(src1, src2, dst, width, scale);
|
||||
#if CV_ENABLE_UNROLLED
|
||||
for(; i <= width - 4; i += 4 )
|
||||
{
|
||||
T t0;
|
||||
T t1;
|
||||
t0 = saturate_cast<T>(src1[i ] * src2[i ]);
|
||||
t1 = saturate_cast<T>(src1[i+1] * src2[i+1]);
|
||||
dst[i ] = t0;
|
||||
dst[i+1] = t1;
|
||||
|
||||
t0 = saturate_cast<T>(src1[i+2] * src2[i+2]);
|
||||
t1 = saturate_cast<T>(src1[i+3] * src2[i+3]);
|
||||
dst[i+2] = t0;
|
||||
dst[i+3] = t1;
|
||||
}
|
||||
#endif
|
||||
for( ; i < width; i++ )
|
||||
dst[i] = saturate_cast<T>(src1[i] * src2[i]);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
for( ; height--; src1 += step1, src2 += step2, dst += step )
|
||||
{
|
||||
int i = vop(src1, src2, dst, width, scale);
|
||||
#if CV_ENABLE_UNROLLED
|
||||
for(; i <= width - 4; i += 4 )
|
||||
{
|
||||
T t0 = saturate_cast<T>(scale*(WT)src1[i]*src2[i]);
|
||||
T t1 = saturate_cast<T>(scale*(WT)src1[i+1]*src2[i+1]);
|
||||
dst[i] = t0; dst[i+1] = t1;
|
||||
|
||||
t0 = saturate_cast<T>(scale*(WT)src1[i+2]*src2[i+2]);
|
||||
t1 = saturate_cast<T>(scale*(WT)src1[i+3]*src2[i+3]);
|
||||
dst[i+2] = t0; dst[i+3] = t1;
|
||||
}
|
||||
#endif
|
||||
for( ; i < width; i++ )
|
||||
dst[i] = saturate_cast<T>(scale*(WT)src1[i]*src2[i]);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
template<typename T> static void
|
||||
div_i( const T* src1, size_t step1, const T* src2, size_t step2,
|
||||
T* dst, size_t step, int width, int height, double scale )
|
||||
{
|
||||
step1 /= sizeof(src1[0]);
|
||||
step2 /= sizeof(src2[0]);
|
||||
step /= sizeof(dst[0]);
|
||||
|
||||
Div_SIMD<T> vop;
|
||||
float scale_f = (float)scale;
|
||||
|
||||
for( ; height--; src1 += step1, src2 += step2, dst += step )
|
||||
{
|
||||
int i = vop(src1, src2, dst, width, scale);
|
||||
for( ; i < width; i++ )
|
||||
{
|
||||
T num = src1[i], denom = src2[i];
|
||||
dst[i] = denom != 0 ? saturate_cast<T>(num*scale_f/denom) : (T)0;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template<typename T> static void
|
||||
div_f( const T* src1, size_t step1, const T* src2, size_t step2,
|
||||
T* dst, size_t step, int width, int height, double scale )
|
||||
{
|
||||
T scale_f = (T)scale;
|
||||
step1 /= sizeof(src1[0]);
|
||||
step2 /= sizeof(src2[0]);
|
||||
step /= sizeof(dst[0]);
|
||||
|
||||
Div_SIMD<T> vop;
|
||||
|
||||
for( ; height--; src1 += step1, src2 += step2, dst += step )
|
||||
{
|
||||
int i = vop(src1, src2, dst, width, scale);
|
||||
for( ; i < width; i++ )
|
||||
{
|
||||
T num = src1[i], denom = src2[i];
|
||||
dst[i] = denom != 0 ? saturate_cast<T>(num*scale_f/denom) : (T)0;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template<typename T> static void
|
||||
recip_i( const T*, size_t, const T* src2, size_t step2,
|
||||
T* dst, size_t step, int width, int height, double scale )
|
||||
{
|
||||
step2 /= sizeof(src2[0]);
|
||||
step /= sizeof(dst[0]);
|
||||
|
||||
Recip_SIMD<T> vop;
|
||||
float scale_f = (float)scale;
|
||||
|
||||
for( ; height--; src2 += step2, dst += step )
|
||||
{
|
||||
int i = vop(src2, dst, width, scale);
|
||||
for( ; i < width; i++ )
|
||||
{
|
||||
T denom = src2[i];
|
||||
dst[i] = denom != 0 ? saturate_cast<T>(scale_f/denom) : (T)0;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template<typename T> static void
|
||||
recip_f( const T*, size_t, const T* src2, size_t step2,
|
||||
T* dst, size_t step, int width, int height, double scale )
|
||||
{
|
||||
T scale_f = (T)scale;
|
||||
step2 /= sizeof(src2[0]);
|
||||
step /= sizeof(dst[0]);
|
||||
|
||||
Recip_SIMD<T> vop;
|
||||
|
||||
for( ; height--; src2 += step2, dst += step )
|
||||
{
|
||||
int i = vop(src2, dst, width, scale);
|
||||
for( ; i < width; i++ )
|
||||
{
|
||||
T denom = src2[i];
|
||||
dst[i] = denom != 0 ? saturate_cast<T>(scale_f/denom) : (T)0;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template<typename T, typename WT> static void
addWeighted_( const T* src1, size_t step1, const T* src2, size_t step2,
              T* dst, size_t step, int width, int height, void* _scalars )
{
    const double* scalars = (const double*)_scalars;
    WT alpha = (WT)scalars[0], beta = (WT)scalars[1], gamma = (WT)scalars[2];
    step1 /= sizeof(src1[0]);
    step2 /= sizeof(src2[0]);
    step /= sizeof(dst[0]);

    AddWeighted_SIMD<T, WT> vop;

    for( ; height--; src1 += step1, src2 += step2, dst += step )
    {
        int x = vop(src1, src2, dst, width, alpha, beta, gamma);
#if CV_ENABLE_UNROLLED
        for( ; x <= width - 4; x += 4 )
        {
            T t0 = saturate_cast<T>(src1[x]*alpha + src2[x]*beta + gamma);
            T t1 = saturate_cast<T>(src1[x+1]*alpha + src2[x+1]*beta + gamma);
            dst[x] = t0; dst[x+1] = t1;

            t0 = saturate_cast<T>(src1[x+2]*alpha + src2[x+2]*beta + gamma);
            t1 = saturate_cast<T>(src1[x+3]*alpha + src2[x+3]*beta + gamma);
            dst[x+2] = t0; dst[x+3] = t1;
        }
#endif
        for( ; x < width; x++ )
            dst[x] = saturate_cast<T>(src1[x]*alpha + src2[x]*beta + gamma);
    }
}

} // cv::


#endif // __OPENCV_ARITHM_CORE_HPP__
|
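Each kernel above follows the same split: a *_SIMD functor (defined in arithm_simd.hpp) processes as much of the row as the current instruction set allows and returns the index at which vectorization stopped, and a plain scalar loop finishes the tail. A minimal sketch of that contract, using a hypothetical Scale_SIMD functor that is not part of this patch:

// Hypothetical fallback functor: a real specialization would process the bulk
// of the row with vector instructions and return how far it got.
#include "opencv2/core/saturate.hpp"

template<typename T> struct Scale_SIMD
{
    int operator()(const T*, T*, int, float) const
    {
        return 0;                       // nothing vectorized -> caller handles [0, width)
    }
};

template<typename T> static void scale_row(const T* src, T* dst, int width, float scale)
{
    Scale_SIMD<T> vop;
    int i = vop(src, dst, width, scale);   // vector part (may cover zero elements)
    for( ; i < width; i++ )                // scalar tail
        dst[i] = cv::saturate_cast<T>(src[i]*scale);
}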
2025	modules/core/src/arithm_simd.hpp	Normal file
File diff suppressed because it is too large
228	modules/core/src/hal_replacement.hpp	Normal file
@@ -0,0 +1,228 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                          License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
// Copyright (C) 2015, Itseez Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistribution's of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistribution's in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/

#ifndef __OPENCV_CORE_HAL_REPLACEMENT_HPP__
#define __OPENCV_CORE_HAL_REPLACEMENT_HPP__

#include "opencv2/core/hal/interface.h"

inline int hal_ni_add8u(const uchar*, size_t, const uchar*, size_t, uchar*, size_t, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
inline int hal_ni_add8s(const schar*, size_t, const schar*, size_t, schar*, size_t, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
inline int hal_ni_add16u(const ushort*, size_t, const ushort*, size_t, ushort*, size_t, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
inline int hal_ni_add16s(const short*, size_t, const short*, size_t, short*, size_t, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
inline int hal_ni_add32s(const int*, size_t, const int*, size_t, int*, size_t, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
inline int hal_ni_add32f(const float*, size_t, const float*, size_t, float*, size_t, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
inline int hal_ni_add64f(const double*, size_t, const double*, size_t, double*, size_t, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
inline int hal_ni_sub8u(const uchar*, size_t, const uchar*, size_t, uchar*, size_t, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
inline int hal_ni_sub8s(const schar*, size_t, const schar*, size_t, schar*, size_t, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
inline int hal_ni_sub16u(const ushort*, size_t, const ushort*, size_t, ushort*, size_t, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
inline int hal_ni_sub16s(const short*, size_t, const short*, size_t, short*, size_t, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
inline int hal_ni_sub32s(const int*, size_t, const int*, size_t, int*, size_t, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
inline int hal_ni_sub32f(const float*, size_t, const float*, size_t, float*, size_t, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
inline int hal_ni_sub64f(const double*, size_t, const double*, size_t, double*, size_t, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
inline int hal_ni_max8u(const uchar*, size_t, const uchar*, size_t, uchar*, size_t, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
inline int hal_ni_max8s(const schar*, size_t, const schar*, size_t, schar*, size_t, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
inline int hal_ni_max16u(const ushort*, size_t, const ushort*, size_t, ushort*, size_t, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
inline int hal_ni_max16s(const short*, size_t, const short*, size_t, short*, size_t, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
inline int hal_ni_max32s(const int*, size_t, const int*, size_t, int*, size_t, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
inline int hal_ni_max32f(const float*, size_t, const float*, size_t, float*, size_t, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
inline int hal_ni_max64f(const double*, size_t, const double*, size_t, double*, size_t, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
inline int hal_ni_min8u(const uchar*, size_t, const uchar*, size_t, uchar*, size_t, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
inline int hal_ni_min8s(const schar*, size_t, const schar*, size_t, schar*, size_t, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
inline int hal_ni_min16u(const ushort*, size_t, const ushort*, size_t, ushort*, size_t, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
inline int hal_ni_min16s(const short*, size_t, const short*, size_t, short*, size_t, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
inline int hal_ni_min32s(const int*, size_t, const int*, size_t, int*, size_t, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
inline int hal_ni_min32f(const float*, size_t, const float*, size_t, float*, size_t, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
inline int hal_ni_min64f(const double*, size_t, const double*, size_t, double*, size_t, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
inline int hal_ni_absdiff8u(const uchar*, size_t, const uchar*, size_t, uchar*, size_t, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
inline int hal_ni_absdiff8s(const schar*, size_t, const schar*, size_t, schar*, size_t, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
inline int hal_ni_absdiff16u(const ushort*, size_t, const ushort*, size_t, ushort*, size_t, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
inline int hal_ni_absdiff16s(const short*, size_t, const short*, size_t, short*, size_t, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
inline int hal_ni_absdiff32s(const int*, size_t, const int*, size_t, int*, size_t, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
inline int hal_ni_absdiff32f(const float*, size_t, const float*, size_t, float*, size_t, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
inline int hal_ni_absdiff64f(const double*, size_t, const double*, size_t, double*, size_t, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
inline int hal_ni_and8u(const uchar*, size_t, const uchar*, size_t, uchar*, size_t, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
inline int hal_ni_or8u(const uchar*, size_t, const uchar*, size_t, uchar*, size_t, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
inline int hal_ni_xor8u(const uchar*, size_t, const uchar*, size_t, uchar*, size_t, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
inline int hal_ni_not8u(const uchar*, size_t, uchar*, size_t, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }

#define cv_hal_add8u hal_ni_add8u
#define cv_hal_add8s hal_ni_add8s
#define cv_hal_add16u hal_ni_add16u
#define cv_hal_add16s hal_ni_add16s
#define cv_hal_add32s hal_ni_add32s
#define cv_hal_add32f hal_ni_add32f
#define cv_hal_add64f hal_ni_add64f
#define cv_hal_sub8u hal_ni_sub8u
#define cv_hal_sub8s hal_ni_sub8s
#define cv_hal_sub16u hal_ni_sub16u
#define cv_hal_sub16s hal_ni_sub16s
#define cv_hal_sub32s hal_ni_sub32s
#define cv_hal_sub32f hal_ni_sub32f
#define cv_hal_sub64f hal_ni_sub64f
#define cv_hal_max8u hal_ni_max8u
#define cv_hal_max8s hal_ni_max8s
#define cv_hal_max16u hal_ni_max16u
#define cv_hal_max16s hal_ni_max16s
#define cv_hal_max32s hal_ni_max32s
#define cv_hal_max32f hal_ni_max32f
#define cv_hal_max64f hal_ni_max64f
#define cv_hal_min8u hal_ni_min8u
#define cv_hal_min8s hal_ni_min8s
#define cv_hal_min16u hal_ni_min16u
#define cv_hal_min16s hal_ni_min16s
#define cv_hal_min32s hal_ni_min32s
#define cv_hal_min32f hal_ni_min32f
#define cv_hal_min64f hal_ni_min64f
#define cv_hal_absdiff8u hal_ni_absdiff8u
#define cv_hal_absdiff8s hal_ni_absdiff8s
#define cv_hal_absdiff16u hal_ni_absdiff16u
#define cv_hal_absdiff16s hal_ni_absdiff16s
#define cv_hal_absdiff32s hal_ni_absdiff32s
#define cv_hal_absdiff32f hal_ni_absdiff32f
#define cv_hal_absdiff64f hal_ni_absdiff64f
#define cv_hal_and8u hal_ni_and8u
#define cv_hal_or8u hal_ni_or8u
#define cv_hal_xor8u hal_ni_xor8u
#define cv_hal_not8u hal_ni_not8u

inline int hal_ni_cmp8u(const uchar*, size_t, const uchar*, size_t, uchar*, size_t, int, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
inline int hal_ni_cmp8s(const schar*, size_t, const schar*, size_t, uchar*, size_t, int, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
inline int hal_ni_cmp16u(const ushort*, size_t, const ushort*, size_t, uchar*, size_t, int, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
inline int hal_ni_cmp16s(const short*, size_t, const short*, size_t, uchar*, size_t, int, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
inline int hal_ni_cmp32s(const int*, size_t, const int*, size_t, uchar*, size_t, int, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
inline int hal_ni_cmp32f(const float*, size_t, const float*, size_t, uchar*, size_t, int, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
inline int hal_ni_cmp64f(const double*, size_t, const double*, size_t, uchar*, size_t, int, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }

#define cv_hal_cmp8u hal_ni_cmp8u
#define cv_hal_cmp8s hal_ni_cmp8s
#define cv_hal_cmp16u hal_ni_cmp16u
#define cv_hal_cmp16s hal_ni_cmp16s
#define cv_hal_cmp32s hal_ni_cmp32s
#define cv_hal_cmp32f hal_ni_cmp32f
#define cv_hal_cmp64f hal_ni_cmp64f

inline int hal_ni_mul8u(const uchar*, size_t, const uchar*, size_t, uchar*, size_t, int, int, double) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
inline int hal_ni_mul8s(const schar*, size_t, const schar*, size_t, schar*, size_t, int, int, double) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
inline int hal_ni_mul16u(const ushort*, size_t, const ushort*, size_t, ushort*, size_t, int, int, double) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
inline int hal_ni_mul16s(const short*, size_t, const short*, size_t, short*, size_t, int, int, double) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
inline int hal_ni_mul32s(const int*, size_t, const int*, size_t, int*, size_t, int, int, double) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
inline int hal_ni_mul32f(const float*, size_t, const float*, size_t, float*, size_t, int, int, double) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
inline int hal_ni_mul64f(const double*, size_t, const double*, size_t, double*, size_t, int, int, double) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
inline int hal_ni_div8u(const uchar*, size_t, const uchar*, size_t, uchar*, size_t, int, int, double) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
inline int hal_ni_div8s(const schar*, size_t, const schar*, size_t, schar*, size_t, int, int, double) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
inline int hal_ni_div16u(const ushort*, size_t, const ushort*, size_t, ushort*, size_t, int, int, double) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
inline int hal_ni_div16s(const short*, size_t, const short*, size_t, short*, size_t, int, int, double) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
inline int hal_ni_div32s(const int*, size_t, const int*, size_t, int*, size_t, int, int, double) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
inline int hal_ni_div32f(const float*, size_t, const float*, size_t, float*, size_t, int, int, double) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
inline int hal_ni_div64f(const double*, size_t, const double*, size_t, double*, size_t, int, int, double) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
inline int hal_ni_recip8u(const uchar*, size_t, const uchar*, size_t, uchar*, size_t, int, int, double) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
inline int hal_ni_recip8s(const schar*, size_t, const schar*, size_t, schar*, size_t, int, int, double) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
inline int hal_ni_recip16u(const ushort*, size_t, const ushort*, size_t, ushort*, size_t, int, int, double) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
inline int hal_ni_recip16s(const short*, size_t, const short*, size_t, short*, size_t, int, int, double) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
inline int hal_ni_recip32s(const int*, size_t, const int*, size_t, int*, size_t, int, int, double) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
inline int hal_ni_recip32f(const float*, size_t, const float*, size_t, float*, size_t, int, int, double) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
inline int hal_ni_recip64f(const double*, size_t, const double*, size_t, double*, size_t, int, int, double) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }

#define cv_hal_mul8u hal_ni_mul8u
#define cv_hal_mul8s hal_ni_mul8s
#define cv_hal_mul16u hal_ni_mul16u
#define cv_hal_mul16s hal_ni_mul16s
#define cv_hal_mul32s hal_ni_mul32s
#define cv_hal_mul32f hal_ni_mul32f
#define cv_hal_mul64f hal_ni_mul64f
#define cv_hal_div8u hal_ni_div8u
#define cv_hal_div8s hal_ni_div8s
#define cv_hal_div16u hal_ni_div16u
#define cv_hal_div16s hal_ni_div16s
#define cv_hal_div32s hal_ni_div32s
#define cv_hal_div32f hal_ni_div32f
#define cv_hal_div64f hal_ni_div64f
#define cv_hal_recip8u hal_ni_recip8u
#define cv_hal_recip8s hal_ni_recip8s
#define cv_hal_recip16u hal_ni_recip16u
#define cv_hal_recip16s hal_ni_recip16s
#define cv_hal_recip32s hal_ni_recip32s
#define cv_hal_recip32f hal_ni_recip32f
#define cv_hal_recip64f hal_ni_recip64f

inline int hal_ni_addWeighted8u(const uchar*, size_t, const uchar*, size_t, uchar*, size_t, int, int, const double*) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
inline int hal_ni_addWeighted8s(const schar*, size_t, const schar*, size_t, schar*, size_t, int, int, const double*) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
inline int hal_ni_addWeighted16u(const ushort*, size_t, const ushort*, size_t, ushort*, size_t, int, int, const double*) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
inline int hal_ni_addWeighted16s(const short*, size_t, const short*, size_t, short*, size_t, int, int, const double*) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
inline int hal_ni_addWeighted32s(const int*, size_t, const int*, size_t, int*, size_t, int, int, const double*) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
inline int hal_ni_addWeighted32f(const float*, size_t, const float*, size_t, float*, size_t, int, int, const double*) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
inline int hal_ni_addWeighted64f(const double*, size_t, const double*, size_t, double*, size_t, int, int, const double*) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }

#define cv_hal_addWeighted8u hal_ni_addWeighted8u
#define cv_hal_addWeighted8s hal_ni_addWeighted8s
#define cv_hal_addWeighted16u hal_ni_addWeighted16u
#define cv_hal_addWeighted16s hal_ni_addWeighted16s
#define cv_hal_addWeighted32s hal_ni_addWeighted32s
#define cv_hal_addWeighted32f hal_ni_addWeighted32f
#define cv_hal_addWeighted64f hal_ni_addWeighted64f

inline int hal_ni_split8u(const uchar*, uchar**, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
inline int hal_ni_split16u(const ushort*, ushort**, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
inline int hal_ni_split32s(const int*, int**, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
inline int hal_ni_split64s(const int64*, int64**, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }

#define cv_hal_split8u hal_ni_split8u
#define cv_hal_split16u hal_ni_split16u
#define cv_hal_split32s hal_ni_split32s
#define cv_hal_split64s hal_ni_split64s

inline int hal_ni_merge8u(const uchar**, uchar*, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
inline int hal_ni_merge16u(const ushort**, ushort*, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
inline int hal_ni_merge32s(const int**, int*, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
inline int hal_ni_merge64s(const int64**, int64*, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }

#define cv_hal_merge8u hal_ni_merge8u
#define cv_hal_merge16u hal_ni_merge16u
#define cv_hal_merge32s hal_ni_merge32s
#define cv_hal_merge64s hal_ni_merge64s

#include "custom_hal.hpp"

#endif
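The cv_hal_* names above are the customization points: every one of them defaults to a hal_ni_* stub that returns CV_HAL_ERROR_NOT_IMPLEMENTED, and a platform HAL can re-point a macro from its custom_hal.hpp so that OpenCV falls back to its built-in kernel only when the replacement declines. A hedged sketch of what such an override could look like (my_add8u is hypothetical and not part of this patch; steps are in bytes, which for 8-bit data equals elements):

// custom_hal.hpp (hypothetical) -- plug a platform-specific 8-bit add into the table above
#include <algorithm>
#include "opencv2/core/hal/interface.h"

inline int my_add8u(const uchar* a, size_t astep, const uchar* b, size_t bstep,
                    uchar* c, size_t cstep, int w, int h)
{
    for( int y = 0; y < h; y++, a += astep, b += bstep, c += cstep )
        for( int x = 0; x < w; x++ )
            c[x] = (uchar)std::min(a[x] + b[x], 255);   // saturating add
    return CV_HAL_ERROR_OK;                             // success -> OpenCV skips its own kernel
}

#undef  cv_hal_add8u
#define cv_hal_add8u my_add8u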
@@ -52,22 +52,22 @@ namespace cv

int LU(float* A, size_t astep, int m, float* b, size_t bstep, int n)
{
    return hal::LU(A, astep, m, b, bstep, n);
    return hal::LU32f(A, astep, m, b, bstep, n);
}

int LU(double* A, size_t astep, int m, double* b, size_t bstep, int n)
{
    return hal::LU(A, astep, m, b, bstep, n);
    return hal::LU64f(A, astep, m, b, bstep, n);
}

bool Cholesky(float* A, size_t astep, int m, float* b, size_t bstep, int n)
{
    return hal::Cholesky(A, astep, m, b, bstep, n);
    return hal::Cholesky32f(A, astep, m, b, bstep, n);
}

bool Cholesky(double* A, size_t astep, int m, double* b, size_t bstep, int n)
{
    return hal::Cholesky(A, astep, m, b, bstep, n);
    return hal::Cholesky64f(A, astep, m, b, bstep, n);
}

template<typename _Tp> static inline _Tp hypot(_Tp a, _Tp b)
@@ -740,7 +740,7 @@ double cv::determinant( InputArray _mat )
        Mat a(rows, rows, CV_32F, (uchar*)buffer);
        mat.copyTo(a);

        result = hal::LU(a.ptr<float>(), a.step, rows, 0, 0, 0);
        result = hal::LU32f(a.ptr<float>(), a.step, rows, 0, 0, 0);
        if( result )
        {
            for( int i = 0; i < rows; i++ )
@@ -764,7 +764,7 @@ double cv::determinant( InputArray _mat )
        Mat a(rows, rows, CV_64F, (uchar*)buffer);
        mat.copyTo(a);

        result = hal::LU(a.ptr<double>(), a.step, rows, 0, 0, 0);
        result = hal::LU64f(a.ptr<double>(), a.step, rows, 0, 0, 0);
        if( result )
        {
            for( int i = 0; i < rows; i++ )
@@ -1027,13 +1027,13 @@ double cv::invert( InputArray _src, OutputArray _dst, int method )
        setIdentity(dst);

        if( method == DECOMP_LU && type == CV_32F )
            result = hal::LU(src1.ptr<float>(), src1.step, n, dst.ptr<float>(), dst.step, n) != 0;
            result = hal::LU32f(src1.ptr<float>(), src1.step, n, dst.ptr<float>(), dst.step, n) != 0;
        else if( method == DECOMP_LU && type == CV_64F )
            result = hal::LU(src1.ptr<double>(), src1.step, n, dst.ptr<double>(), dst.step, n) != 0;
            result = hal::LU64f(src1.ptr<double>(), src1.step, n, dst.ptr<double>(), dst.step, n) != 0;
        else if( method == DECOMP_CHOLESKY && type == CV_32F )
            result = hal::Cholesky(src1.ptr<float>(), src1.step, n, dst.ptr<float>(), dst.step, n);
            result = hal::Cholesky32f(src1.ptr<float>(), src1.step, n, dst.ptr<float>(), dst.step, n);
        else
            result = hal::Cholesky(src1.ptr<double>(), src1.step, n, dst.ptr<double>(), dst.step, n);
            result = hal::Cholesky64f(src1.ptr<double>(), src1.step, n, dst.ptr<double>(), dst.step, n);

        if( !result )
            dst = Scalar(0);
@@ -1265,16 +1265,16 @@ bool cv::solve( InputArray _src, InputArray _src2arg, OutputArray _dst, int meth
        if( method == DECOMP_LU )
        {
            if( type == CV_32F )
                result = hal::LU(a.ptr<float>(), a.step, n, dst.ptr<float>(), dst.step, nb) != 0;
                result = hal::LU32f(a.ptr<float>(), a.step, n, dst.ptr<float>(), dst.step, nb) != 0;
            else
                result = hal::LU(a.ptr<double>(), a.step, n, dst.ptr<double>(), dst.step, nb) != 0;
                result = hal::LU64f(a.ptr<double>(), a.step, n, dst.ptr<double>(), dst.step, nb) != 0;
        }
        else if( method == DECOMP_CHOLESKY )
        {
            if( type == CV_32F )
                result = hal::Cholesky(a.ptr<float>(), a.step, n, dst.ptr<float>(), dst.step, nb);
                result = hal::Cholesky32f(a.ptr<float>(), a.step, n, dst.ptr<float>(), dst.step, nb);
            else
                result = hal::Cholesky(a.ptr<double>(), a.step, n, dst.ptr<double>(), dst.step, nb);
                result = hal::Cholesky64f(a.ptr<double>(), a.step, n, dst.ptr<double>(), dst.step, nb);
        }
        else
        {
@@ -191,13 +191,13 @@ void magnitude( InputArray src1, InputArray src2, OutputArray dst )
        {
            const float *x = (const float*)ptrs[0], *y = (const float*)ptrs[1];
            float *mag = (float*)ptrs[2];
            hal::magnitude( x, y, mag, len );
            hal::magnitude32f( x, y, mag, len );
        }
        else
        {
            const double *x = (const double*)ptrs[0], *y = (const double*)ptrs[1];
            double *mag = (double*)ptrs[2];
            hal::magnitude( x, y, mag, len );
            hal::magnitude64f( x, y, mag, len );
        }
    }
}
@@ -374,7 +374,7 @@ void cartToPolar( InputArray src1, InputArray src2,
        {
            const float *x = (const float*)ptrs[0], *y = (const float*)ptrs[1];
            float *mag = (float*)ptrs[2], *angle = (float*)ptrs[3];
            hal::magnitude( x, y, mag, len );
            hal::magnitude32f( x, y, mag, len );
            hal::fastAtan2( y, x, angle, len, angleInDegrees );
        }
        else
@@ -382,7 +382,7 @@ void cartToPolar( InputArray src1, InputArray src2,
            const double *x = (const double*)ptrs[0], *y = (const double*)ptrs[1];
            double *angle = (double*)ptrs[3];

            hal::magnitude(x, y, (double*)ptrs[2], len);
            hal::magnitude64f(x, y, (double*)ptrs[2], len);
            k = 0;

            #if CV_SSE2
@@ -760,7 +760,7 @@ static void Exp_32f_ipp(const float *x, float *y, int n)
        }
        setIppErrorStatus();
    }
    hal::exp(x, y, n);
    hal::exp32f(x, y, n);
}

static void Exp_64f_ipp(const double *x, double *y, int n)
@@ -774,14 +774,14 @@ static void Exp_64f_ipp(const double *x, double *y, int n)
        }
        setIppErrorStatus();
    }
    hal::exp(x, y, n);
    hal::exp64f(x, y, n);
}

#define Exp_32f Exp_32f_ipp
#define Exp_64f Exp_64f_ipp
#else
#define Exp_32f hal::exp
#define Exp_64f hal::exp
#define Exp_32f hal::exp32f
#define Exp_64f hal::exp64f
#endif


@@ -828,7 +828,7 @@ static void Log_32f_ipp(const float *x, float *y, int n)
        }
        setIppErrorStatus();
    }
    hal::log(x, y, n);
    hal::log32f(x, y, n);
}

static void Log_64f_ipp(const double *x, double *y, int n)
@@ -842,14 +842,14 @@ static void Log_64f_ipp(const double *x, double *y, int n)
        }
        setIppErrorStatus();
    }
    hal::log(x, y, n);
    hal::log64f(x, y, n);
}

#define Log_32f Log_32f_ipp
#define Log_64f Log_64f_ipp
#else
#define Log_32f hal::log
#define Log_64f hal::log
#define Log_32f hal::log32f
#define Log_64f hal::log64f
#endif

void log( InputArray _src, OutputArray _dst )
@@ -1356,10 +1356,10 @@ static bool ocl_pow(InputArray _src, double power, OutputArray _dst,

#endif

static void InvSqrt_32f(const float* src, float* dst, int n) { hal::invSqrt(src, dst, n); }
static void InvSqrt_64f(const double* src, double* dst, int n) { hal::invSqrt(src, dst, n); }
static void Sqrt_32f(const float* src, float* dst, int n) { hal::sqrt(src, dst, n); }
static void Sqrt_64f(const double* src, double* dst, int n) { hal::sqrt(src, dst, n); }
static void InvSqrt_32f(const float* src, float* dst, int n) { hal::invSqrt32f(src, dst, n); }
static void InvSqrt_64f(const double* src, double* dst, int n) { hal::invSqrt64f(src, dst, n); }
static void Sqrt_32f(const float* src, float* dst, int n) { hal::sqrt32f(src, dst, n); }
static void Sqrt_64f(const double* src, double* dst, int n) { hal::sqrt64f(src, dst, n); }

void pow( InputArray _src, double power, OutputArray _dst )
{
1460	modules/core/src/mathfuncs_core.cpp	Normal file
File diff suppressed because it is too large
231	modules/core/src/matrix_decomp.cpp	Normal file
@@ -0,0 +1,231 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                          License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009-2011, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistribution's of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistribution's in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/

#include "precomp.hpp"

namespace cv { namespace hal {

/****************************************************************************************\
*                     LU & Cholesky implementation for small matrices                   *
\****************************************************************************************/

template<typename _Tp> static inline int
LUImpl(_Tp* A, size_t astep, int m, _Tp* b, size_t bstep, int n, _Tp eps)
{
    int i, j, k, p = 1;
    astep /= sizeof(A[0]);
    bstep /= sizeof(b[0]);

    for( i = 0; i < m; i++ )
    {
        k = i;

        for( j = i+1; j < m; j++ )
            if( std::abs(A[j*astep + i]) > std::abs(A[k*astep + i]) )
                k = j;

        if( std::abs(A[k*astep + i]) < eps )
            return 0;

        if( k != i )
        {
            for( j = i; j < m; j++ )
                std::swap(A[i*astep + j], A[k*astep + j]);
            if( b )
                for( j = 0; j < n; j++ )
                    std::swap(b[i*bstep + j], b[k*bstep + j]);
            p = -p;
        }

        _Tp d = -1/A[i*astep + i];

        for( j = i+1; j < m; j++ )
        {
            _Tp alpha = A[j*astep + i]*d;

            for( k = i+1; k < m; k++ )
                A[j*astep + k] += alpha*A[i*astep + k];

            if( b )
                for( k = 0; k < n; k++ )
                    b[j*bstep + k] += alpha*b[i*bstep + k];
        }

        A[i*astep + i] = -d;
    }

    if( b )
    {
        for( i = m-1; i >= 0; i-- )
            for( j = 0; j < n; j++ )
            {
                _Tp s = b[i*bstep + j];
                for( k = i+1; k < m; k++ )
                    s -= A[i*astep + k]*b[k*bstep + j];
                b[i*bstep + j] = s*A[i*astep + i];
            }
    }

    return p;
}


int LU32f(float* A, size_t astep, int m, float* b, size_t bstep, int n)
{
    return LUImpl(A, astep, m, b, bstep, n, FLT_EPSILON*10);
}


int LU64f(double* A, size_t astep, int m, double* b, size_t bstep, int n)
{
    return LUImpl(A, astep, m, b, bstep, n, DBL_EPSILON*100);
}

template<typename _Tp> static inline bool
CholImpl(_Tp* A, size_t astep, int m, _Tp* b, size_t bstep, int n)
{
    _Tp* L = A;
    int i, j, k;
    double s;
    astep /= sizeof(A[0]);
    bstep /= sizeof(b[0]);

    for( i = 0; i < m; i++ )
    {
        for( j = 0; j < i; j++ )
        {
            s = A[i*astep + j];
            for( k = 0; k < j; k++ )
                s -= L[i*astep + k]*L[j*astep + k];
            L[i*astep + j] = (_Tp)(s*L[j*astep + j]);
        }
        s = A[i*astep + i];
        for( k = 0; k < j; k++ )
        {
            double t = L[i*astep + k];
            s -= t*t;
        }
        if( s < std::numeric_limits<_Tp>::epsilon() )
            return false;
        L[i*astep + i] = (_Tp)(1./std::sqrt(s));
    }

    if( !b )
        return true;

    // LLt x = b
    // 1: L y = b
    // 2. Lt x = y

    /*
        [ L00             ]  y0   b0
        [ L10 L11         ]  y1 = b1
        [ L20 L21 L22     ]  y2   b2
        [ L30 L31 L32 L33 ]  y3   b3

        [ L00 L10 L20 L30 ]  x0   y0
        [     L11 L21 L31 ]  x1 = y1
        [         L22 L32 ]  x2   y2
        [             L33 ]  x3   y3
    */

    for( i = 0; i < m; i++ )
    {
        for( j = 0; j < n; j++ )
        {
            s = b[i*bstep + j];
            for( k = 0; k < i; k++ )
                s -= L[i*astep + k]*b[k*bstep + j];
            b[i*bstep + j] = (_Tp)(s*L[i*astep + i]);
        }
    }

    for( i = m-1; i >= 0; i-- )
    {
        for( j = 0; j < n; j++ )
        {
            s = b[i*bstep + j];
            for( k = m-1; k > i; k-- )
                s -= L[k*astep + i]*b[k*bstep + j];
            b[i*bstep + j] = (_Tp)(s*L[i*astep + i]);
        }
    }

    return true;
}


bool Cholesky32f(float* A, size_t astep, int m, float* b, size_t bstep, int n)
{
    return CholImpl(A, astep, m, b, bstep, n);
}

bool Cholesky64f(double* A, size_t astep, int m, double* b, size_t bstep, int n)
{
    return CholImpl(A, astep, m, b, bstep, n);
}

//=============================================================================
// for compatibility with 3.0

int LU(float* A, size_t astep, int m, float* b, size_t bstep, int n)
{
    return LUImpl(A, astep, m, b, bstep, n, FLT_EPSILON*10);
}

int LU(double* A, size_t astep, int m, double* b, size_t bstep, int n)
{
    return LUImpl(A, astep, m, b, bstep, n, DBL_EPSILON*100);
}

bool Cholesky(float* A, size_t astep, int m, float* b, size_t bstep, int n)
{
    return CholImpl(A, astep, m, b, bstep, n);
}

bool Cholesky(double* A, size_t astep, int m, double* b, size_t bstep, int n)
{
    return CholImpl(A, astep, m, b, bstep, n);
}


}}
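For reference, a small hedged example of calling the new explicitly-typed entry point directly; the step arguments are in bytes, a zero return signals a (near-)singular matrix, and the header name is assumed from the includes added elsewhere in this patch:

#include <cstdio>
#include "opencv2/core/hal/hal.hpp"   // assumed to declare cv::hal::LU32f

int main()
{
    float A[3*3] = { 4.f, 1.f, 0.f,
                     1.f, 3.f, 1.f,
                     0.f, 1.f, 2.f };
    float b[3]   = { 1.f, 2.f, 3.f };
    // Solve A*x = b in place: on a non-zero return, b holds the solution x.
    int res = cv::hal::LU32f(A, 3*sizeof(float), 3, b, 1*sizeof(float), 1);
    std::printf("LU32f returned %d, x = (%g, %g, %g)\n", res, b[0], b[1], b[2]);
    return 0;
}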
412	modules/core/src/merge.cpp	Normal file
@@ -0,0 +1,412 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
|
||||
//
|
||||
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
|
||||
//
|
||||
// By downloading, copying, installing or using the software you agree to this license.
|
||||
// If you do not agree to this license, do not download, install,
|
||||
// copy or use the software.
|
||||
//
|
||||
//
|
||||
// License Agreement
|
||||
// For Open Source Computer Vision Library
|
||||
//
|
||||
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
|
||||
// Copyright (C) 2009-2011, Willow Garage Inc., all rights reserved.
|
||||
// Copyright (C) 2014-2015, Itseez Inc., all rights reserved.
|
||||
// Third party copyrights are property of their respective owners.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without modification,
|
||||
// are permitted provided that the following conditions are met:
|
||||
//
|
||||
// * Redistribution's of source code must retain the above copyright notice,
|
||||
// this list of conditions and the following disclaimer.
|
||||
//
|
||||
// * Redistribution's in binary form must reproduce the above copyright notice,
|
||||
// this list of conditions and the following disclaimer in the documentation
|
||||
// and/or other materials provided with the distribution.
|
||||
//
|
||||
// * The name of the copyright holders may not be used to endorse or promote products
|
||||
// derived from this software without specific prior written permission.
|
||||
//
|
||||
// This software is provided by the copyright holders and contributors "as is" and
|
||||
// any express or implied warranties, including, but not limited to, the implied
|
||||
// warranties of merchantability and fitness for a particular purpose are disclaimed.
|
||||
// In no event shall the Intel Corporation or contributors be liable for any direct,
|
||||
// indirect, incidental, special, exemplary, or consequential damages
|
||||
// (including, but not limited to, procurement of substitute goods or services;
|
||||
// loss of use, data, or profits; or business interruption) however caused
|
||||
// and on any theory of liability, whether in contract, strict liability,
|
||||
// or tort (including negligence or otherwise) arising in any way out of
|
||||
// the use of this software, even if advised of the possibility of such damage.
|
||||
//
|
||||
//M*/
|
||||
|
||||
#include "precomp.hpp"
|
||||
|
||||
namespace cv { namespace hal {
|
||||
|
||||
#if CV_NEON
|
||||
template<typename T> struct VMerge2;
|
||||
template<typename T> struct VMerge3;
|
||||
template<typename T> struct VMerge4;
|
||||
|
||||
#define MERGE2_KERNEL_TEMPLATE(name, data_type, reg_type, load_func, store_func) \
|
||||
template<> \
|
||||
struct name<data_type>{ \
|
||||
void operator()(const data_type* src0, const data_type* src1, \
|
||||
data_type* dst){ \
|
||||
reg_type r; \
|
||||
r.val[0] = load_func(src0); \
|
||||
r.val[1] = load_func(src1); \
|
||||
store_func(dst, r); \
|
||||
} \
|
||||
}
|
||||
|
||||
#define MERGE3_KERNEL_TEMPLATE(name, data_type, reg_type, load_func, store_func) \
|
||||
template<> \
|
||||
struct name<data_type>{ \
|
||||
void operator()(const data_type* src0, const data_type* src1, \
|
||||
const data_type* src2, data_type* dst){ \
|
||||
reg_type r; \
|
||||
r.val[0] = load_func(src0); \
|
||||
r.val[1] = load_func(src1); \
|
||||
r.val[2] = load_func(src2); \
|
||||
store_func(dst, r); \
|
||||
} \
|
||||
}
|
||||
|
||||
#define MERGE4_KERNEL_TEMPLATE(name, data_type, reg_type, load_func, store_func) \
|
||||
template<> \
|
||||
struct name<data_type>{ \
|
||||
void operator()(const data_type* src0, const data_type* src1, \
|
||||
const data_type* src2, const data_type* src3, \
|
||||
data_type* dst){ \
|
||||
reg_type r; \
|
||||
r.val[0] = load_func(src0); \
|
||||
r.val[1] = load_func(src1); \
|
||||
r.val[2] = load_func(src2); \
|
||||
r.val[3] = load_func(src3); \
|
||||
store_func(dst, r); \
|
||||
} \
|
||||
}
|
||||
|
||||
MERGE2_KERNEL_TEMPLATE(VMerge2, uchar , uint8x16x2_t, vld1q_u8 , vst2q_u8 );
|
||||
MERGE2_KERNEL_TEMPLATE(VMerge2, ushort, uint16x8x2_t, vld1q_u16, vst2q_u16);
|
||||
MERGE2_KERNEL_TEMPLATE(VMerge2, int , int32x4x2_t, vld1q_s32, vst2q_s32);
|
||||
MERGE2_KERNEL_TEMPLATE(VMerge2, int64 , int64x1x2_t, vld1_s64 , vst2_s64 );
|
||||
|
||||
MERGE3_KERNEL_TEMPLATE(VMerge3, uchar , uint8x16x3_t, vld1q_u8 , vst3q_u8 );
|
||||
MERGE3_KERNEL_TEMPLATE(VMerge3, ushort, uint16x8x3_t, vld1q_u16, vst3q_u16);
|
||||
MERGE3_KERNEL_TEMPLATE(VMerge3, int , int32x4x3_t, vld1q_s32, vst3q_s32);
|
||||
MERGE3_KERNEL_TEMPLATE(VMerge3, int64 , int64x1x3_t, vld1_s64 , vst3_s64 );
|
||||
|
||||
MERGE4_KERNEL_TEMPLATE(VMerge4, uchar , uint8x16x4_t, vld1q_u8 , vst4q_u8 );
|
||||
MERGE4_KERNEL_TEMPLATE(VMerge4, ushort, uint16x8x4_t, vld1q_u16, vst4q_u16);
|
||||
MERGE4_KERNEL_TEMPLATE(VMerge4, int , int32x4x4_t, vld1q_s32, vst4q_s32);
|
||||
MERGE4_KERNEL_TEMPLATE(VMerge4, int64 , int64x1x4_t, vld1_s64 , vst4_s64 );
|
||||
|
||||
#elif CV_SSE2
|
||||
|
||||
template <typename T>
|
||||
struct VMerge2
|
||||
{
|
||||
VMerge2() : support(false) { }
|
||||
void operator()(const T *, const T *, T *) const { }
|
||||
|
||||
bool support;
|
||||
};
|
||||
|
||||
template <typename T>
|
||||
struct VMerge3
|
||||
{
|
||||
VMerge3() : support(false) { }
|
||||
void operator()(const T *, const T *, const T *, T *) const { }
|
||||
|
||||
bool support;
|
||||
};
|
||||
|
||||
template <typename T>
|
||||
struct VMerge4
|
||||
{
|
||||
VMerge4() : support(false) { }
|
||||
void operator()(const T *, const T *, const T *, const T *, T *) const { }
|
||||
|
||||
bool support;
|
||||
};
|
||||
|
||||
#define MERGE2_KERNEL_TEMPLATE(data_type, reg_type, cast_type, _mm_interleave, flavor, se) \
|
||||
template <> \
|
||||
struct VMerge2<data_type> \
|
||||
{ \
|
||||
enum \
|
||||
{ \
|
||||
ELEMS_IN_VEC = 16 / sizeof(data_type) \
|
||||
}; \
|
||||
\
|
||||
VMerge2() \
|
||||
{ \
|
||||
support = checkHardwareSupport(se); \
|
||||
} \
|
||||
\
|
||||
void operator()(const data_type * src0, const data_type * src1, \
|
||||
data_type * dst) const \
|
||||
{ \
|
||||
reg_type v_src0 = _mm_loadu_##flavor((const cast_type *)(src0)); \
|
||||
reg_type v_src1 = _mm_loadu_##flavor((const cast_type *)(src0 + ELEMS_IN_VEC)); \
|
||||
reg_type v_src2 = _mm_loadu_##flavor((const cast_type *)(src1)); \
|
||||
reg_type v_src3 = _mm_loadu_##flavor((const cast_type *)(src1 + ELEMS_IN_VEC)); \
|
||||
\
|
||||
_mm_interleave(v_src0, v_src1, v_src2, v_src3); \
|
||||
\
|
||||
_mm_storeu_##flavor((cast_type *)(dst), v_src0); \
|
||||
_mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC), v_src1); \
|
||||
_mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 2), v_src2); \
|
||||
_mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 3), v_src3); \
|
||||
} \
|
||||
\
|
||||
bool support; \
|
||||
}
|
||||
|
||||
#define MERGE3_KERNEL_TEMPLATE(data_type, reg_type, cast_type, _mm_interleave, flavor, se) \
|
||||
template <> \
|
||||
struct VMerge3<data_type> \
|
||||
{ \
|
||||
enum \
|
||||
{ \
|
||||
ELEMS_IN_VEC = 16 / sizeof(data_type) \
|
||||
}; \
|
||||
\
|
||||
VMerge3() \
|
||||
{ \
|
||||
support = checkHardwareSupport(se); \
|
||||
} \
|
||||
\
|
||||
void operator()(const data_type * src0, const data_type * src1, const data_type * src2,\
|
||||
data_type * dst) const \
|
||||
{ \
|
||||
reg_type v_src0 = _mm_loadu_##flavor((const cast_type *)(src0)); \
|
||||
reg_type v_src1 = _mm_loadu_##flavor((const cast_type *)(src0 + ELEMS_IN_VEC)); \
|
||||
reg_type v_src2 = _mm_loadu_##flavor((const cast_type *)(src1)); \
|
||||
reg_type v_src3 = _mm_loadu_##flavor((const cast_type *)(src1 + ELEMS_IN_VEC)); \
|
||||
reg_type v_src4 = _mm_loadu_##flavor((const cast_type *)(src2)); \
|
||||
reg_type v_src5 = _mm_loadu_##flavor((const cast_type *)(src2 + ELEMS_IN_VEC)); \
|
||||
\
|
||||
_mm_interleave(v_src0, v_src1, v_src2, \
|
||||
v_src3, v_src4, v_src5); \
|
||||
\
|
||||
_mm_storeu_##flavor((cast_type *)(dst), v_src0); \
|
||||
_mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC), v_src1); \
|
||||
_mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 2), v_src2); \
|
||||
_mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 3), v_src3); \
|
||||
_mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 4), v_src4); \
|
||||
_mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 5), v_src5); \
|
||||
} \
|
||||
\
|
||||
bool support; \
|
||||
}
|
||||
|
||||
#define MERGE4_KERNEL_TEMPLATE(data_type, reg_type, cast_type, _mm_interleave, flavor, se) \
|
||||
template <> \
|
||||
struct VMerge4<data_type> \
|
||||
{ \
|
||||
enum \
|
||||
{ \
|
||||
ELEMS_IN_VEC = 16 / sizeof(data_type) \
|
||||
}; \
|
||||
\
|
||||
VMerge4() \
|
||||
{ \
|
||||
support = checkHardwareSupport(se); \
|
||||
} \
|
||||
\
|
||||
void operator()(const data_type * src0, const data_type * src1, \
|
||||
const data_type * src2, const data_type * src3, \
|
||||
data_type * dst) const \
|
||||
{ \
|
||||
reg_type v_src0 = _mm_loadu_##flavor((const cast_type *)(src0)); \
|
||||
reg_type v_src1 = _mm_loadu_##flavor((const cast_type *)(src0 + ELEMS_IN_VEC)); \
|
||||
reg_type v_src2 = _mm_loadu_##flavor((const cast_type *)(src1)); \
|
||||
reg_type v_src3 = _mm_loadu_##flavor((const cast_type *)(src1 + ELEMS_IN_VEC)); \
|
||||
reg_type v_src4 = _mm_loadu_##flavor((const cast_type *)(src2)); \
|
||||
reg_type v_src5 = _mm_loadu_##flavor((const cast_type *)(src2 + ELEMS_IN_VEC)); \
|
||||
reg_type v_src6 = _mm_loadu_##flavor((const cast_type *)(src3)); \
|
||||
reg_type v_src7 = _mm_loadu_##flavor((const cast_type *)(src3 + ELEMS_IN_VEC)); \
|
||||
\
|
||||
_mm_interleave(v_src0, v_src1, v_src2, v_src3, \
|
||||
v_src4, v_src5, v_src6, v_src7); \
|
||||
\
|
||||
_mm_storeu_##flavor((cast_type *)(dst), v_src0); \
|
||||
_mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC), v_src1); \
|
||||
_mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 2), v_src2); \
|
||||
_mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 3), v_src3); \
|
||||
_mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 4), v_src4); \
|
||||
_mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 5), v_src5); \
|
||||
_mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 6), v_src6); \
|
||||
_mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 7), v_src7); \
|
||||
} \
|
||||
\
|
||||
bool support; \
|
||||
}
|
||||
|
||||
MERGE2_KERNEL_TEMPLATE( uchar, __m128i, __m128i, _mm_interleave_epi8, si128, CV_CPU_SSE2);
|
||||
MERGE3_KERNEL_TEMPLATE( uchar, __m128i, __m128i, _mm_interleave_epi8, si128, CV_CPU_SSE2);
|
||||
MERGE4_KERNEL_TEMPLATE( uchar, __m128i, __m128i, _mm_interleave_epi8, si128, CV_CPU_SSE2);
|
||||
|
||||
#if CV_SSE4_1
|
||||
MERGE2_KERNEL_TEMPLATE(ushort, __m128i, __m128i, _mm_interleave_epi16, si128, CV_CPU_SSE4_1);
|
||||
MERGE3_KERNEL_TEMPLATE(ushort, __m128i, __m128i, _mm_interleave_epi16, si128, CV_CPU_SSE4_1);
|
||||
MERGE4_KERNEL_TEMPLATE(ushort, __m128i, __m128i, _mm_interleave_epi16, si128, CV_CPU_SSE4_1);
|
||||
#endif
|
||||
|
||||
MERGE2_KERNEL_TEMPLATE( int, __m128, float, _mm_interleave_ps, ps, CV_CPU_SSE2);
|
||||
MERGE3_KERNEL_TEMPLATE( int, __m128, float, _mm_interleave_ps, ps, CV_CPU_SSE2);
|
||||
MERGE4_KERNEL_TEMPLATE( int, __m128, float, _mm_interleave_ps, ps, CV_CPU_SSE2);
|
||||
|
||||
#endif
|
||||
|
||||
template<typename T> static void
|
||||
merge_( const T** src, T* dst, int len, int cn )
|
||||
{
|
||||
int k = cn % 4 ? cn % 4 : 4;
|
||||
int i, j;
|
||||
if( k == 1 )
|
||||
{
|
||||
const T* src0 = src[0];
|
||||
for( i = j = 0; i < len; i++, j += cn )
|
||||
dst[j] = src0[i];
|
||||
}
|
||||
else if( k == 2 )
|
||||
{
|
||||
const T *src0 = src[0], *src1 = src[1];
|
||||
i = j = 0;
|
||||
#if CV_NEON
|
||||
if(cn == 2)
|
||||
{
|
||||
int inc_i = (sizeof(T) == 8)? 1: 16/sizeof(T);
|
||||
int inc_j = 2 * inc_i;
|
||||
|
||||
VMerge2<T> vmerge;
|
||||
for( ; i < len - inc_i; i += inc_i, j += inc_j)
|
||||
vmerge(src0 + i, src1 + i, dst + j);
|
||||
}
|
||||
#elif CV_SSE2
|
||||
if(cn == 2)
|
||||
{
|
||||
int inc_i = 32/sizeof(T);
|
||||
int inc_j = 2 * inc_i;
|
||||
|
||||
VMerge2<T> vmerge;
|
||||
if (vmerge.support)
|
||||
for( ; i < len - inc_i; i += inc_i, j += inc_j)
|
||||
vmerge(src0 + i, src1 + i, dst + j);
|
||||
}
|
||||
#endif
|
||||
for( ; i < len; i++, j += cn )
|
||||
{
|
||||
dst[j] = src0[i];
|
||||
dst[j+1] = src1[i];
|
||||
}
|
||||
}
|
||||
else if( k == 3 )
|
||||
{
|
||||
const T *src0 = src[0], *src1 = src[1], *src2 = src[2];
|
||||
i = j = 0;
|
||||
#if CV_NEON
|
||||
if(cn == 3)
|
||||
{
|
||||
int inc_i = (sizeof(T) == 8)? 1: 16/sizeof(T);
|
||||
int inc_j = 3 * inc_i;
|
||||
|
||||
VMerge3<T> vmerge;
|
||||
for( ; i < len - inc_i; i += inc_i, j += inc_j)
|
||||
vmerge(src0 + i, src1 + i, src2 + i, dst + j);
|
||||
}
|
||||
#elif CV_SSE2
|
||||
if(cn == 3)
|
||||
{
|
||||
int inc_i = 32/sizeof(T);
|
||||
int inc_j = 3 * inc_i;
|
||||
|
||||
VMerge3<T> vmerge;
|
||||
if (vmerge.support)
|
||||
for( ; i < len - inc_i; i += inc_i, j += inc_j)
|
||||
vmerge(src0 + i, src1 + i, src2 + i, dst + j);
|
||||
}
|
||||
#endif
|
||||
for( ; i < len; i++, j += cn )
|
||||
{
|
||||
dst[j] = src0[i];
|
||||
dst[j+1] = src1[i];
|
||||
dst[j+2] = src2[i];
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
const T *src0 = src[0], *src1 = src[1], *src2 = src[2], *src3 = src[3];
|
||||
i = j = 0;
|
||||
#if CV_NEON
|
||||
if(cn == 4)
|
||||
{
|
||||
int inc_i = (sizeof(T) == 8)? 1: 16/sizeof(T);
|
||||
int inc_j = 4 * inc_i;
|
||||
|
||||
VMerge4<T> vmerge;
|
||||
for( ; i < len - inc_i; i += inc_i, j += inc_j)
|
||||
vmerge(src0 + i, src1 + i, src2 + i, src3 + i, dst + j);
|
||||
}
|
||||
#elif CV_SSE2
|
||||
if(cn == 4)
|
||||
{
|
||||
int inc_i = 32/sizeof(T);
|
||||
int inc_j = 4 * inc_i;
|
||||
|
||||
VMerge4<T> vmerge;
|
||||
if (vmerge.support)
|
||||
for( ; i < len - inc_i; i += inc_i, j += inc_j)
|
||||
vmerge(src0 + i, src1 + i, src2 + i, src3 + i, dst + j);
|
||||
}
|
||||
#endif
|
||||
for( ; i < len; i++, j += cn )
|
||||
{
|
||||
dst[j] = src0[i]; dst[j+1] = src1[i];
|
||||
dst[j+2] = src2[i]; dst[j+3] = src3[i];
|
||||
}
|
||||
}
|
||||
|
||||
for( ; k < cn; k += 4 )
|
||||
{
|
||||
const T *src0 = src[k], *src1 = src[k+1], *src2 = src[k+2], *src3 = src[k+3];
|
||||
for( i = 0, j = k; i < len; i++, j += cn )
|
||||
{
|
||||
dst[j] = src0[i]; dst[j+1] = src1[i];
|
||||
dst[j+2] = src2[i]; dst[j+3] = src3[i];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
void merge8u(const uchar** src, uchar* dst, int len, int cn )
|
||||
{
|
||||
CALL_HAL(merge8u, cv_hal_merge8u, src, dst, len, cn)
|
||||
merge_(src, dst, len, cn);
|
||||
}
|
||||
|
||||
void merge16u(const ushort** src, ushort* dst, int len, int cn )
|
||||
{
|
||||
CALL_HAL(merge16u, cv_hal_merge16u, src, dst, len, cn)
|
||||
merge_(src, dst, len, cn);
|
||||
}
|
||||
|
||||
void merge32s(const int** src, int* dst, int len, int cn )
|
||||
{
|
||||
CALL_HAL(merge32s, cv_hal_merge32s, src, dst, len, cn)
|
||||
merge_(src, dst, len, cn);
|
||||
}
|
||||
|
||||
void merge64s(const int64** src, int64* dst, int len, int cn )
|
||||
{
|
||||
CALL_HAL(merge64s, cv_hal_merge64s, src, dst, len, cn)
|
||||
merge_(src, dst, len, cn);
|
||||
}
|
||||
|
||||
}}
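A short usage sketch for the new interleaving entry point defined above; the array contents are purely illustrative and the header name is assumed:

#include "opencv2/core/hal/hal.hpp"   // assumed to declare cv::hal::merge8u

void demo_merge8u()
{
    uchar plane0[4] = { 0, 1, 2, 3 };
    uchar plane1[4] = { 10, 11, 12, 13 };
    const uchar* planes[2] = { plane0, plane1 };
    uchar interleaved[8];

    // len = 4 pixels, cn = 2 channels -> interleaved = {0,10, 1,11, 2,12, 3,13}
    cv::hal::merge8u(planes, interleaved, 4, 2);
}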
|
@@ -58,8 +58,6 @@
#include "opencv2/core/ocl.hpp"
#endif

#include "opencv2/hal.hpp"

#include <assert.h>
#include <ctype.h>
#include <float.h>
@@ -69,6 +67,27 @@
#include <stdlib.h>
#include <string.h>

#include <algorithm>
#include <cmath>
#include <cstdlib>
#include <limits>
#include <float.h>
#include <cstring>
#include <cassert>

#define USE_SSE2 (cv::checkHardwareSupport(CV_CPU_SSE))
#define USE_SSE4_2 (cv::checkHardwareSupport(CV_CPU_SSE4_2))
#define USE_AVX (cv::checkHardwareSupport(CV_CPU_AVX))
#define USE_AVX2 (cv::checkHardwareSupport(CV_CPU_AVX2))

#include "opencv2/core/hal/hal.hpp"
#include "opencv2/core/hal/intrin.hpp"
#include "opencv2/core/sse_utils.hpp"
#include "opencv2/core/neon_utils.hpp"

#include "arithm_core.hpp"
#include "hal_replacement.hpp"

#ifdef HAVE_TEGRA_OPTIMIZATION
#include "opencv2/core/core_tegra.hpp"
#else
@@ -78,6 +97,34 @@
namespace cv
{

// -128.f ... 255.f
extern const float g_8x32fTab[];
#define CV_8TO32F(x) cv::g_8x32fTab[(x)+128]

extern const ushort g_8x16uSqrTab[];
#define CV_SQR_8U(x) cv::g_8x16uSqrTab[(x)+255]

extern const uchar g_Saturate8u[];
#define CV_FAST_CAST_8U(t) (assert(-256 <= (t) && (t) <= 512), cv::g_Saturate8u[(t)+256])
#define CV_MIN_8U(a,b) ((a) - CV_FAST_CAST_8U((a) - (b)))
#define CV_MAX_8U(a,b) ((a) + CV_FAST_CAST_8U((b) - (a)))

template<> inline uchar OpAdd<uchar>::operator ()(uchar a, uchar b) const
{ return CV_FAST_CAST_8U(a + b); }

template<> inline uchar OpSub<uchar>::operator ()(uchar a, uchar b) const
{ return CV_FAST_CAST_8U(a - b); }

template<> inline short OpAbsDiff<short>::operator ()(short a, short b) const
{ return saturate_cast<short>(std::abs(a - b)); }

template<> inline schar OpAbsDiff<schar>::operator ()(schar a, schar b) const
{ return saturate_cast<schar>(std::abs(a - b)); }

template<> inline uchar OpMin<uchar>::operator ()(uchar a, uchar b) const { return CV_MIN_8U(a, b); }

template<> inline uchar OpMax<uchar>::operator ()(uchar a, uchar b) const { return CV_MAX_8U(a, b); }

typedef void (*BinaryFunc)(const uchar* src1, size_t step1,
                           const uchar* src2, size_t step2,
                           uchar* dst, size_t step, Size sz,
@@ -100,21 +147,6 @@ BinaryFunc getCopyMaskFunc(size_t esz);
/* maximal average node_count/hash_size ratio beyond which hash table is resized */
#define CV_SPARSE_HASH_RATIO 3



// -128.f ... 255.f
extern const float g_8x32fTab[];
#define CV_8TO32F(x) cv::g_8x32fTab[(x)+128]

extern const ushort g_8x16uSqrTab[];
#define CV_SQR_8U(x) cv::g_8x16uSqrTab[(x)+255]

extern const uchar g_Saturate8u[];
#define CV_FAST_CAST_8U(t) (assert(-256 <= (t) && (t) <= 512), cv::g_Saturate8u[(t)+256])
#define CV_MIN_8U(a,b) ((a) - CV_FAST_CAST_8U((a) - (b)))
#define CV_MAX_8U(a,b) ((a) + CV_FAST_CAST_8U((b) - (a)))


#if defined WIN32 || defined _WIN32
void deleteThreadAllocData();
#endif
@@ -282,6 +314,4 @@ cv::Mutex& getInitializationMutex();

}

#include "opencv2/hal/intrin.hpp"

#endif /*_CXCORE_INTERNAL_H_*/
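precomp.hpp now pulls in the universal-intrinsics header for all core sources. A hedged sketch of the kind of kernel this enables; the v_uint8x16/v_load/v_store names follow the 3.x universal-intrinsics API and the CV_SIMD128 guard is assumed, so details may differ:

#include <algorithm>
#include "opencv2/core/hal/intrin.hpp"

// Add two 8-bit rows with saturation, 16 pixels at a time where SIMD is available.
static void add_row_8u(const uchar* a, const uchar* b, uchar* dst, int len)
{
    int i = 0;
#if CV_SIMD128
    for( ; i <= len - 16; i += 16 )
    {
        cv::v_uint8x16 va = cv::v_load(a + i), vb = cv::v_load(b + i);
        cv::v_store(dst + i, va + vb);          // operator+ saturates for v_uint8x16
    }
#endif
    for( ; i < len; i++ )                       // scalar tail
        dst[i] = (uchar)std::min(a[i] + b[i], 255);
}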
428	modules/core/src/split.cpp	Normal file
@@ -0,0 +1,428 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
|
||||
//
|
||||
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
|
||||
//
|
||||
// By downloading, copying, installing or using the software you agree to this license.
|
||||
// If you do not agree to this license, do not download, install,
|
||||
// copy or use the software.
|
||||
//
|
||||
//
|
||||
// License Agreement
|
||||
// For Open Source Computer Vision Library
|
||||
//
|
||||
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
|
||||
// Copyright (C) 2009-2011, Willow Garage Inc., all rights reserved.
|
||||
// Copyright (C) 2014-2015, Itseez Inc., all rights reserved.
|
||||
// Third party copyrights are property of their respective owners.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without modification,
|
||||
// are permitted provided that the following conditions are met:
|
||||
//
|
||||
// * Redistribution's of source code must retain the above copyright notice,
|
||||
// this list of conditions and the following disclaimer.
|
||||
//
|
||||
// * Redistribution's in binary form must reproduce the above copyright notice,
|
||||
// this list of conditions and the following disclaimer in the documentation
|
||||
// and/or other materials provided with the distribution.
|
||||
//
|
||||
// * The name of the copyright holders may not be used to endorse or promote products
|
||||
// derived from this software without specific prior written permission.
|
||||
//
|
||||
// This software is provided by the copyright holders and contributors "as is" and
|
||||
// any express or implied warranties, including, but not limited to, the implied
|
||||
// warranties of merchantability and fitness for a particular purpose are disclaimed.
|
||||
// In no event shall the Intel Corporation or contributors be liable for any direct,
|
||||
// indirect, incidental, special, exemplary, or consequential damages
|
||||
// (including, but not limited to, procurement of substitute goods or services;
|
||||
// loss of use, data, or profits; or business interruption) however caused
|
||||
// and on any theory of liability, whether in contract, strict liability,
|
||||
// or tort (including negligence or otherwise) arising in any way out of
|
||||
// the use of this software, even if advised of the possibility of such damage.
|
||||
//
|
||||
//M*/
|
||||
|
||||
#include "precomp.hpp"
|
||||
|
||||
namespace cv { namespace hal {
|
||||
|
||||
#if CV_NEON
|
||||
template<typename T> struct VSplit2;
|
||||
template<typename T> struct VSplit3;
|
||||
template<typename T> struct VSplit4;
|
||||
|
||||
#define SPLIT2_KERNEL_TEMPLATE(name, data_type, reg_type, load_func, store_func) \
|
||||
template<> \
|
||||
struct name<data_type> \
|
||||
{ \
|
||||
void operator()(const data_type* src, data_type* dst0, \
|
||||
data_type* dst1) const \
|
||||
{ \
|
||||
reg_type r = load_func(src); \
|
||||
store_func(dst0, r.val[0]); \
|
||||
store_func(dst1, r.val[1]); \
|
||||
} \
|
||||
}
|
||||
|
||||
#define SPLIT3_KERNEL_TEMPLATE(name, data_type, reg_type, load_func, store_func) \
|
||||
template<> \
|
||||
struct name<data_type> \
|
||||
{ \
|
||||
void operator()(const data_type* src, data_type* dst0, data_type* dst1, \
|
||||
data_type* dst2) const \
|
||||
{ \
|
||||
reg_type r = load_func(src); \
|
||||
store_func(dst0, r.val[0]); \
|
||||
store_func(dst1, r.val[1]); \
|
||||
store_func(dst2, r.val[2]); \
|
||||
} \
|
||||
}
|
||||
|
||||
#define SPLIT4_KERNEL_TEMPLATE(name, data_type, reg_type, load_func, store_func) \
|
||||
template<> \
|
||||
struct name<data_type> \
|
||||
{ \
|
||||
void operator()(const data_type* src, data_type* dst0, data_type* dst1, \
|
||||
data_type* dst2, data_type* dst3) const \
|
||||
{ \
|
||||
reg_type r = load_func(src); \
|
||||
store_func(dst0, r.val[0]); \
|
||||
store_func(dst1, r.val[1]); \
|
||||
store_func(dst2, r.val[2]); \
|
||||
store_func(dst3, r.val[3]); \
|
||||
} \
|
||||
}
|
||||
|
||||
SPLIT2_KERNEL_TEMPLATE(VSplit2, uchar , uint8x16x2_t, vld2q_u8 , vst1q_u8 );
|
||||
SPLIT2_KERNEL_TEMPLATE(VSplit2, ushort, uint16x8x2_t, vld2q_u16, vst1q_u16);
|
||||
SPLIT2_KERNEL_TEMPLATE(VSplit2, int , int32x4x2_t, vld2q_s32, vst1q_s32);
|
||||
SPLIT2_KERNEL_TEMPLATE(VSplit2, int64 , int64x1x2_t, vld2_s64 , vst1_s64 );
|
||||
|
||||
SPLIT3_KERNEL_TEMPLATE(VSplit3, uchar , uint8x16x3_t, vld3q_u8 , vst1q_u8 );
|
||||
SPLIT3_KERNEL_TEMPLATE(VSplit3, ushort, uint16x8x3_t, vld3q_u16, vst1q_u16);
|
||||
SPLIT3_KERNEL_TEMPLATE(VSplit3, int , int32x4x3_t, vld3q_s32, vst1q_s32);
|
||||
SPLIT3_KERNEL_TEMPLATE(VSplit3, int64 , int64x1x3_t, vld3_s64 , vst1_s64 );
|
||||
|
||||
SPLIT4_KERNEL_TEMPLATE(VSplit4, uchar , uint8x16x4_t, vld4q_u8 , vst1q_u8 );
|
||||
SPLIT4_KERNEL_TEMPLATE(VSplit4, ushort, uint16x8x4_t, vld4q_u16, vst1q_u16);
|
||||
SPLIT4_KERNEL_TEMPLATE(VSplit4, int , int32x4x4_t, vld4q_s32, vst1q_s32);
|
||||
SPLIT4_KERNEL_TEMPLATE(VSplit4, int64 , int64x1x4_t, vld4_s64 , vst1_s64 );
|
||||
|
||||
#elif CV_SSE2
|
||||
|
||||
template <typename T>
|
||||
struct VSplit2
|
||||
{
|
||||
VSplit2() : support(false) { }
|
||||
void operator()(const T *, T *, T *) const { }
|
||||
|
||||
bool support;
|
||||
};
|
||||
|
||||
template <typename T>
|
||||
struct VSplit3
|
||||
{
|
||||
VSplit3() : support(false) { }
|
||||
void operator()(const T *, T *, T *, T *) const { }
|
||||
|
||||
bool support;
|
||||
};
|
||||
|
||||
template <typename T>
|
||||
struct VSplit4
|
||||
{
|
||||
VSplit4() : support(false) { }
|
||||
void operator()(const T *, T *, T *, T *, T *) const { }
|
||||
|
||||
bool support;
|
||||
};
|
||||
|
||||
#define SPLIT2_KERNEL_TEMPLATE(data_type, reg_type, cast_type, _mm_deinterleave, flavor) \
|
||||
template <> \
|
||||
struct VSplit2<data_type> \
|
||||
{ \
|
||||
enum \
|
||||
{ \
|
||||
ELEMS_IN_VEC = 16 / sizeof(data_type) \
|
||||
}; \
|
||||
\
|
||||
VSplit2() \
|
||||
{ \
|
||||
support = checkHardwareSupport(CV_CPU_SSE2); \
|
||||
} \
|
||||
\
|
||||
void operator()(const data_type * src, \
|
||||
data_type * dst0, data_type * dst1) const \
|
||||
{ \
|
||||
reg_type v_src0 = _mm_loadu_##flavor((cast_type const *)(src)); \
|
||||
reg_type v_src1 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC)); \
|
||||
reg_type v_src2 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 2)); \
|
||||
reg_type v_src3 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 3)); \
|
||||
\
|
||||
_mm_deinterleave(v_src0, v_src1, v_src2, v_src3); \
|
||||
\
|
||||
_mm_storeu_##flavor((cast_type *)(dst0), v_src0); \
|
||||
_mm_storeu_##flavor((cast_type *)(dst0 + ELEMS_IN_VEC), v_src1); \
|
||||
_mm_storeu_##flavor((cast_type *)(dst1), v_src2); \
|
||||
_mm_storeu_##flavor((cast_type *)(dst1 + ELEMS_IN_VEC), v_src3); \
|
||||
} \
|
||||
\
|
||||
bool support; \
|
||||
}
|
||||
|
||||
#define SPLIT3_KERNEL_TEMPLATE(data_type, reg_type, cast_type, _mm_deinterleave, flavor) \
|
||||
template <> \
|
||||
struct VSplit3<data_type> \
|
||||
{ \
|
||||
enum \
|
||||
{ \
|
||||
ELEMS_IN_VEC = 16 / sizeof(data_type) \
|
||||
}; \
|
||||
\
|
||||
VSplit3() \
|
||||
{ \
|
||||
support = checkHardwareSupport(CV_CPU_SSE2); \
|
||||
} \
|
||||
\
|
||||
void operator()(const data_type * src, \
|
||||
data_type * dst0, data_type * dst1, data_type * dst2) const \
|
||||
{ \
|
||||
reg_type v_src0 = _mm_loadu_##flavor((cast_type const *)(src)); \
|
||||
reg_type v_src1 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC)); \
|
||||
reg_type v_src2 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 2)); \
|
||||
reg_type v_src3 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 3)); \
|
||||
reg_type v_src4 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 4)); \
|
||||
reg_type v_src5 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 5)); \
|
||||
\
|
||||
_mm_deinterleave(v_src0, v_src1, v_src2, \
|
||||
v_src3, v_src4, v_src5); \
|
||||
\
|
||||
_mm_storeu_##flavor((cast_type *)(dst0), v_src0); \
|
||||
_mm_storeu_##flavor((cast_type *)(dst0 + ELEMS_IN_VEC), v_src1); \
|
||||
_mm_storeu_##flavor((cast_type *)(dst1), v_src2); \
|
||||
_mm_storeu_##flavor((cast_type *)(dst1 + ELEMS_IN_VEC), v_src3); \
|
||||
_mm_storeu_##flavor((cast_type *)(dst2), v_src4); \
|
||||
_mm_storeu_##flavor((cast_type *)(dst2 + ELEMS_IN_VEC), v_src5); \
|
||||
} \
|
||||
\
|
||||
bool support; \
|
||||
}
|
||||
|
||||
#define SPLIT4_KERNEL_TEMPLATE(data_type, reg_type, cast_type, _mm_deinterleave, flavor) \
|
||||
template <> \
|
||||
struct VSplit4<data_type> \
|
||||
{ \
|
||||
enum \
|
||||
{ \
|
||||
ELEMS_IN_VEC = 16 / sizeof(data_type) \
|
||||
}; \
|
||||
\
|
||||
VSplit4() \
|
||||
{ \
|
||||
support = checkHardwareSupport(CV_CPU_SSE2); \
|
||||
} \
|
||||
\
|
||||
void operator()(const data_type * src, data_type * dst0, data_type * dst1, \
|
||||
data_type * dst2, data_type * dst3) const \
|
||||
{ \
|
||||
reg_type v_src0 = _mm_loadu_##flavor((cast_type const *)(src)); \
|
||||
reg_type v_src1 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC)); \
|
||||
reg_type v_src2 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 2)); \
|
||||
reg_type v_src3 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 3)); \
|
||||
reg_type v_src4 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 4)); \
|
||||
reg_type v_src5 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 5)); \
|
||||
reg_type v_src6 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 6)); \
|
||||
reg_type v_src7 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 7)); \
|
||||
\
|
||||
_mm_deinterleave(v_src0, v_src1, v_src2, v_src3, \
|
||||
v_src4, v_src5, v_src6, v_src7); \
|
||||
\
|
||||
_mm_storeu_##flavor((cast_type *)(dst0), v_src0); \
|
||||
_mm_storeu_##flavor((cast_type *)(dst0 + ELEMS_IN_VEC), v_src1); \
|
||||
_mm_storeu_##flavor((cast_type *)(dst1), v_src2); \
|
||||
_mm_storeu_##flavor((cast_type *)(dst1 + ELEMS_IN_VEC), v_src3); \
|
||||
_mm_storeu_##flavor((cast_type *)(dst2), v_src4); \
|
||||
_mm_storeu_##flavor((cast_type *)(dst2 + ELEMS_IN_VEC), v_src5); \
|
||||
_mm_storeu_##flavor((cast_type *)(dst3), v_src6); \
|
||||
_mm_storeu_##flavor((cast_type *)(dst3 + ELEMS_IN_VEC), v_src7); \
|
||||
} \
|
||||
\
|
||||
bool support; \
|
||||
}
|
||||
|
||||
SPLIT2_KERNEL_TEMPLATE( uchar, __m128i, __m128i, _mm_deinterleave_epi8, si128);
|
||||
SPLIT2_KERNEL_TEMPLATE(ushort, __m128i, __m128i, _mm_deinterleave_epi16, si128);
|
||||
SPLIT2_KERNEL_TEMPLATE( int, __m128, float, _mm_deinterleave_ps, ps);
|
||||
|
||||
SPLIT3_KERNEL_TEMPLATE( uchar, __m128i, __m128i, _mm_deinterleave_epi8, si128);
|
||||
SPLIT3_KERNEL_TEMPLATE(ushort, __m128i, __m128i, _mm_deinterleave_epi16, si128);
|
||||
SPLIT3_KERNEL_TEMPLATE( int, __m128, float, _mm_deinterleave_ps, ps);
|
||||
|
||||
SPLIT4_KERNEL_TEMPLATE( uchar, __m128i, __m128i, _mm_deinterleave_epi8, si128);
|
||||
SPLIT4_KERNEL_TEMPLATE(ushort, __m128i, __m128i, _mm_deinterleave_epi16, si128);
|
||||
SPLIT4_KERNEL_TEMPLATE( int, __m128, float, _mm_deinterleave_ps, ps);
|
||||
|
||||
#endif
|
||||
|
||||
template<typename T> static void
|
||||
split_( const T* src, T** dst, int len, int cn )
|
||||
{
|
||||
int k = cn % 4 ? cn % 4 : 4;
|
||||
int i, j;
|
||||
if( k == 1 )
|
||||
{
|
||||
T* dst0 = dst[0];
|
||||
|
||||
if(cn == 1)
|
||||
{
|
||||
memcpy(dst0, src, len * sizeof(T));
|
||||
}
|
||||
else
|
||||
{
|
||||
for( i = 0, j = 0 ; i < len; i++, j += cn )
|
||||
dst0[i] = src[j];
|
||||
}
|
||||
}
|
||||
else if( k == 2 )
|
||||
{
|
||||
T *dst0 = dst[0], *dst1 = dst[1];
|
||||
i = j = 0;
|
||||
|
||||
#if CV_NEON
|
||||
if(cn == 2)
|
||||
{
|
||||
int inc_i = (sizeof(T) == 8)? 1: 16/sizeof(T);
|
||||
int inc_j = 2 * inc_i;
|
||||
|
||||
VSplit2<T> vsplit;
|
||||
for( ; i < len - inc_i; i += inc_i, j += inc_j)
|
||||
vsplit(src + j, dst0 + i, dst1 + i);
|
||||
}
|
||||
#elif CV_SSE2
|
||||
if (cn == 2)
|
||||
{
|
||||
int inc_i = 32/sizeof(T);
|
||||
int inc_j = 2 * inc_i;
|
||||
|
||||
VSplit2<T> vsplit;
|
||||
if (vsplit.support)
|
||||
{
|
||||
for( ; i <= len - inc_i; i += inc_i, j += inc_j)
|
||||
vsplit(src + j, dst0 + i, dst1 + i);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
for( ; i < len; i++, j += cn )
|
||||
{
|
||||
dst0[i] = src[j];
|
||||
dst1[i] = src[j+1];
|
||||
}
|
||||
}
|
||||
else if( k == 3 )
|
||||
{
|
||||
T *dst0 = dst[0], *dst1 = dst[1], *dst2 = dst[2];
|
||||
i = j = 0;
|
||||
|
||||
#if CV_NEON
|
||||
if(cn == 3)
|
||||
{
|
||||
int inc_i = (sizeof(T) == 8)? 1: 16/sizeof(T);
|
||||
int inc_j = 3 * inc_i;
|
||||
|
||||
VSplit3<T> vsplit;
|
||||
for( ; i <= len - inc_i; i += inc_i, j += inc_j)
|
||||
vsplit(src + j, dst0 + i, dst1 + i, dst2 + i);
|
||||
}
|
||||
#elif CV_SSE2
|
||||
if (cn == 3)
|
||||
{
|
||||
int inc_i = 32/sizeof(T);
|
||||
int inc_j = 3 * inc_i;
|
||||
|
||||
VSplit3<T> vsplit;
|
||||
|
||||
if (vsplit.support)
|
||||
{
|
||||
for( ; i <= len - inc_i; i += inc_i, j += inc_j)
|
||||
vsplit(src + j, dst0 + i, dst1 + i, dst2 + i);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
for( ; i < len; i++, j += cn )
|
||||
{
|
||||
dst0[i] = src[j];
|
||||
dst1[i] = src[j+1];
|
||||
dst2[i] = src[j+2];
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
T *dst0 = dst[0], *dst1 = dst[1], *dst2 = dst[2], *dst3 = dst[3];
|
||||
i = j = 0;
|
||||
|
||||
#if CV_NEON
|
||||
if(cn == 4)
|
||||
{
|
||||
int inc_i = (sizeof(T) == 8)? 1: 16/sizeof(T);
|
||||
int inc_j = 4 * inc_i;
|
||||
|
||||
VSplit4<T> vsplit;
|
||||
for( ; i <= len - inc_i; i += inc_i, j += inc_j)
|
||||
vsplit(src + j, dst0 + i, dst1 + i, dst2 + i, dst3 + i);
|
||||
}
|
||||
#elif CV_SSE2
|
||||
if (cn == 4)
|
||||
{
|
||||
int inc_i = 32/sizeof(T);
|
||||
int inc_j = 4 * inc_i;
|
||||
|
||||
VSplit4<T> vsplit;
|
||||
if (vsplit.support)
|
||||
{
|
||||
for( ; i <= len - inc_i; i += inc_i, j += inc_j)
|
||||
vsplit(src + j, dst0 + i, dst1 + i, dst2 + i, dst3 + i);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
for( ; i < len; i++, j += cn )
|
||||
{
|
||||
dst0[i] = src[j]; dst1[i] = src[j+1];
|
||||
dst2[i] = src[j+2]; dst3[i] = src[j+3];
|
||||
}
|
||||
}
|
||||
|
||||
for( ; k < cn; k += 4 )
|
||||
{
|
||||
T *dst0 = dst[k], *dst1 = dst[k+1], *dst2 = dst[k+2], *dst3 = dst[k+3];
|
||||
for( i = 0, j = k; i < len; i++, j += cn )
|
||||
{
|
||||
dst0[i] = src[j]; dst1[i] = src[j+1];
|
||||
dst2[i] = src[j+2]; dst3[i] = src[j+3];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void split8u(const uchar* src, uchar** dst, int len, int cn )
|
||||
{
|
||||
CALL_HAL(split8u, cv_hal_split8u, src,dst, len, cn)
|
||||
split_(src, dst, len, cn);
|
||||
}
|
||||
|
||||
void split16u(const ushort* src, ushort** dst, int len, int cn )
|
||||
{
|
||||
CALL_HAL(split16u, cv_hal_split16u, src,dst, len, cn)
|
||||
split_(src, dst, len, cn);
|
||||
}
|
||||
|
||||
void split32s(const int* src, int** dst, int len, int cn )
|
||||
{
|
||||
CALL_HAL(split32s, cv_hal_split32s, src,dst, len, cn)
|
||||
split_(src, dst, len, cn);
|
||||
}
|
||||
|
||||
void split64s(const int64* src, int64** dst, int len, int cn )
|
||||
{
|
||||
CALL_HAL(split64s, cv_hal_split64s, src,dst, len, cn)
|
||||
split_(src, dst, len, cn);
|
||||
}
|
||||
|
||||
}}
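A minimal usage sketch (not part of the patch) of the new cv::hal::split8u entry point added above; the header path, buffer sizes and pixel values are illustrative only:

    #include <vector>
    #include "opencv2/core/hal/hal.hpp"   // assumed location of the HAL declarations

    void demo_split8u()
    {
        const int len = 640;                      // pixels in one row
        std::vector<uchar> bgr(len * 3, 127);     // interleaved B,G,R,B,G,R,...
        std::vector<uchar> b(len), g(len), r(len);
        uchar* planes[3] = { b.data(), g.data(), r.data() };

        // Dispatches to a custom cv_hal_split8u if one is plugged in,
        // otherwise falls back to the split_<uchar> template defined above.
        cv::hal::split8u(bgr.data(), planes, len, 3);
    }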
|
@@ -3996,3 +3996,266 @@ cvNorm( const void* imgA, const void* imgB, int normType, const void* maskarr )
|
||||
|
||||
return !maskarr ? cv::norm(a, b, normType) : cv::norm(a, b, normType, mask);
|
||||
}
|
||||
|
||||
namespace cv { namespace hal {
|
||||
|
||||
static const uchar popCountTable[] =
|
||||
{
|
||||
0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
|
||||
1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
|
||||
1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
|
||||
2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
|
||||
1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
|
||||
2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
|
||||
2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
|
||||
3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8
|
||||
};
|
||||
|
||||
static const uchar popCountTable2[] =
|
||||
{
|
||||
0, 1, 1, 1, 1, 2, 2, 2, 1, 2, 2, 2, 1, 2, 2, 2, 1, 2, 2, 2, 2, 3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3,
|
||||
1, 2, 2, 2, 2, 3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 1, 2, 2, 2, 2, 3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3,
|
||||
1, 2, 2, 2, 2, 3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 3, 4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4,
|
||||
2, 3, 3, 3, 3, 4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4, 2, 3, 3, 3, 3, 4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4,
|
||||
1, 2, 2, 2, 2, 3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 3, 4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4,
|
||||
2, 3, 3, 3, 3, 4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4, 2, 3, 3, 3, 3, 4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4,
|
||||
1, 2, 2, 2, 2, 3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 3, 4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4,
|
||||
2, 3, 3, 3, 3, 4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4, 2, 3, 3, 3, 3, 4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4
|
||||
};
|
||||
|
||||
static const uchar popCountTable4[] =
|
||||
{
|
||||
0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
|
||||
1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
|
||||
1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
|
||||
1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
|
||||
1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
|
||||
1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
|
||||
1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
|
||||
1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
|
||||
};
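Illustration (not part of the patch) of what the three lookup tables encode; the example byte is arbitrary:

    // popCountTable[x]  - number of set bits in byte x
    // popCountTable2[x] - number of non-zero 2-bit cells in byte x
    // popCountTable4[x] - number of non-zero 4-bit cells (nibbles) in byte x
    // For x = 0x63 (binary 01 10 00 11):
    //   popCountTable [0x63] == 4   // four set bits
    //   popCountTable2[0x63] == 3   // cells 01, 10 and 11 are non-zero
    //   popCountTable4[0x63] == 2   // both nibbles (0x6, 0x3) are non-zero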
|
||||
|
||||
int normHamming(const uchar* a, int n)
|
||||
{
|
||||
int i = 0;
|
||||
int result = 0;
|
||||
#if CV_NEON
|
||||
{
|
||||
uint32x4_t bits = vmovq_n_u32(0);
|
||||
for (; i <= n - 16; i += 16) {
|
||||
uint8x16_t A_vec = vld1q_u8 (a + i);
|
||||
uint8x16_t bitsSet = vcntq_u8 (A_vec);
|
||||
uint16x8_t bitSet8 = vpaddlq_u8 (bitsSet);
|
||||
uint32x4_t bitSet4 = vpaddlq_u16 (bitSet8);
|
||||
bits = vaddq_u32(bits, bitSet4);
|
||||
}
|
||||
uint64x2_t bitSet2 = vpaddlq_u32 (bits);
|
||||
result = vgetq_lane_s32 (vreinterpretq_s32_u64(bitSet2),0);
|
||||
result += vgetq_lane_s32 (vreinterpretq_s32_u64(bitSet2),2);
|
||||
}
|
||||
#endif
|
||||
for( ; i <= n - 4; i += 4 )
|
||||
result += popCountTable[a[i]] + popCountTable[a[i+1]] +
|
||||
popCountTable[a[i+2]] + popCountTable[a[i+3]];
|
||||
for( ; i < n; i++ )
|
||||
result += popCountTable[a[i]];
|
||||
return result;
|
||||
}
|
||||
|
||||
int normHamming(const uchar* a, const uchar* b, int n)
|
||||
{
|
||||
int i = 0;
|
||||
int result = 0;
|
||||
#if CV_NEON
|
||||
{
|
||||
uint32x4_t bits = vmovq_n_u32(0);
|
||||
for (; i <= n - 16; i += 16) {
|
||||
uint8x16_t A_vec = vld1q_u8 (a + i);
|
||||
uint8x16_t B_vec = vld1q_u8 (b + i);
|
||||
uint8x16_t AxorB = veorq_u8 (A_vec, B_vec);
|
||||
uint8x16_t bitsSet = vcntq_u8 (AxorB);
|
||||
uint16x8_t bitSet8 = vpaddlq_u8 (bitsSet);
|
||||
uint32x4_t bitSet4 = vpaddlq_u16 (bitSet8);
|
||||
bits = vaddq_u32(bits, bitSet4);
|
||||
}
|
||||
uint64x2_t bitSet2 = vpaddlq_u32 (bits);
|
||||
result = vgetq_lane_s32 (vreinterpretq_s32_u64(bitSet2),0);
|
||||
result += vgetq_lane_s32 (vreinterpretq_s32_u64(bitSet2),2);
|
||||
}
|
||||
#endif
|
||||
for( ; i <= n - 4; i += 4 )
|
||||
result += popCountTable[a[i] ^ b[i]] + popCountTable[a[i+1] ^ b[i+1]] +
|
||||
popCountTable[a[i+2] ^ b[i+2]] + popCountTable[a[i+3] ^ b[i+3]];
|
||||
for( ; i < n; i++ )
|
||||
result += popCountTable[a[i] ^ b[i]];
|
||||
return result;
|
||||
}
|
||||
|
||||
int normHamming(const uchar* a, int n, int cellSize)
|
||||
{
|
||||
if( cellSize == 1 )
|
||||
return normHamming(a, n);
|
||||
const uchar* tab = 0;
|
||||
if( cellSize == 2 )
|
||||
tab = popCountTable2;
|
||||
else if( cellSize == 4 )
|
||||
tab = popCountTable4;
|
||||
else
|
||||
return -1;
|
||||
int i = 0;
|
||||
int result = 0;
|
||||
#if CV_ENABLE_UNROLLED
|
||||
for( ; i <= n - 4; i += 4 )
|
||||
result += tab[a[i]] + tab[a[i+1]] + tab[a[i+2]] + tab[a[i+3]];
|
||||
#endif
|
||||
for( ; i < n; i++ )
|
||||
result += tab[a[i]];
|
||||
return result;
|
||||
}
|
||||
|
||||
int normHamming(const uchar* a, const uchar* b, int n, int cellSize)
|
||||
{
|
||||
if( cellSize == 1 )
|
||||
return normHamming(a, b, n);
|
||||
const uchar* tab = 0;
|
||||
if( cellSize == 2 )
|
||||
tab = popCountTable2;
|
||||
else if( cellSize == 4 )
|
||||
tab = popCountTable4;
|
||||
else
|
||||
return -1;
|
||||
int i = 0;
|
||||
int result = 0;
|
||||
#if CV_ENABLE_UNROLLED
|
||||
for( ; i <= n - 4; i += 4 )
|
||||
result += tab[a[i] ^ b[i]] + tab[a[i+1] ^ b[i+1]] +
|
||||
tab[a[i+2] ^ b[i+2]] + tab[a[i+3] ^ b[i+3]];
|
||||
#endif
|
||||
for( ; i < n; i++ )
|
||||
result += tab[a[i] ^ b[i]];
|
||||
return result;
|
||||
}
|
||||
|
||||
float normL2Sqr_(const float* a, const float* b, int n)
|
||||
{
|
||||
int j = 0; float d = 0.f;
|
||||
#if CV_SSE
|
||||
float CV_DECL_ALIGNED(16) buf[4];
|
||||
__m128 d0 = _mm_setzero_ps(), d1 = _mm_setzero_ps();
|
||||
|
||||
for( ; j <= n - 8; j += 8 )
|
||||
{
|
||||
__m128 t0 = _mm_sub_ps(_mm_loadu_ps(a + j), _mm_loadu_ps(b + j));
|
||||
__m128 t1 = _mm_sub_ps(_mm_loadu_ps(a + j + 4), _mm_loadu_ps(b + j + 4));
|
||||
d0 = _mm_add_ps(d0, _mm_mul_ps(t0, t0));
|
||||
d1 = _mm_add_ps(d1, _mm_mul_ps(t1, t1));
|
||||
}
|
||||
_mm_store_ps(buf, _mm_add_ps(d0, d1));
|
||||
d = buf[0] + buf[1] + buf[2] + buf[3];
|
||||
#endif
|
||||
{
|
||||
for( ; j <= n - 4; j += 4 )
|
||||
{
|
||||
float t0 = a[j] - b[j], t1 = a[j+1] - b[j+1], t2 = a[j+2] - b[j+2], t3 = a[j+3] - b[j+3];
|
||||
d += t0*t0 + t1*t1 + t2*t2 + t3*t3;
|
||||
}
|
||||
}
|
||||
|
||||
for( ; j < n; j++ )
|
||||
{
|
||||
float t = a[j] - b[j];
|
||||
d += t*t;
|
||||
}
|
||||
return d;
|
||||
}
|
||||
|
||||
|
||||
float normL1_(const float* a, const float* b, int n)
|
||||
{
|
||||
int j = 0; float d = 0.f;
|
||||
#if CV_SSE
|
||||
float CV_DECL_ALIGNED(16) buf[4];
|
||||
static const int CV_DECL_ALIGNED(16) absbuf[4] = {0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff};
|
||||
__m128 d0 = _mm_setzero_ps(), d1 = _mm_setzero_ps();
|
||||
__m128 absmask = _mm_load_ps((const float*)absbuf);
|
||||
|
||||
for( ; j <= n - 8; j += 8 )
|
||||
{
|
||||
__m128 t0 = _mm_sub_ps(_mm_loadu_ps(a + j), _mm_loadu_ps(b + j));
|
||||
__m128 t1 = _mm_sub_ps(_mm_loadu_ps(a + j + 4), _mm_loadu_ps(b + j + 4));
|
||||
d0 = _mm_add_ps(d0, _mm_and_ps(t0, absmask));
|
||||
d1 = _mm_add_ps(d1, _mm_and_ps(t1, absmask));
|
||||
}
|
||||
_mm_store_ps(buf, _mm_add_ps(d0, d1));
|
||||
d = buf[0] + buf[1] + buf[2] + buf[3];
|
||||
#elif CV_NEON
|
||||
float32x4_t v_sum = vdupq_n_f32(0.0f);
|
||||
for ( ; j <= n - 4; j += 4)
|
||||
v_sum = vaddq_f32(v_sum, vabdq_f32(vld1q_f32(a + j), vld1q_f32(b + j)));
|
||||
|
||||
float CV_DECL_ALIGNED(16) buf[4];
|
||||
vst1q_f32(buf, v_sum);
|
||||
d = buf[0] + buf[1] + buf[2] + buf[3];
|
||||
#endif
|
||||
{
|
||||
for( ; j <= n - 4; j += 4 )
|
||||
{
|
||||
d += std::abs(a[j] - b[j]) + std::abs(a[j+1] - b[j+1]) +
|
||||
std::abs(a[j+2] - b[j+2]) + std::abs(a[j+3] - b[j+3]);
|
||||
}
|
||||
}
|
||||
|
||||
for( ; j < n; j++ )
|
||||
d += std::abs(a[j] - b[j]);
|
||||
return d;
|
||||
}
|
||||
|
||||
int normL1_(const uchar* a, const uchar* b, int n)
|
||||
{
|
||||
int j = 0, d = 0;
|
||||
#if CV_SSE
|
||||
__m128i d0 = _mm_setzero_si128();
|
||||
|
||||
for( ; j <= n - 16; j += 16 )
|
||||
{
|
||||
__m128i t0 = _mm_loadu_si128((const __m128i*)(a + j));
|
||||
__m128i t1 = _mm_loadu_si128((const __m128i*)(b + j));
|
||||
|
||||
d0 = _mm_add_epi32(d0, _mm_sad_epu8(t0, t1));
|
||||
}
|
||||
|
||||
for( ; j <= n - 4; j += 4 )
|
||||
{
|
||||
__m128i t0 = _mm_cvtsi32_si128(*(const int*)(a + j));
|
||||
__m128i t1 = _mm_cvtsi32_si128(*(const int*)(b + j));
|
||||
|
||||
d0 = _mm_add_epi32(d0, _mm_sad_epu8(t0, t1));
|
||||
}
|
||||
d = _mm_cvtsi128_si32(_mm_add_epi32(d0, _mm_unpackhi_epi64(d0, d0)));
|
||||
#elif CV_NEON
|
||||
uint32x4_t v_sum = vdupq_n_u32(0.0f);
|
||||
for ( ; j <= n - 16; j += 16)
|
||||
{
|
||||
uint8x16_t v_dst = vabdq_u8(vld1q_u8(a + j), vld1q_u8(b + j));
|
||||
uint16x8_t v_low = vmovl_u8(vget_low_u8(v_dst)), v_high = vmovl_u8(vget_high_u8(v_dst));
|
||||
v_sum = vaddq_u32(v_sum, vaddl_u16(vget_low_u16(v_low), vget_low_u16(v_high)));
|
||||
v_sum = vaddq_u32(v_sum, vaddl_u16(vget_high_u16(v_low), vget_high_u16(v_high)));
|
||||
}
|
||||
|
||||
uint CV_DECL_ALIGNED(16) buf[4];
|
||||
vst1q_u32(buf, v_sum);
|
||||
d = buf[0] + buf[1] + buf[2] + buf[3];
|
||||
#endif
|
||||
{
|
||||
for( ; j <= n - 4; j += 4 )
|
||||
{
|
||||
d += std::abs(a[j] - b[j]) + std::abs(a[j+1] - b[j+1]) +
|
||||
std::abs(a[j+2] - b[j+2]) + std::abs(a[j+3] - b[j+3]);
|
||||
}
|
||||
}
|
||||
for( ; j < n; j++ )
|
||||
d += std::abs(a[j] - b[j]);
|
||||
return d;
|
||||
}
|
||||
|
||||
}} //cv::hal
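A short usage sketch (not from the patch) of the relocated Hamming-norm helpers; descriptor length and fill values are made up:

    #include <cstring>
    #include "opencv2/core/hal/hal.hpp"   // assumed location of the declarations

    int demo_normHamming()
    {
        uchar d1[32], d2[32];                        // e.g. two 256-bit binary descriptors
        std::memset(d1, 0xAA, sizeof(d1));
        std::memset(d2, 0xFF, sizeof(d2));
        // XOR + popcount over all 256 bits; 0xAA ^ 0xFF == 0x55 -> 4 bits per byte.
        // The cellSize overload (e.g. cellSize == 2) counts non-zero 2-bit cells instead.
        return cv::hal::normHamming(d1, d2, 32);     // == 32 * 4 == 128
    }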
|
||||
|
@@ -86,6 +86,45 @@ Mutex* __initialization_mutex_initializer = &getInitializationMutex();
|
||||
#undef max
|
||||
#undef abs
|
||||
#include <tchar.h>
|
||||
#if defined _MSC_VER
|
||||
#if _MSC_VER >= 1400
|
||||
#include <intrin.h>
|
||||
#elif defined _M_IX86
|
||||
static void __cpuid(int* cpuid_data, int)
|
||||
{
|
||||
__asm
|
||||
{
|
||||
push ebx
|
||||
push edi
|
||||
mov edi, cpuid_data
|
||||
mov eax, 1
|
||||
cpuid
|
||||
mov [edi], eax
|
||||
mov [edi + 4], ebx
|
||||
mov [edi + 8], ecx
|
||||
mov [edi + 12], edx
|
||||
pop edi
|
||||
pop ebx
|
||||
}
|
||||
}
|
||||
static void __cpuidex(int* cpuid_data, int, int)
|
||||
{
|
||||
__asm
|
||||
{
|
||||
push edi
|
||||
mov edi, cpuid_data
|
||||
mov eax, 7
|
||||
mov ecx, 0
|
||||
cpuid
|
||||
mov [edi], eax
|
||||
mov [edi + 4], ebx
|
||||
mov [edi + 8], ecx
|
||||
mov [edi + 12], edx
|
||||
pop edi
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#ifdef WINRT
|
||||
#include <wrl/client.h>
|
||||
@@ -198,15 +237,154 @@ void Exception::formatMessage()
|
||||
msg = format("%s:%d: error: (%d) %s\n", file.c_str(), line, code, err.c_str());
|
||||
}
|
||||
|
||||
struct HWFeatures
|
||||
{
|
||||
enum { MAX_FEATURE = CV_HARDWARE_MAX_FEATURE };
|
||||
|
||||
HWFeatures(void)
|
||||
{
|
||||
memset( have, 0, sizeof(have) );
|
||||
x86_family = 0;
|
||||
}
|
||||
|
||||
static HWFeatures initialize(void)
|
||||
{
|
||||
HWFeatures f;
|
||||
int cpuid_data[4] = { 0, 0, 0, 0 };
|
||||
|
||||
#if defined _MSC_VER && (defined _M_IX86 || defined _M_X64)
|
||||
__cpuid(cpuid_data, 1);
|
||||
#elif defined __GNUC__ && (defined __i386__ || defined __x86_64__)
|
||||
#ifdef __x86_64__
|
||||
asm __volatile__
|
||||
(
|
||||
"movl $1, %%eax\n\t"
|
||||
"cpuid\n\t"
|
||||
:[eax]"=a"(cpuid_data[0]),[ebx]"=b"(cpuid_data[1]),[ecx]"=c"(cpuid_data[2]),[edx]"=d"(cpuid_data[3])
|
||||
:
|
||||
: "cc"
|
||||
);
|
||||
#else
|
||||
asm volatile
|
||||
(
|
||||
"pushl %%ebx\n\t"
|
||||
"movl $1,%%eax\n\t"
|
||||
"cpuid\n\t"
|
||||
"popl %%ebx\n\t"
|
||||
: "=a"(cpuid_data[0]), "=c"(cpuid_data[2]), "=d"(cpuid_data[3])
|
||||
:
|
||||
: "cc"
|
||||
);
|
||||
#endif
|
||||
#endif
|
||||
|
||||
f.x86_family = (cpuid_data[0] >> 8) & 15;
|
||||
if( f.x86_family >= 6 )
|
||||
{
|
||||
f.have[CV_CPU_MMX] = (cpuid_data[3] & (1 << 23)) != 0;
|
||||
f.have[CV_CPU_SSE] = (cpuid_data[3] & (1<<25)) != 0;
|
||||
f.have[CV_CPU_SSE2] = (cpuid_data[3] & (1<<26)) != 0;
|
||||
f.have[CV_CPU_SSE3] = (cpuid_data[2] & (1<<0)) != 0;
|
||||
f.have[CV_CPU_SSSE3] = (cpuid_data[2] & (1<<9)) != 0;
|
||||
f.have[CV_CPU_FMA3] = (cpuid_data[2] & (1<<12)) != 0;
|
||||
f.have[CV_CPU_SSE4_1] = (cpuid_data[2] & (1<<19)) != 0;
|
||||
f.have[CV_CPU_SSE4_2] = (cpuid_data[2] & (1<<20)) != 0;
|
||||
f.have[CV_CPU_POPCNT] = (cpuid_data[2] & (1<<23)) != 0;
|
||||
f.have[CV_CPU_AVX] = (((cpuid_data[2] & (1<<28)) != 0)&&((cpuid_data[2] & (1<<27)) != 0)); // the OS uses XSAVE/XRSTOR and the CPU supports AVX
|
||||
|
||||
// make the second call to the cpuid command in order to get
|
||||
// information about extended features like AVX2
|
||||
#if defined _MSC_VER && (defined _M_IX86 || defined _M_X64)
|
||||
__cpuidex(cpuid_data, 7, 0);
|
||||
#elif defined __GNUC__ && (defined __i386__ || defined __x86_64__)
|
||||
#ifdef __x86_64__
|
||||
asm __volatile__
|
||||
(
|
||||
"movl $7, %%eax\n\t"
|
||||
"movl $0, %%ecx\n\t"
|
||||
"cpuid\n\t"
|
||||
:[eax]"=a"(cpuid_data[0]),[ebx]"=b"(cpuid_data[1]),[ecx]"=c"(cpuid_data[2]),[edx]"=d"(cpuid_data[3])
|
||||
:
|
||||
: "cc"
|
||||
);
|
||||
#else
|
||||
asm volatile
|
||||
(
|
||||
"pushl %%ebx\n\t"
|
||||
"movl $7,%%eax\n\t"
|
||||
"movl $0,%%ecx\n\t"
|
||||
"cpuid\n\t"
|
||||
"movl %%ebx, %0\n\t"
|
||||
"popl %%ebx\n\t"
|
||||
: "=r"(cpuid_data[1]), "=c"(cpuid_data[2])
|
||||
:
|
||||
: "cc"
|
||||
);
|
||||
#endif
|
||||
#endif
|
||||
f.have[CV_CPU_AVX2] = (cpuid_data[1] & (1<<5)) != 0;
|
||||
|
||||
f.have[CV_CPU_AVX_512F] = (cpuid_data[1] & (1<<16)) != 0;
|
||||
f.have[CV_CPU_AVX_512DQ] = (cpuid_data[1] & (1<<17)) != 0;
|
||||
f.have[CV_CPU_AVX_512IFMA512] = (cpuid_data[1] & (1<<21)) != 0;
|
||||
f.have[CV_CPU_AVX_512PF] = (cpuid_data[1] & (1<<26)) != 0;
|
||||
f.have[CV_CPU_AVX_512ER] = (cpuid_data[1] & (1<<27)) != 0;
|
||||
f.have[CV_CPU_AVX_512CD] = (cpuid_data[1] & (1<<28)) != 0;
|
||||
f.have[CV_CPU_AVX_512BW] = (cpuid_data[1] & (1<<30)) != 0;
|
||||
f.have[CV_CPU_AVX_512VL] = (cpuid_data[1] & (1<<31)) != 0;
|
||||
f.have[CV_CPU_AVX_512VBMI] = (cpuid_data[2] & (1<<1)) != 0;
|
||||
}
|
||||
|
||||
#if defined ANDROID || defined __linux__
|
||||
#ifdef __aarch64__
|
||||
f.have[CV_CPU_NEON] = true;
|
||||
#else
|
||||
int cpufile = open("/proc/self/auxv", O_RDONLY);
|
||||
|
||||
if (cpufile >= 0)
|
||||
{
|
||||
Elf32_auxv_t auxv;
|
||||
const size_t size_auxv_t = sizeof(auxv);
|
||||
|
||||
while ((size_t)read(cpufile, &auxv, size_auxv_t) == size_auxv_t)
|
||||
{
|
||||
if (auxv.a_type == AT_HWCAP)
|
||||
{
|
||||
f.have[CV_CPU_NEON] = (auxv.a_un.a_val & 4096) != 0;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
close(cpufile);
|
||||
}
|
||||
#endif
|
||||
#elif (defined __clang__ || defined __APPLE__) && (defined __ARM_NEON__ || (defined __ARM_NEON && defined __aarch64__))
|
||||
f.have[CV_CPU_NEON] = true;
|
||||
#endif
|
||||
|
||||
return f;
|
||||
}
|
||||
|
||||
int x86_family;
|
||||
bool have[MAX_FEATURE+1];
|
||||
};
|
||||
|
||||
static HWFeatures featuresEnabled = HWFeatures::initialize(), featuresDisabled = HWFeatures();
|
||||
static HWFeatures* currentFeatures = &featuresEnabled;
|
||||
|
||||
bool checkHardwareSupport(int feature)
|
||||
{
|
||||
CV_DbgAssert( 0 <= feature && feature <= CV_HARDWARE_MAX_FEATURE );
|
||||
return cv::hal::checkHardwareSupport(feature);
|
||||
return currentFeatures->have[feature];
|
||||
}
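Sketch (not from the patch): the public wrapper keeps its behaviour, so callers such as the SSE2 kernels above can still gate on runtime CPU features:

    #include "opencv2/core.hpp"

    bool canUseSse2Kernels()
    {
        return cv::checkHardwareSupport(CV_CPU_SSE2);   // e.g. what VSplit2<uchar>::support checks
    }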
|
||||
|
||||
|
||||
volatile bool useOptimizedFlag = true;
|
||||
|
||||
void setUseOptimized( bool flag )
|
||||
{
|
||||
cv::hal::setUseOptimized(flag);
|
||||
useOptimizedFlag = flag;
|
||||
currentFeatures = flag ? &featuresEnabled : &featuresDisabled;
|
||||
|
||||
ipp::setUseIPP(flag);
|
||||
#ifdef HAVE_OPENCL
|
||||
@@ -219,7 +397,7 @@ void setUseOptimized( bool flag )
|
||||
|
||||
bool useOptimized(void)
|
||||
{
|
||||
return cv::hal::useOptimized();
|
||||
return useOptimizedFlag;
|
||||
}
|
||||
|
||||
int64 getTickCount(void)
|
||||
@@ -499,12 +677,12 @@ redirectError( CvErrorCallback errCallback, void* userdata, void** prevUserdata)
|
||||
CV_IMPL int cvCheckHardwareSupport(int feature)
|
||||
{
|
||||
CV_DbgAssert( 0 <= feature && feature <= CV_HARDWARE_MAX_FEATURE );
|
||||
return cv::hal::checkHardwareSupport(feature);
|
||||
return cv::currentFeatures->have[feature];
|
||||
}
|
||||
|
||||
CV_IMPL int cvUseOptimized( int flag )
|
||||
{
|
||||
int prevMode = cv::useOptimized();
|
||||
int prevMode = cv::useOptimizedFlag;
|
||||
cv::setUseOptimized( flag != 0 );
|
||||
return prevMode;
|
||||
}
|
||||
|
@@ -40,7 +40,6 @@
|
||||
//M*/
|
||||
|
||||
#include "test_precomp.hpp"
|
||||
#include "opencv2/hal.hpp"
|
||||
|
||||
using namespace cv;
|
||||
|
||||
@@ -72,21 +71,21 @@ TEST(Core_HAL, mathfuncs)
|
||||
{
|
||||
case HAL_EXP:
|
||||
if( depth == CV_32F )
|
||||
hal::exp(src.ptr<float>(), dst.ptr<float>(), n);
|
||||
hal::exp32f(src.ptr<float>(), dst.ptr<float>(), n);
|
||||
else
|
||||
hal::exp(src.ptr<double>(), dst.ptr<double>(), n);
|
||||
hal::exp64f(src.ptr<double>(), dst.ptr<double>(), n);
|
||||
break;
|
||||
case HAL_LOG:
|
||||
if( depth == CV_32F )
|
||||
hal::log(src.ptr<float>(), dst.ptr<float>(), n);
|
||||
hal::log32f(src.ptr<float>(), dst.ptr<float>(), n);
|
||||
else
|
||||
hal::log(src.ptr<double>(), dst.ptr<double>(), n);
|
||||
hal::log64f(src.ptr<double>(), dst.ptr<double>(), n);
|
||||
break;
|
||||
case HAL_SQRT:
|
||||
if( depth == CV_32F )
|
||||
hal::sqrt(src.ptr<float>(), dst.ptr<float>(), n);
|
||||
hal::sqrt32f(src.ptr<float>(), dst.ptr<float>(), n);
|
||||
else
|
||||
hal::sqrt(src.ptr<double>(), dst.ptr<double>(), n);
|
||||
hal::sqrt64f(src.ptr<double>(), dst.ptr<double>(), n);
|
||||
break;
|
||||
default:
|
||||
CV_Error(Error::StsBadArg, "unknown function");
|
||||
@@ -159,15 +158,15 @@ TEST(Core_HAL, mat_decomp)
|
||||
{
|
||||
case HAL_LU:
|
||||
if( depth == CV_32F )
|
||||
hal::LU(a.ptr<float>(), a.step, size, x.ptr<float>(), x.step, 1);
|
||||
hal::LU32f(a.ptr<float>(), a.step, size, x.ptr<float>(), x.step, 1);
|
||||
else
|
||||
hal::LU(a.ptr<double>(), a.step, size, x.ptr<double>(), x.step, 1);
|
||||
hal::LU64f(a.ptr<double>(), a.step, size, x.ptr<double>(), x.step, 1);
|
||||
break;
|
||||
case HAL_CHOL:
|
||||
if( depth == CV_32F )
|
||||
hal::Cholesky(a.ptr<float>(), a.step, size, x.ptr<float>(), x.step, 1);
|
||||
hal::Cholesky32f(a.ptr<float>(), a.step, size, x.ptr<float>(), x.step, 1);
|
||||
else
|
||||
hal::Cholesky(a.ptr<double>(), a.step, size, x.ptr<double>(), x.step, 1);
|
||||
hal::Cholesky64f(a.ptr<double>(), a.step, size, x.ptr<double>(), x.step, 1);
|
||||
break;
|
||||
default:
|
||||
CV_Error(Error::StsBadArg, "unknown function");
|
||||
|
modules/core/test/test_intrin.cpp
Normal file
@@ -0,0 +1,864 @@
|
||||
#include "test_intrin_utils.hpp"
|
||||
#include <climits>
|
||||
|
||||
using namespace cv;
|
||||
|
||||
template<typename R> struct TheTest
|
||||
{
|
||||
typedef typename R::lane_type LaneType;
|
||||
|
||||
TheTest & test_loadstore()
|
||||
{
|
||||
AlignedData<R> data;
|
||||
AlignedData<R> out;
|
||||
|
||||
// check if addresses are aligned and unaligned respectively
|
||||
EXPECT_EQ((size_t)0, (size_t)&data.a.d % 16);
|
||||
EXPECT_NE((size_t)0, (size_t)&data.u.d % 16);
|
||||
EXPECT_EQ((size_t)0, (size_t)&out.a.d % 16);
|
||||
EXPECT_NE((size_t)0, (size_t)&out.u.d % 16);
|
||||
|
||||
// check some initialization methods
|
||||
R r1 = data.a;
|
||||
R r2 = v_load(data.u.d);
|
||||
R r3 = v_load_aligned(data.a.d);
|
||||
R r4(r2);
|
||||
EXPECT_EQ(data.a[0], r1.get0());
|
||||
EXPECT_EQ(data.u[0], r2.get0());
|
||||
EXPECT_EQ(data.a[0], r3.get0());
|
||||
EXPECT_EQ(data.u[0], r4.get0());
|
||||
|
||||
// check some store methods
|
||||
out.u.clear();
|
||||
out.a.clear();
|
||||
v_store(out.u.d, r1);
|
||||
v_store_aligned(out.a.d, r2);
|
||||
EXPECT_EQ(data.a, out.a);
|
||||
EXPECT_EQ(data.u, out.u);
|
||||
|
||||
// check more store methods
|
||||
Data<R> d, res(0);
|
||||
R r5 = d;
|
||||
v_store_high(res.mid(), r5);
|
||||
v_store_low(res.d, r5);
|
||||
EXPECT_EQ(d, res);
|
||||
|
||||
// check halves load correctness
|
||||
res.clear();
|
||||
R r6 = v_load_halves(d.d, d.mid());
|
||||
v_store(res.d, r6);
|
||||
EXPECT_EQ(d, res);
|
||||
|
||||
// zero, all
|
||||
Data<R> resZ = RegTrait<R>::zero();
|
||||
Data<R> resV = RegTrait<R>::all(8);
|
||||
for (int i = 0; i < R::nlanes; ++i)
|
||||
{
|
||||
EXPECT_EQ((LaneType)0, resZ[i]);
|
||||
EXPECT_EQ((LaneType)8, resV[i]);
|
||||
}
|
||||
|
||||
// reinterpret_as
|
||||
v_uint8x16 vu8 = v_reinterpret_as_u8(r1); out.a.clear(); v_store((uchar*)out.a.d, vu8); EXPECT_EQ(data.a, out.a);
|
||||
v_int8x16 vs8 = v_reinterpret_as_s8(r1); out.a.clear(); v_store((schar*)out.a.d, vs8); EXPECT_EQ(data.a, out.a);
|
||||
v_uint16x8 vu16 = v_reinterpret_as_u16(r1); out.a.clear(); v_store((ushort*)out.a.d, vu16); EXPECT_EQ(data.a, out.a);
|
||||
v_int16x8 vs16 = v_reinterpret_as_s16(r1); out.a.clear(); v_store((short*)out.a.d, vs16); EXPECT_EQ(data.a, out.a);
|
||||
v_uint32x4 vu32 = v_reinterpret_as_u32(r1); out.a.clear(); v_store((unsigned*)out.a.d, vu32); EXPECT_EQ(data.a, out.a);
|
||||
v_int32x4 vs32 = v_reinterpret_as_s32(r1); out.a.clear(); v_store((int*)out.a.d, vs32); EXPECT_EQ(data.a, out.a);
|
||||
v_uint64x2 vu64 = v_reinterpret_as_u64(r1); out.a.clear(); v_store((uint64*)out.a.d, vu64); EXPECT_EQ(data.a, out.a);
|
||||
v_int64x2 vs64 = v_reinterpret_as_s64(r1); out.a.clear(); v_store((int64*)out.a.d, vs64); EXPECT_EQ(data.a, out.a);
|
||||
v_float32x4 vf32 = v_reinterpret_as_f32(r1); out.a.clear(); v_store((float*)out.a.d, vf32); EXPECT_EQ(data.a, out.a);
|
||||
#if CV_SIMD128_64F
|
||||
v_float64x2 vf64 = v_reinterpret_as_f64(r1); out.a.clear(); v_store((double*)out.a.d, vf64); EXPECT_EQ(data.a, out.a);
|
||||
#endif
|
||||
|
||||
return *this;
|
||||
}
|
||||
|
||||
TheTest & test_interleave()
|
||||
{
|
||||
Data<R> data1, data2, data3, data4;
|
||||
data2 += 20;
|
||||
data3 += 40;
|
||||
data4 += 60;
|
||||
|
||||
|
||||
R a = data1, b = data2, c = data3;
|
||||
R d = data1, e = data2, f = data3, g = data4;
|
||||
|
||||
LaneType buf3[R::nlanes * 3];
|
||||
LaneType buf4[R::nlanes * 4];
|
||||
|
||||
v_store_interleave(buf3, a, b, c);
|
||||
v_store_interleave(buf4, d, e, f, g);
|
||||
|
||||
Data<R> z(0);
|
||||
a = b = c = d = e = f = g = z;
|
||||
|
||||
v_load_deinterleave(buf3, a, b, c);
|
||||
v_load_deinterleave(buf4, d, e, f, g);
|
||||
|
||||
for (int i = 0; i < R::nlanes; ++i)
|
||||
{
|
||||
EXPECT_EQ(data1, Data<R>(a));
|
||||
EXPECT_EQ(data2, Data<R>(b));
|
||||
EXPECT_EQ(data3, Data<R>(c));
|
||||
|
||||
EXPECT_EQ(data1, Data<R>(d));
|
||||
EXPECT_EQ(data2, Data<R>(e));
|
||||
EXPECT_EQ(data3, Data<R>(f));
|
||||
EXPECT_EQ(data4, Data<R>(g));
|
||||
}
|
||||
|
||||
return *this;
|
||||
}
|
||||
|
||||
// v_expand and v_load_expand
|
||||
TheTest & test_expand()
|
||||
{
|
||||
typedef typename RegTrait<R>::w_reg Rx2;
|
||||
Data<R> dataA;
|
||||
R a = dataA;
|
||||
|
||||
Data<Rx2> resB = v_load_expand(dataA.d);
|
||||
|
||||
Rx2 c, d;
|
||||
v_expand(a, c, d);
|
||||
|
||||
Data<Rx2> resC = c, resD = d;
|
||||
const int n = Rx2::nlanes;
|
||||
for (int i = 0; i < n; ++i)
|
||||
{
|
||||
EXPECT_EQ(dataA[i], resB[i]);
|
||||
EXPECT_EQ(dataA[i], resC[i]);
|
||||
EXPECT_EQ(dataA[i + n], resD[i]);
|
||||
}
|
||||
|
||||
return *this;
|
||||
}
|
||||
|
||||
TheTest & test_expand_q()
|
||||
{
|
||||
typedef typename RegTrait<R>::q_reg Rx4;
|
||||
Data<R> data;
|
||||
Data<Rx4> out = v_load_expand_q(data.d);
|
||||
const int n = Rx4::nlanes;
|
||||
for (int i = 0; i < n; ++i)
|
||||
EXPECT_EQ(data[i], out[i]);
|
||||
|
||||
return *this;
|
||||
}
|
||||
|
||||
TheTest & test_addsub()
|
||||
{
|
||||
Data<R> dataA, dataB;
|
||||
dataB.reverse();
|
||||
R a = dataA, b = dataB;
|
||||
|
||||
Data<R> resC = a + b, resD = a - b;
|
||||
for (int i = 0; i < R::nlanes; ++i)
|
||||
{
|
||||
EXPECT_EQ(saturate_cast<LaneType>(dataA[i] + dataB[i]), resC[i]);
|
||||
EXPECT_EQ(saturate_cast<LaneType>(dataA[i] - dataB[i]), resD[i]);
|
||||
}
|
||||
|
||||
return *this;
|
||||
}
|
||||
|
||||
TheTest & test_addsub_wrap()
|
||||
{
|
||||
Data<R> dataA, dataB;
|
||||
dataB.reverse();
|
||||
R a = dataA, b = dataB;
|
||||
|
||||
Data<R> resC = v_add_wrap(a, b),
|
||||
resD = v_sub_wrap(a, b);
|
||||
for (int i = 0; i < R::nlanes; ++i)
|
||||
{
|
||||
EXPECT_EQ((LaneType)(dataA[i] + dataB[i]), resC[i]);
|
||||
EXPECT_EQ((LaneType)(dataA[i] - dataB[i]), resD[i]);
|
||||
}
|
||||
return *this;
|
||||
}
|
||||
|
||||
TheTest & test_mul()
|
||||
{
|
||||
Data<R> dataA, dataB;
|
||||
dataB.reverse();
|
||||
R a = dataA, b = dataB;
|
||||
|
||||
Data<R> resC = a * b;
|
||||
for (int i = 0; i < R::nlanes; ++i)
|
||||
{
|
||||
EXPECT_EQ(dataA[i] * dataB[i], resC[i]);
|
||||
}
|
||||
|
||||
return *this;
|
||||
}
|
||||
|
||||
TheTest & test_div()
|
||||
{
|
||||
Data<R> dataA, dataB;
|
||||
dataB.reverse();
|
||||
R a = dataA, b = dataB;
|
||||
|
||||
Data<R> resC = a / b;
|
||||
for (int i = 0; i < R::nlanes; ++i)
|
||||
{
|
||||
EXPECT_EQ(dataA[i] / dataB[i], resC[i]);
|
||||
}
|
||||
|
||||
return *this;
|
||||
}
|
||||
|
||||
TheTest & test_mul_expand()
|
||||
{
|
||||
typedef typename RegTrait<R>::w_reg Rx2;
|
||||
Data<R> dataA, dataB(2);
|
||||
R a = dataA, b = dataB;
|
||||
Rx2 c, d;
|
||||
|
||||
v_mul_expand(a, b, c, d);
|
||||
|
||||
Data<Rx2> resC = c, resD = d;
|
||||
const int n = R::nlanes / 2;
|
||||
for (int i = 0; i < n; ++i)
|
||||
{
|
||||
EXPECT_EQ((typename Rx2::lane_type)dataA[i] * dataB[i], resC[i]);
|
||||
EXPECT_EQ((typename Rx2::lane_type)dataA[i + n] * dataB[i + n], resD[i]);
|
||||
}
|
||||
|
||||
return *this;
|
||||
}
|
||||
|
||||
template <int s>
|
||||
TheTest & test_shift()
|
||||
{
|
||||
Data<R> dataA;
|
||||
R a = dataA;
|
||||
|
||||
Data<R> resB = a << s, resC = v_shl<s>(a), resD = a >> s, resE = v_shr<s>(a);
|
||||
for (int i = 0; i < R::nlanes; ++i)
|
||||
{
|
||||
EXPECT_EQ(dataA[i] << s, resB[i]);
|
||||
EXPECT_EQ(dataA[i] << s, resC[i]);
|
||||
EXPECT_EQ(dataA[i] >> s, resD[i]);
|
||||
EXPECT_EQ(dataA[i] >> s, resE[i]);
|
||||
}
|
||||
return *this;
|
||||
}
|
||||
|
||||
TheTest & test_cmp()
|
||||
{
|
||||
Data<R> dataA, dataB;
|
||||
dataB.reverse();
|
||||
dataB += 1;
|
||||
R a = dataA, b = dataB;
|
||||
|
||||
Data<R> resC = (a == b);
|
||||
Data<R> resD = (a != b);
|
||||
Data<R> resE = (a > b);
|
||||
Data<R> resF = (a >= b);
|
||||
Data<R> resG = (a < b);
|
||||
Data<R> resH = (a <= b);
|
||||
|
||||
for (int i = 0; i < R::nlanes; ++i)
|
||||
{
|
||||
EXPECT_EQ(dataA[i] == dataB[i], resC[i] != 0);
|
||||
EXPECT_EQ(dataA[i] != dataB[i], resD[i] != 0);
|
||||
EXPECT_EQ(dataA[i] > dataB[i], resE[i] != 0);
|
||||
EXPECT_EQ(dataA[i] >= dataB[i], resF[i] != 0);
|
||||
EXPECT_EQ(dataA[i] < dataB[i], resG[i] != 0);
|
||||
EXPECT_EQ(dataA[i] <= dataB[i], resH[i] != 0);
|
||||
}
|
||||
return *this;
|
||||
}
|
||||
|
||||
TheTest & test_dot_prod()
|
||||
{
|
||||
typedef typename RegTrait<R>::w_reg Rx2;
|
||||
Data<R> dataA, dataB(2);
|
||||
R a = dataA, b = dataB;
|
||||
|
||||
Data<Rx2> res = v_dotprod(a, b);
|
||||
|
||||
const int n = R::nlanes / 2;
|
||||
for (int i = 0; i < n; ++i)
|
||||
{
|
||||
EXPECT_EQ(dataA[i*2] * dataB[i*2] + dataA[i*2 + 1] * dataB[i*2 + 1], res[i]);
|
||||
}
|
||||
return *this;
|
||||
}
|
||||
|
||||
TheTest & test_logic()
|
||||
{
|
||||
Data<R> dataA, dataB(2);
|
||||
R a = dataA, b = dataB;
|
||||
|
||||
Data<R> resC = a & b, resD = a | b, resE = a ^ b, resF = ~a;
|
||||
for (int i = 0; i < R::nlanes; ++i)
|
||||
{
|
||||
EXPECT_EQ(dataA[i] & dataB[i], resC[i]);
|
||||
EXPECT_EQ(dataA[i] | dataB[i], resD[i]);
|
||||
EXPECT_EQ(dataA[i] ^ dataB[i], resE[i]);
|
||||
EXPECT_EQ((LaneType)~dataA[i], resF[i]);
|
||||
}
|
||||
|
||||
return *this;
|
||||
}
|
||||
|
||||
TheTest & test_sqrt_abs()
|
||||
{
|
||||
Data<R> dataA, dataD;
|
||||
dataD *= -1.0;
|
||||
R a = dataA, d = dataD;
|
||||
|
||||
Data<R> resB = v_sqrt(a), resC = v_invsqrt(a), resE = v_abs(d);
|
||||
for (int i = 0; i < R::nlanes; ++i)
|
||||
{
|
||||
EXPECT_FLOAT_EQ((float)std::sqrt(dataA[i]), (float)resB[i]);
|
||||
EXPECT_FLOAT_EQ(1/(float)std::sqrt(dataA[i]), (float)resC[i]);
|
||||
EXPECT_FLOAT_EQ((float)abs(dataA[i]), (float)resE[i]);
|
||||
}
|
||||
|
||||
return *this;
|
||||
}
|
||||
|
||||
TheTest & test_min_max()
|
||||
{
|
||||
Data<R> dataA, dataB;
|
||||
dataB.reverse();
|
||||
R a = dataA, b = dataB;
|
||||
|
||||
Data<R> resC = v_min(a, b), resD = v_max(a, b);
|
||||
for (int i = 0; i < R::nlanes; ++i)
|
||||
{
|
||||
EXPECT_EQ(std::min(dataA[i], dataB[i]), resC[i]);
|
||||
EXPECT_EQ(std::max(dataA[i], dataB[i]), resD[i]);
|
||||
}
|
||||
|
||||
return *this;
|
||||
}
|
||||
|
||||
TheTest & test_absdiff()
|
||||
{
|
||||
typedef typename RegTrait<R>::u_reg Ru;
|
||||
typedef typename Ru::lane_type u_type;
|
||||
Data<R> dataA(std::numeric_limits<LaneType>::max()),
|
||||
dataB(std::numeric_limits<LaneType>::min());
|
||||
dataA[0] = (LaneType)-1;
|
||||
dataB[0] = 1;
|
||||
dataA[1] = 2;
|
||||
dataB[1] = (LaneType)-2;
|
||||
R a = dataA, b = dataB;
|
||||
Data<Ru> resC = v_absdiff(a, b);
|
||||
const u_type mask = std::numeric_limits<LaneType>::is_signed ? (u_type)(1 << (sizeof(u_type)*8 - 1)) : 0;
|
||||
for (int i = 0; i < Ru::nlanes; ++i)
|
||||
{
|
||||
u_type uA = dataA[i] ^ mask;
|
||||
u_type uB = dataB[i] ^ mask;
|
||||
EXPECT_EQ(uA > uB ? uA - uB : uB - uA, resC[i]);
|
||||
}
|
||||
return *this;
|
||||
}
|
||||
|
||||
TheTest & test_float_absdiff()
|
||||
{
|
||||
Data<R> dataA(std::numeric_limits<LaneType>::max()),
|
||||
dataB(std::numeric_limits<LaneType>::min());
|
||||
dataA[0] = -1;
|
||||
dataB[0] = 1;
|
||||
dataA[1] = 2;
|
||||
dataB[1] = -2;
|
||||
R a = dataA, b = dataB;
|
||||
Data<R> resC = v_absdiff(a, b);
|
||||
for (int i = 0; i < R::nlanes; ++i)
|
||||
{
|
||||
EXPECT_EQ(dataA[i] > dataB[i] ? dataA[i] - dataB[i] : dataB[i] - dataA[i], resC[i]);
|
||||
}
|
||||
return *this;
|
||||
}
|
||||
|
||||
TheTest & test_reduce()
|
||||
{
|
||||
Data<R> dataA;
|
||||
R a = dataA;
|
||||
EXPECT_EQ((LaneType)1, v_reduce_min(a));
|
||||
EXPECT_EQ((LaneType)R::nlanes, v_reduce_max(a));
|
||||
EXPECT_EQ((LaneType)(1 + R::nlanes)*2, v_reduce_sum(a));
|
||||
return *this;
|
||||
}
|
||||
|
||||
TheTest & test_mask()
|
||||
{
|
||||
Data<R> dataA, dataB, dataC, dataD(1), dataE(2);
|
||||
dataA[1] *= (LaneType)-1;
|
||||
dataC *= (LaneType)-1;
|
||||
R a = dataA, b = dataB, c = dataC, d = dataD, e = dataE;
|
||||
|
||||
int m = v_signmask(a);
|
||||
EXPECT_EQ(2, m);
|
||||
|
||||
EXPECT_EQ(false, v_check_all(a));
|
||||
EXPECT_EQ(false, v_check_all(b));
|
||||
EXPECT_EQ(true, v_check_all(c));
|
||||
|
||||
EXPECT_EQ(true, v_check_any(a));
|
||||
EXPECT_EQ(false, v_check_any(b));
|
||||
EXPECT_EQ(true, v_check_any(c));
|
||||
|
||||
typedef V_TypeTraits<LaneType> Traits;
|
||||
typedef typename Traits::int_type int_type;
|
||||
|
||||
R f = v_select(b, d, e);
|
||||
Data<R> resF = f;
|
||||
for (int i = 0; i < R::nlanes; ++i)
|
||||
{
|
||||
int_type m2 = Traits::reinterpret_int(dataB[i]);
|
||||
EXPECT_EQ((Traits::reinterpret_int(dataD[i]) & m2)
|
||||
| (Traits::reinterpret_int(dataE[i]) & ~m2),
|
||||
Traits::reinterpret_int(resF[i]));
|
||||
}
|
||||
|
||||
return *this;
|
||||
}
|
||||
|
||||
template <int s>
|
||||
TheTest & test_pack()
|
||||
{
|
||||
typedef typename RegTrait<R>::w_reg Rx2;
|
||||
typedef typename Rx2::lane_type w_type;
|
||||
Data<Rx2> dataA, dataB;
|
||||
dataA += std::numeric_limits<LaneType>::is_signed ? -10 : 10;
|
||||
dataB *= 10;
|
||||
Rx2 a = dataA, b = dataB;
|
||||
|
||||
Data<R> resC = v_pack(a, b);
|
||||
Data<R> resD = v_rshr_pack<s>(a, b);
|
||||
|
||||
Data<R> resE(0);
|
||||
v_pack_store(resE.d, b);
|
||||
|
||||
Data<R> resF(0);
|
||||
v_rshr_pack_store<s>(resF.d, b);
|
||||
|
||||
const int n = Rx2::nlanes;
|
||||
const w_type add = (w_type)1 << (s - 1);
|
||||
for (int i = 0; i < n; ++i)
|
||||
{
|
||||
EXPECT_EQ(saturate_cast<LaneType>(dataA[i]), resC[i]);
|
||||
EXPECT_EQ(saturate_cast<LaneType>(dataB[i]), resC[i + n]);
|
||||
EXPECT_EQ(saturate_cast<LaneType>((dataA[i] + add) >> s), resD[i]);
|
||||
EXPECT_EQ(saturate_cast<LaneType>((dataB[i] + add) >> s), resD[i + n]);
|
||||
EXPECT_EQ(saturate_cast<LaneType>(dataB[i]), resE[i]);
|
||||
EXPECT_EQ((LaneType)0, resE[i + n]);
|
||||
EXPECT_EQ(saturate_cast<LaneType>((dataB[i] + add) >> s), resF[i]);
|
||||
EXPECT_EQ((LaneType)0, resF[i + n]);
|
||||
}
|
||||
return *this;
|
||||
}
|
||||
|
||||
template <int s>
|
||||
TheTest & test_pack_u()
|
||||
{
|
||||
typedef typename RegTrait<R>::w_reg Rx2;
|
||||
typedef typename RegTrait<Rx2>::int_reg Ri2;
|
||||
typedef typename Ri2::lane_type w_type;
|
||||
|
||||
Data<Ri2> dataA, dataB;
|
||||
dataA += -10;
|
||||
dataB *= 10;
|
||||
Ri2 a = dataA, b = dataB;
|
||||
|
||||
Data<R> resC = v_pack_u(a, b);
|
||||
Data<R> resD = v_rshr_pack_u<s>(a, b);
|
||||
|
||||
Data<R> resE(0);
|
||||
v_pack_u_store(resE.d, b);
|
||||
|
||||
Data<R> resF(0);
|
||||
v_rshr_pack_u_store<s>(resF.d, b);
|
||||
|
||||
const int n = Ri2::nlanes;
|
||||
const w_type add = (w_type)1 << (s - 1);
|
||||
for (int i = 0; i < n; ++i)
|
||||
{
|
||||
EXPECT_EQ(saturate_cast<LaneType>(dataA[i]), resC[i]);
|
||||
EXPECT_EQ(saturate_cast<LaneType>(dataB[i]), resC[i + n]);
|
||||
EXPECT_EQ(saturate_cast<LaneType>((dataA[i] + add) >> s), resD[i]);
|
||||
EXPECT_EQ(saturate_cast<LaneType>((dataB[i] + add) >> s), resD[i + n]);
|
||||
EXPECT_EQ(saturate_cast<LaneType>(dataB[i]), resE[i]);
|
||||
EXPECT_EQ((LaneType)0, resE[i + n]);
|
||||
EXPECT_EQ(saturate_cast<LaneType>((dataB[i] + add) >> s), resF[i]);
|
||||
EXPECT_EQ((LaneType)0, resF[i + n]);
|
||||
}
|
||||
return *this;
|
||||
}
|
||||
|
||||
TheTest & test_unpack()
|
||||
{
|
||||
Data<R> dataA, dataB;
|
||||
dataB *= 10;
|
||||
R a = dataA, b = dataB;
|
||||
|
||||
R c, d, e, f, lo, hi;
|
||||
v_zip(a, b, c, d);
|
||||
v_recombine(a, b, e, f);
|
||||
lo = v_combine_low(a, b);
|
||||
hi = v_combine_high(a, b);
|
||||
|
||||
Data<R> resC = c, resD = d, resE = e, resF = f, resLo = lo, resHi = hi;
|
||||
|
||||
const int n = R::nlanes/2;
|
||||
for (int i = 0; i < n; ++i)
|
||||
{
|
||||
EXPECT_EQ(dataA[i], resC[i*2]);
|
||||
EXPECT_EQ(dataB[i], resC[i*2+1]);
|
||||
EXPECT_EQ(dataA[i+n], resD[i*2]);
|
||||
EXPECT_EQ(dataB[i+n], resD[i*2+1]);
|
||||
|
||||
EXPECT_EQ(dataA[i], resE[i]);
|
||||
EXPECT_EQ(dataB[i], resE[i+n]);
|
||||
EXPECT_EQ(dataA[i+n], resF[i]);
|
||||
EXPECT_EQ(dataB[i+n], resF[i+n]);
|
||||
|
||||
EXPECT_EQ(dataA[i], resLo[i]);
|
||||
EXPECT_EQ(dataB[i], resLo[i+n]);
|
||||
EXPECT_EQ(dataA[i+n], resHi[i]);
|
||||
EXPECT_EQ(dataB[i+n], resHi[i+n]);
|
||||
}
|
||||
|
||||
return *this;
|
||||
}
|
||||
|
||||
template<int s>
|
||||
TheTest & test_extract()
|
||||
{
|
||||
Data<R> dataA, dataB;
|
||||
dataB *= 10;
|
||||
R a = dataA, b = dataB;
|
||||
|
||||
Data<R> resC = v_extract<s>(a, b);
|
||||
|
||||
for (int i = 0; i < R::nlanes; ++i)
|
||||
{
|
||||
if (i + s >= R::nlanes)
|
||||
EXPECT_EQ(dataB[i - R::nlanes + s], resC[i]);
|
||||
else
|
||||
EXPECT_EQ(dataA[i + s], resC[i]);
|
||||
}
|
||||
|
||||
return *this;
|
||||
}
|
||||
|
||||
TheTest & test_float_math()
|
||||
{
|
||||
typedef typename RegTrait<R>::int_reg Ri;
|
||||
Data<R> data1, data2, data3;
|
||||
data1 *= 1.1;
|
||||
data2 += 10;
|
||||
R a1 = data1, a2 = data2, a3 = data3;
|
||||
|
||||
Data<Ri> resB = v_round(a1),
|
||||
resC = v_trunc(a1),
|
||||
resD = v_floor(a1),
|
||||
resE = v_ceil(a1);
|
||||
|
||||
Data<R> resF = v_magnitude(a1, a2),
|
||||
resG = v_sqr_magnitude(a1, a2),
|
||||
resH = v_muladd(a1, a2, a3);
|
||||
|
||||
for (int i = 0; i < R::nlanes; ++i)
|
||||
{
|
||||
EXPECT_EQ(cvRound(data1[i]), resB[i]);
|
||||
EXPECT_EQ((typename Ri::lane_type)data1[i], resC[i]);
|
||||
EXPECT_EQ(cvFloor(data1[i]), resD[i]);
|
||||
EXPECT_EQ(cvCeil(data1[i]), resE[i]);
|
||||
|
||||
EXPECT_DOUBLE_EQ(std::sqrt(data1[i]*data1[i] + data2[i]*data2[i]), resF[i]);
|
||||
EXPECT_DOUBLE_EQ(data1[i]*data1[i] + data2[i]*data2[i], resG[i]);
|
||||
EXPECT_DOUBLE_EQ(data1[i]*data2[i] + data3[i], resH[i]);
|
||||
}
|
||||
|
||||
return *this;
|
||||
}
|
||||
|
||||
TheTest & test_float_cvt32()
|
||||
{
|
||||
typedef v_float32x4 Rt;
|
||||
Data<R> dataA;
|
||||
dataA *= 1.1;
|
||||
R a = dataA;
|
||||
Rt b = v_cvt_f32(a);
|
||||
Data<Rt> resB = b;
|
||||
int n = std::min<int>(Rt::nlanes, R::nlanes);
|
||||
for (int i = 0; i < n; ++i)
|
||||
{
|
||||
EXPECT_EQ((typename Rt::lane_type)dataA[i], resB[i]);
|
||||
}
|
||||
return *this;
|
||||
}
|
||||
|
||||
TheTest & test_float_cvt64()
|
||||
{
|
||||
#if CV_SIMD128_64F
|
||||
typedef v_float64x2 Rt;
|
||||
Data<R> dataA;
|
||||
dataA *= 1.1;
|
||||
R a = dataA;
|
||||
Rt b = v_cvt_f64(a);
|
||||
Data<Rt> resB = b;
|
||||
int n = std::min<int>(Rt::nlanes, R::nlanes);
|
||||
for (int i = 0; i < n; ++i)
|
||||
{
|
||||
EXPECT_EQ((typename Rt::lane_type)dataA[i], resB[i]);
|
||||
}
|
||||
#endif
|
||||
return *this;
|
||||
}
|
||||
|
||||
TheTest & test_matmul()
|
||||
{
|
||||
Data<R> dataV, dataA, dataB, dataC, dataD;
|
||||
dataB.reverse();
|
||||
dataC += 2;
|
||||
dataD *= 0.3;
|
||||
R v = dataV, a = dataA, b = dataB, c = dataC, d = dataD;
|
||||
|
||||
Data<R> res = v_matmul(v, a, b, c, d);
|
||||
for (int i = 0; i < R::nlanes; ++i)
|
||||
{
|
||||
LaneType val = dataV[0] * dataA[i]
|
||||
+ dataV[1] * dataB[i]
|
||||
+ dataV[2] * dataC[i]
|
||||
+ dataV[3] * dataD[i];
|
||||
EXPECT_DOUBLE_EQ(val, res[i]);
|
||||
}
|
||||
return *this;
|
||||
}
|
||||
|
||||
TheTest & test_transpose()
|
||||
{
|
||||
Data<R> dataA, dataB, dataC, dataD;
|
||||
dataB *= 5;
|
||||
dataC *= 10;
|
||||
dataD *= 15;
|
||||
R a = dataA, b = dataB, c = dataC, d = dataD;
|
||||
R e, f, g, h;
|
||||
v_transpose4x4(a, b, c, d,
|
||||
e, f, g, h);
|
||||
|
||||
Data<R> res[4] = {e, f, g, h};
|
||||
for (int i = 0; i < R::nlanes; ++i)
|
||||
{
|
||||
EXPECT_EQ(dataA[i], res[i][0]);
|
||||
EXPECT_EQ(dataB[i], res[i][1]);
|
||||
EXPECT_EQ(dataC[i], res[i][2]);
|
||||
EXPECT_EQ(dataD[i], res[i][3]);
|
||||
}
|
||||
return *this;
|
||||
}
|
||||
|
||||
};
|
||||
|
||||
|
||||
//============= 8-bit integer =====================================================================
|
||||
|
||||
TEST(hal_intrin, uint8x16) {
|
||||
TheTest<v_uint8x16>()
|
||||
.test_loadstore()
|
||||
.test_interleave()
|
||||
.test_expand()
|
||||
.test_expand_q()
|
||||
.test_addsub()
|
||||
.test_addsub_wrap()
|
||||
.test_cmp()
|
||||
.test_logic()
|
||||
.test_min_max()
|
||||
.test_absdiff()
|
||||
.test_mask()
|
||||
.test_pack<1>().test_pack<2>().test_pack<3>().test_pack<8>()
|
||||
.test_pack_u<1>().test_pack_u<2>().test_pack_u<3>().test_pack_u<8>()
|
||||
.test_unpack()
|
||||
.test_extract<0>().test_extract<1>().test_extract<8>().test_extract<15>()
|
||||
;
|
||||
}
|
||||
|
||||
TEST(hal_intrin, int8x16) {
|
||||
TheTest<v_int8x16>()
|
||||
.test_loadstore()
|
||||
.test_interleave()
|
||||
.test_expand()
|
||||
.test_expand_q()
|
||||
.test_addsub()
|
||||
.test_addsub_wrap()
|
||||
.test_cmp()
|
||||
.test_logic()
|
||||
.test_min_max()
|
||||
.test_absdiff()
|
||||
.test_mask()
|
||||
.test_pack<1>().test_pack<2>().test_pack<3>().test_pack<8>()
|
||||
.test_unpack()
|
||||
.test_extract<0>().test_extract<1>().test_extract<8>().test_extract<15>()
|
||||
;
|
||||
}
|
||||
|
||||
//============= 16-bit integer =====================================================================
|
||||
|
||||
TEST(hal_intrin, uint16x8) {
|
||||
TheTest<v_uint16x8>()
|
||||
.test_loadstore()
|
||||
.test_interleave()
|
||||
.test_expand()
|
||||
.test_addsub()
|
||||
.test_addsub_wrap()
|
||||
.test_mul()
|
||||
.test_mul_expand()
|
||||
.test_cmp()
|
||||
.test_shift<1>()
|
||||
.test_shift<8>()
|
||||
.test_logic()
|
||||
.test_min_max()
|
||||
.test_absdiff()
|
||||
.test_mask()
|
||||
.test_pack<1>().test_pack<2>().test_pack<7>().test_pack<16>()
|
||||
.test_pack_u<1>().test_pack_u<2>().test_pack_u<7>().test_pack_u<16>()
|
||||
.test_unpack()
|
||||
.test_extract<0>().test_extract<1>().test_extract<4>().test_extract<7>()
|
||||
;
|
||||
}
|
||||
|
||||
TEST(hal_intrin, int16x8) {
|
||||
TheTest<v_int16x8>()
|
||||
.test_loadstore()
|
||||
.test_interleave()
|
||||
.test_expand()
|
||||
.test_addsub()
|
||||
.test_addsub_wrap()
|
||||
.test_mul()
|
||||
.test_mul_expand()
|
||||
.test_cmp()
|
||||
.test_shift<1>()
|
||||
.test_shift<8>()
|
||||
.test_dot_prod()
|
||||
.test_logic()
|
||||
.test_min_max()
|
||||
.test_absdiff()
|
||||
.test_mask()
|
||||
.test_pack<1>().test_pack<2>().test_pack<7>().test_pack<16>()
|
||||
.test_unpack()
|
||||
.test_extract<0>().test_extract<1>().test_extract<4>().test_extract<7>()
|
||||
;
|
||||
}
|
||||
|
||||
//============= 32-bit integer =====================================================================
|
||||
|
||||
TEST(hal_intrin, uint32x4) {
|
||||
TheTest<v_uint32x4>()
|
||||
.test_loadstore()
|
||||
.test_interleave()
|
||||
.test_expand()
|
||||
.test_addsub()
|
||||
.test_mul()
|
||||
.test_mul_expand()
|
||||
.test_cmp()
|
||||
.test_shift<1>()
|
||||
.test_shift<8>()
|
||||
.test_logic()
|
||||
.test_min_max()
|
||||
.test_absdiff()
|
||||
.test_reduce()
|
||||
.test_mask()
|
||||
.test_pack<1>().test_pack<2>().test_pack<15>().test_pack<32>()
|
||||
.test_unpack()
|
||||
.test_extract<0>().test_extract<1>().test_extract<2>().test_extract<3>()
|
||||
.test_transpose()
|
||||
;
|
||||
}
|
||||
|
||||
TEST(hal_intrin, int32x4) {
|
||||
TheTest<v_int32x4>()
|
||||
.test_loadstore()
|
||||
.test_interleave()
|
||||
.test_expand()
|
||||
.test_addsub()
|
||||
.test_mul()
|
||||
.test_cmp()
|
||||
.test_shift<1>().test_shift<8>()
|
||||
.test_logic()
|
||||
.test_min_max()
|
||||
.test_absdiff()
|
||||
.test_reduce()
|
||||
.test_mask()
|
||||
.test_pack<1>().test_pack<2>().test_pack<15>().test_pack<32>()
|
||||
.test_unpack()
|
||||
.test_extract<0>().test_extract<1>().test_extract<2>().test_extract<3>()
|
||||
.test_float_cvt32()
|
||||
.test_float_cvt64()
|
||||
.test_transpose()
|
||||
;
|
||||
}
|
||||
|
||||
//============= 64-bit integer =====================================================================
|
||||
|
||||
TEST(hal_intrin, uint64x2) {
|
||||
TheTest<v_uint64x2>()
|
||||
.test_loadstore()
|
||||
.test_addsub()
|
||||
.test_shift<1>().test_shift<8>()
|
||||
.test_logic()
|
||||
.test_extract<0>().test_extract<1>()
|
||||
;
|
||||
}
|
||||
|
||||
TEST(hal_intrin, int64x2) {
|
||||
TheTest<v_int64x2>()
|
||||
.test_loadstore()
|
||||
.test_addsub()
|
||||
.test_shift<1>().test_shift<8>()
|
||||
.test_logic()
|
||||
.test_extract<0>().test_extract<1>()
|
||||
;
|
||||
}
|
||||
|
||||
//============= Floating point =====================================================================
|
||||
|
||||
TEST(hal_intrin, float32x4) {
|
||||
TheTest<v_float32x4>()
|
||||
.test_loadstore()
|
||||
.test_interleave()
|
||||
.test_addsub()
|
||||
.test_mul()
|
||||
.test_div()
|
||||
.test_cmp()
|
||||
.test_sqrt_abs()
|
||||
.test_min_max()
|
||||
.test_float_absdiff()
|
||||
.test_reduce()
|
||||
.test_mask()
|
||||
.test_unpack()
|
||||
.test_float_math()
|
||||
.test_float_cvt64()
|
||||
.test_matmul()
|
||||
.test_transpose()
|
||||
;
|
||||
}
|
||||
|
||||
#if CV_SIMD128_64F
|
||||
TEST(hal_intrin, float64x2) {
|
||||
TheTest<v_float64x2>()
|
||||
.test_loadstore()
|
||||
.test_addsub()
|
||||
.test_mul()
|
||||
.test_div()
|
||||
.test_cmp()
|
||||
.test_sqrt_abs()
|
||||
.test_min_max()
|
||||
.test_float_absdiff()
|
||||
.test_mask()
|
||||
.test_unpack()
|
||||
.test_float_math()
|
||||
.test_float_cvt32()
|
||||
;
|
||||
}
|
||||
#endif
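A quick illustration (not from the patch) of the universal intrinsics these tests exercise, here a single 128-bit saturating add over uchar lanes:

    #include "opencv2/core/hal/intrin.hpp"

    void demo_v_add(const uchar* a, const uchar* b, uchar* dst)
    {
        // a, b and dst must each hold at least 16 elements
        cv::v_uint8x16 va = cv::v_load(a);   // 16 lanes, unaligned load
        cv::v_uint8x16 vb = cv::v_load(b);
        cv::v_store(dst, va + vb);           // per-lane saturating addition
    }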
|
modules/core/test/test_intrin_utils.hpp
Normal file
@@ -0,0 +1,234 @@
#ifndef _TEST_UTILS_HPP_
#define _TEST_UTILS_HPP_

#include "opencv2/core/hal/intrin.hpp"
#include "opencv2/ts.hpp"
#include <ostream>
#include <algorithm>

template <typename R> struct Data;
template <int N> struct initializer;

template <> struct initializer<16>
{
    template <typename R> static R init(const Data<R> & d)
    {
        return R(d[0], d[1], d[2], d[3], d[4], d[5], d[6], d[7], d[8], d[9], d[10], d[11], d[12], d[13], d[14], d[15]);
    }
};

template <> struct initializer<8>
{
    template <typename R> static R init(const Data<R> & d)
    {
        return R(d[0], d[1], d[2], d[3], d[4], d[5], d[6], d[7]);
    }
};

template <> struct initializer<4>
{
    template <typename R> static R init(const Data<R> & d)
    {
        return R(d[0], d[1], d[2], d[3]);
    }
};

template <> struct initializer<2>
{
    template <typename R> static R init(const Data<R> & d)
    {
        return R(d[0], d[1]);
    }
};

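// Note: initializer<N> rebuilds a SIMD register from the first N lanes of a Data<R>
// buffer; Data<R>::operator R() below picks the right specialization via R::nlanes.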
//==================================================================================================

template <typename R> struct Data
{
    typedef typename R::lane_type LaneType;
    Data()
    {
        for (int i = 0; i < R::nlanes; ++i)
            d[i] = (LaneType)(i + 1);
    }
    Data(LaneType val)
    {
        fill(val);
    }
    Data(const R & r)
    {
        *this = r;
    }
    operator R ()
    {
        return initializer<R::nlanes>().init(*this);
    }
    Data<R> & operator=(const R & r)
    {
        v_store(d, r);
        return *this;
    }
    template <typename T> Data<R> & operator*=(T m)
    {
        for (int i = 0; i < R::nlanes; ++i)
            d[i] *= (LaneType)m;
        return *this;
    }
    template <typename T> Data<R> & operator+=(T m)
    {
        for (int i = 0; i < R::nlanes; ++i)
            d[i] += (LaneType)m;
        return *this;
    }
    void fill(LaneType val)
    {
        for (int i = 0; i < R::nlanes; ++i)
            d[i] = val;
    }
    void reverse()
    {
        for (int i = 0; i < R::nlanes / 2; ++i)
            std::swap(d[i], d[R::nlanes - i - 1]);
    }
    const LaneType & operator[](int i) const
    {
        CV_Assert(i >= 0 && i < R::nlanes);
        return d[i];
    }
    LaneType & operator[](int i)
    {
        CV_Assert(i >= 0 && i < R::nlanes);
        return d[i];
    }
    const LaneType * mid() const
    {
        return d + R::nlanes / 2;
    }
    LaneType * mid()
    {
        return d + R::nlanes / 2;
    }
    bool operator==(const Data<R> & other) const
    {
        for (int i = 0; i < R::nlanes; ++i)
            if (d[i] != other.d[i])
                return false;
        return true;
    }
    void clear()
    {
        fill(0);
    }
    bool isZero() const
    {
        return isValue(0);
    }
    bool isValue(uchar val) const
    {
        for (int i = 0; i < R::nlanes; ++i)
            if (d[i] != val)
                return false;
        return true;
    }

    LaneType d[R::nlanes];
};

template<typename R> struct AlignedData
{
    Data<R> CV_DECL_ALIGNED(16) a; // aligned
    char dummy;
    Data<R> u; // unaligned
};

template <typename R> std::ostream & operator<<(std::ostream & out, const Data<R> & d)
{
    out << "{ ";
    for (int i = 0; i < R::nlanes; ++i)
    {
        // out << std::hex << +V_TypeTraits<typename R::lane_type>::reinterpret_int(d.d[i]);
        out << +d.d[i];
        if (i + 1 < R::nlanes)
            out << ", ";
    }
    out << " }";
    return out;
}

//==================================================================================================

template <typename R> struct RegTrait;

template <> struct RegTrait<cv::v_uint8x16> {
    typedef cv::v_uint16x8 w_reg;
    typedef cv::v_uint32x4 q_reg;
    typedef cv::v_uint8x16 u_reg;
    static cv::v_uint8x16 zero() { return cv::v_setzero_u8(); }
    static cv::v_uint8x16 all(uchar val) { return cv::v_setall_u8(val); }
};
template <> struct RegTrait<cv::v_int8x16> {
    typedef cv::v_int16x8 w_reg;
    typedef cv::v_int32x4 q_reg;
    typedef cv::v_uint8x16 u_reg;
    static cv::v_int8x16 zero() { return cv::v_setzero_s8(); }
    static cv::v_int8x16 all(schar val) { return cv::v_setall_s8(val); }
};

template <> struct RegTrait<cv::v_uint16x8> {
    typedef cv::v_uint32x4 w_reg;
    typedef cv::v_int16x8 int_reg;
    typedef cv::v_uint16x8 u_reg;
    static cv::v_uint16x8 zero() { return cv::v_setzero_u16(); }
    static cv::v_uint16x8 all(ushort val) { return cv::v_setall_u16(val); }
};

template <> struct RegTrait<cv::v_int16x8> {
    typedef cv::v_int32x4 w_reg;
    typedef cv::v_uint16x8 u_reg;
    static cv::v_int16x8 zero() { return cv::v_setzero_s16(); }
    static cv::v_int16x8 all(short val) { return cv::v_setall_s16(val); }
};

template <> struct RegTrait<cv::v_uint32x4> {
    typedef cv::v_uint64x2 w_reg;
    typedef cv::v_int32x4 int_reg;
    typedef cv::v_uint32x4 u_reg;
    static cv::v_uint32x4 zero() { return cv::v_setzero_u32(); }
    static cv::v_uint32x4 all(unsigned val) { return cv::v_setall_u32(val); }
};

template <> struct RegTrait<cv::v_int32x4> {
    typedef cv::v_int64x2 w_reg;
    typedef cv::v_uint32x4 u_reg;
    static cv::v_int32x4 zero() { return cv::v_setzero_s32(); }
    static cv::v_int32x4 all(int val) { return cv::v_setall_s32(val); }
};

template <> struct RegTrait<cv::v_uint64x2> {
    static cv::v_uint64x2 zero() { return cv::v_setzero_u64(); }
    static cv::v_uint64x2 all(uint64 val) { return cv::v_setall_u64(val); }
};

template <> struct RegTrait<cv::v_int64x2> {
    static cv::v_int64x2 zero() { return cv::v_setzero_s64(); }
    static cv::v_int64x2 all(int64 val) { return cv::v_setall_s64(val); }
};

template <> struct RegTrait<cv::v_float32x4> {
    typedef cv::v_int32x4 int_reg;
    typedef cv::v_float32x4 u_reg;
    static cv::v_float32x4 zero() { return cv::v_setzero_f32(); }
    static cv::v_float32x4 all(float val) { return cv::v_setall_f32(val); }
};

#if CV_SIMD128_64F
template <> struct RegTrait<cv::v_float64x2> {
    typedef cv::v_int32x4 int_reg;
    typedef cv::v_float64x2 u_reg;
    static cv::v_float64x2 zero() { return cv::v_setzero_f64(); }
    static cv::v_float64x2 all(double val) { return cv::v_setall_f64(val); }
};

#endif

#endif
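A sketch of how these helpers combine in a typical check follows; the function name and assertions are illustrative and not taken from the test file. Data<R> mirrors a register as a plain lane array and converts both ways, while RegTrait<R> supplies zero()/all() constants and the widened register type (w_reg) for the type being tested.

// Illustrative only: the pattern used by add/sub-style checks, e.g. with R = cv::v_int32x4.
template <typename R> void check_addsub_pattern()
{
    typedef typename Data<R>::LaneType LaneType;

    Data<R> dataA, dataB;   // lanes hold 1, 2, 3, ... by default
    dataB.reverse();

    R a = dataA, b = dataB; // load the plain arrays into registers
    Data<R> resC = a + b;   // store the register result back into a plain array

    for (int i = 0; i < R::nlanes; ++i)
        EXPECT_EQ((LaneType)(dataA[i] + dataB[i]), resC[i]);

    // RegTrait gives per-type helpers, e.g. a register with every lane set to one value:
    Data<R> allOnes = RegTrait<R>::all(1);
    EXPECT_TRUE(allOnes.isValue(1));
}
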
@@ -13,6 +13,9 @@
#include "opencv2/ts.hpp"
#include "opencv2/core/core_c.h"

#include "opencv2/core/cvdef.h"
#include "opencv2/core/private.hpp"
#include "opencv2/core/hal/hal.hpp"
#include "opencv2/core/hal/intrin.hpp"

#endif