HAL: improvements
- added new functions from core module: split, merge, add, sub, mul, div, ... - added function replacement mechanism - added example of HAL replacement library
This commit is contained in:
@@ -46,6 +46,7 @@
|
||||
#define __OPENCV_HAL_HPP__
|
||||
|
||||
#include "opencv2/hal/defs.h"
|
||||
#include "opencv2/hal/interface.hpp"
|
||||
|
||||
/**
|
||||
@defgroup hal Hardware Acceleration Layer
|
||||
@@ -58,22 +59,19 @@
|
||||
@}
|
||||
*/
|
||||
|
||||
|
||||
namespace cv { namespace hal {
|
||||
|
||||
//! @addtogroup hal
|
||||
//! @{
|
||||
|
||||
namespace Error {
|
||||
|
||||
enum
|
||||
class Failure
|
||||
{
|
||||
Ok = 0,
|
||||
Unknown = -1
|
||||
public:
|
||||
Failure(int code_ = Error::Unknown) : code(code_) {}
|
||||
public:
|
||||
int code;
|
||||
};
|
||||
|
||||
}
|
||||
|
||||
int normHamming(const uchar* a, int n);
|
||||
int normHamming(const uchar* a, const uchar* b, int n);
|
||||
|
||||
@@ -104,8 +102,194 @@ void sqrt(const double* src, double* dst, int len);
|
||||
void invSqrt(const float* src, float* dst, int len);
|
||||
void invSqrt(const double* src, double* dst, int len);
|
||||
|
||||
void split8u(const uchar* src, uchar** dst, int len, int cn );
|
||||
void split16u(const ushort* src, ushort** dst, int len, int cn );
|
||||
void split32s(const int* src, int** dst, int len, int cn );
|
||||
void split64s(const int64* src, int64** dst, int len, int cn );
|
||||
|
||||
void merge8u(const uchar** src, uchar* dst, int len, int cn );
|
||||
void merge16u(const ushort** src, ushort* dst, int len, int cn );
|
||||
void merge32s(const int** src, int* dst, int len, int cn );
|
||||
void merge64s(const int64** src, int64* dst, int len, int cn );
|
||||
|
||||
void add8u( const uchar* src1, size_t step1, const uchar* src2, size_t step2, uchar* dst, size_t step, int width, int height, void* );
|
||||
void add8s( const schar* src1, size_t step1, const schar* src2, size_t step2, schar* dst, size_t step, int width, int height, void* );
|
||||
void add16u( const ushort* src1, size_t step1, const ushort* src2, size_t step2, ushort* dst, size_t step, int width, int height, void* );
|
||||
void add16s( const short* src1, size_t step1, const short* src2, size_t step2, short* dst, size_t step, int width, int height, void* );
|
||||
void add32s( const int* src1, size_t step1, const int* src2, size_t step2, int* dst, size_t step, int width, int height, void* );
|
||||
void add32f( const float* src1, size_t step1, const float* src2, size_t step2, float* dst, size_t step, int width, int height, void* );
|
||||
void add64f( const double* src1, size_t step1, const double* src2, size_t step2, double* dst, size_t step, int width, int height, void* );
|
||||
|
||||
void sub8u( const uchar* src1, size_t step1, const uchar* src2, size_t step2, uchar* dst, size_t step, int width, int height, void* );
|
||||
void sub8s( const schar* src1, size_t step1, const schar* src2, size_t step2, schar* dst, size_t step, int width, int height, void* );
|
||||
void sub16u( const ushort* src1, size_t step1, const ushort* src2, size_t step2, ushort* dst, size_t step, int width, int height, void* );
|
||||
void sub16s( const short* src1, size_t step1, const short* src2, size_t step2, short* dst, size_t step, int width, int height, void* );
|
||||
void sub32s( const int* src1, size_t step1, const int* src2, size_t step2, int* dst, size_t step, int width, int height, void* );
|
||||
void sub32f( const float* src1, size_t step1, const float* src2, size_t step2, float* dst, size_t step, int width, int height, void* );
|
||||
void sub64f( const double* src1, size_t step1, const double* src2, size_t step2, double* dst, size_t step, int width, int height, void* );
|
||||
|
||||
void max8u( const uchar* src1, size_t step1, const uchar* src2, size_t step2, uchar* dst, size_t step, int width, int height, void* );
|
||||
void max8s( const schar* src1, size_t step1, const schar* src2, size_t step2, schar* dst, size_t step, int width, int height, void* );
|
||||
void max16u( const ushort* src1, size_t step1, const ushort* src2, size_t step2, ushort* dst, size_t step, int width, int height, void* );
|
||||
void max16s( const short* src1, size_t step1, const short* src2, size_t step2, short* dst, size_t step, int width, int height, void* );
|
||||
void max32s( const int* src1, size_t step1, const int* src2, size_t step2, int* dst, size_t step, int width, int height, void* );
|
||||
void max32f( const float* src1, size_t step1, const float* src2, size_t step2, float* dst, size_t step, int width, int height, void* );
|
||||
void max64f( const double* src1, size_t step1, const double* src2, size_t step2, double* dst, size_t step, int width, int height, void* );
|
||||
|
||||
void min8u( const uchar* src1, size_t step1, const uchar* src2, size_t step2, uchar* dst, size_t step, int width, int height, void* );
|
||||
void min8s( const schar* src1, size_t step1, const schar* src2, size_t step2, schar* dst, size_t step, int width, int height, void* );
|
||||
void min16u( const ushort* src1, size_t step1, const ushort* src2, size_t step2, ushort* dst, size_t step, int width, int height, void* );
|
||||
void min16s( const short* src1, size_t step1, const short* src2, size_t step2, short* dst, size_t step, int width, int height, void* );
|
||||
void min32s( const int* src1, size_t step1, const int* src2, size_t step2, int* dst, size_t step, int width, int height, void* );
|
||||
void min32f( const float* src1, size_t step1, const float* src2, size_t step2, float* dst, size_t step, int width, int height, void* );
|
||||
void min64f( const double* src1, size_t step1, const double* src2, size_t step2, double* dst, size_t step, int width, int height, void* );
|
||||
|
||||
void absdiff8u( const uchar* src1, size_t step1, const uchar* src2, size_t step2, uchar* dst, size_t step, int width, int height, void* );
|
||||
void absdiff8s( const schar* src1, size_t step1, const schar* src2, size_t step2, schar* dst, size_t step, int width, int height, void* );
|
||||
void absdiff16u( const ushort* src1, size_t step1, const ushort* src2, size_t step2, ushort* dst, size_t step, int width, int height, void* );
|
||||
void absdiff16s( const short* src1, size_t step1, const short* src2, size_t step2, short* dst, size_t step, int width, int height, void* );
|
||||
void absdiff32s( const int* src1, size_t step1, const int* src2, size_t step2, int* dst, size_t step, int width, int height, void* );
|
||||
void absdiff32f( const float* src1, size_t step1, const float* src2, size_t step2, float* dst, size_t step, int width, int height, void* );
|
||||
void absdiff64f( const double* src1, size_t step1, const double* src2, size_t step2, double* dst, size_t step, int width, int height, void* );
|
||||
|
||||
void and8u( const uchar* src1, size_t step1, const uchar* src2, size_t step2, uchar* dst, size_t step, int width, int height, void* );
|
||||
void or8u( const uchar* src1, size_t step1, const uchar* src2, size_t step2, uchar* dst, size_t step, int width, int height, void* );
|
||||
void xor8u( const uchar* src1, size_t step1, const uchar* src2, size_t step2, uchar* dst, size_t step, int width, int height, void* );
|
||||
void not8u( const uchar* src1, size_t step1, const uchar* src2, size_t step2, uchar* dst, size_t step, int width, int height, void* );
|
||||
|
||||
void cmp8u(const uchar* src1, size_t step1, const uchar* src2, size_t step2, uchar* dst, size_t step, int width, int height, void* _cmpop);
|
||||
void cmp8s(const schar* src1, size_t step1, const schar* src2, size_t step2, uchar* dst, size_t step, int width, int height, void* _cmpop);
|
||||
void cmp16u(const ushort* src1, size_t step1, const ushort* src2, size_t step2, uchar* dst, size_t step, int width, int height, void* _cmpop);
|
||||
void cmp16s(const short* src1, size_t step1, const short* src2, size_t step2, uchar* dst, size_t step, int width, int height, void* _cmpop);
|
||||
void cmp32s(const int* src1, size_t step1, const int* src2, size_t step2, uchar* dst, size_t step, int width, int height, void* _cmpop);
|
||||
void cmp32f(const float* src1, size_t step1, const float* src2, size_t step2, uchar* dst, size_t step, int width, int height, void* _cmpop);
|
||||
void cmp64f(const double* src1, size_t step1, const double* src2, size_t step2, uchar* dst, size_t step, int width, int height, void* _cmpop);
|
||||
|
||||
void mul8u( const uchar* src1, size_t step1, const uchar* src2, size_t step2, uchar* dst, size_t step, int width, int height, void* scale);
|
||||
void mul8s( const schar* src1, size_t step1, const schar* src2, size_t step2, schar* dst, size_t step, int width, int height, void* scale);
|
||||
void mul16u( const ushort* src1, size_t step1, const ushort* src2, size_t step2, ushort* dst, size_t step, int width, int height, void* scale);
|
||||
void mul16s( const short* src1, size_t step1, const short* src2, size_t step2, short* dst, size_t step, int width, int height, void* scale);
|
||||
void mul32s( const int* src1, size_t step1, const int* src2, size_t step2, int* dst, size_t step, int width, int height, void* scale);
|
||||
void mul32f( const float* src1, size_t step1, const float* src2, size_t step2, float* dst, size_t step, int width, int height, void* scale);
|
||||
void mul64f( const double* src1, size_t step1, const double* src2, size_t step2, double* dst, size_t step, int width, int height, void* scale);
|
||||
|
||||
void div8u( const uchar* src1, size_t step1, const uchar* src2, size_t step2, uchar* dst, size_t step, int width, int height, void* scale);
|
||||
void div8s( const schar* src1, size_t step1, const schar* src2, size_t step2, schar* dst, size_t step, int width, int height, void* scale);
|
||||
void div16u( const ushort* src1, size_t step1, const ushort* src2, size_t step2, ushort* dst, size_t step, int width, int height, void* scale);
|
||||
void div16s( const short* src1, size_t step1, const short* src2, size_t step2, short* dst, size_t step, int width, int height, void* scale);
|
||||
void div32s( const int* src1, size_t step1, const int* src2, size_t step2, int* dst, size_t step, int width, int height, void* scale);
|
||||
void div32f( const float* src1, size_t step1, const float* src2, size_t step2, float* dst, size_t step, int width, int height, void* scale);
|
||||
void div64f( const double* src1, size_t step1, const double* src2, size_t step2, double* dst, size_t step, int width, int height, void* scale);
|
||||
|
||||
void recip8u( const uchar* src1, size_t step1, const uchar* src2, size_t step2, uchar* dst, size_t step, int width, int height, void* scale);
|
||||
void recip8s( const schar* src1, size_t step1, const schar* src2, size_t step2, schar* dst, size_t step, int width, int height, void* scale);
|
||||
void recip16u( const ushort* src1, size_t step1, const ushort* src2, size_t step2, ushort* dst, size_t step, int width, int height, void* scale);
|
||||
void recip16s( const short* src1, size_t step1, const short* src2, size_t step2, short* dst, size_t step, int width, int height, void* scale);
|
||||
void recip32s( const int* src1, size_t step1, const int* src2, size_t step2, int* dst, size_t step, int width, int height, void* scale);
|
||||
void recip32f( const float* src1, size_t step1, const float* src2, size_t step2, float* dst, size_t step, int width, int height, void* scale);
|
||||
void recip64f( const double* src1, size_t step1, const double* src2, size_t step2, double* dst, size_t step, int width, int height, void* scale);
|
||||
|
||||
void addWeighted8u( const uchar* src1, size_t step1, const uchar* src2, size_t step2, uchar* dst, size_t step, int width, int height, void* _scalars );
|
||||
void addWeighted8s( const schar* src1, size_t step1, const schar* src2, size_t step2, schar* dst, size_t step, int width, int height, void* scalars );
|
||||
void addWeighted16u( const ushort* src1, size_t step1, const ushort* src2, size_t step2, ushort* dst, size_t step, int width, int height, void* scalars );
|
||||
void addWeighted16s( const short* src1, size_t step1, const short* src2, size_t step2, short* dst, size_t step, int width, int height, void* scalars );
|
||||
void addWeighted32s( const int* src1, size_t step1, const int* src2, size_t step2, int* dst, size_t step, int width, int height, void* scalars );
|
||||
void addWeighted32f( const float* src1, size_t step1, const float* src2, size_t step2, float* dst, size_t step, int width, int height, void* scalars );
|
||||
void addWeighted64f( const double* src1, size_t step1, const double* src2, size_t step2, double* dst, size_t step, int width, int height, void* scalars );
|
||||
//! @}
|
||||
|
||||
}} //cv::hal
|
||||
|
||||
namespace cv {
|
||||
|
||||
template<typename T1, typename T2=T1, typename T3=T1> struct OpAdd
|
||||
{
|
||||
typedef T1 type1;
|
||||
typedef T2 type2;
|
||||
typedef T3 rtype;
|
||||
T3 operator ()(const T1 a, const T2 b) const { return saturate_cast<T3>(a + b); }
|
||||
};
|
||||
|
||||
template<typename T1, typename T2=T1, typename T3=T1> struct OpSub
|
||||
{
|
||||
typedef T1 type1;
|
||||
typedef T2 type2;
|
||||
typedef T3 rtype;
|
||||
T3 operator ()(const T1 a, const T2 b) const { return saturate_cast<T3>(a - b); }
|
||||
};
|
||||
|
||||
template<typename T1, typename T2=T1, typename T3=T1> struct OpRSub
|
||||
{
|
||||
typedef T1 type1;
|
||||
typedef T2 type2;
|
||||
typedef T3 rtype;
|
||||
T3 operator ()(const T1 a, const T2 b) const { return saturate_cast<T3>(b - a); }
|
||||
};
|
||||
|
||||
template<typename T> struct OpMin
|
||||
{
|
||||
typedef T type1;
|
||||
typedef T type2;
|
||||
typedef T rtype;
|
||||
T operator ()(const T a, const T b) const { return std::min(a, b); }
|
||||
};
|
||||
|
||||
template<typename T> struct OpMax
|
||||
{
|
||||
typedef T type1;
|
||||
typedef T type2;
|
||||
typedef T rtype;
|
||||
T operator ()(const T a, const T b) const { return std::max(a, b); }
|
||||
};
|
||||
|
||||
template<typename T> struct OpAbsDiff
|
||||
{
|
||||
typedef T type1;
|
||||
typedef T type2;
|
||||
typedef T rtype;
|
||||
T operator()(T a, T b) const { return (T)std::abs(a - b); }
|
||||
};
|
||||
|
||||
template<typename T, typename WT=T> struct OpAbsDiffS
|
||||
{
|
||||
typedef T type1;
|
||||
typedef WT type2;
|
||||
typedef T rtype;
|
||||
T operator()(T a, WT b) const { return saturate_cast<T>(std::abs(a - b)); }
|
||||
};
|
||||
|
||||
template<typename T> struct OpAnd
|
||||
{
|
||||
typedef T type1;
|
||||
typedef T type2;
|
||||
typedef T rtype;
|
||||
T operator()( T a, T b ) const { return a & b; }
|
||||
};
|
||||
|
||||
template<typename T> struct OpOr
|
||||
{
|
||||
typedef T type1;
|
||||
typedef T type2;
|
||||
typedef T rtype;
|
||||
T operator()( T a, T b ) const { return a | b; }
|
||||
};
|
||||
|
||||
template<typename T> struct OpXor
|
||||
{
|
||||
typedef T type1;
|
||||
typedef T type2;
|
||||
typedef T rtype;
|
||||
T operator()( T a, T b ) const { return a ^ b; }
|
||||
};
|
||||
|
||||
template<typename T> struct OpNot
|
||||
{
|
||||
typedef T type1;
|
||||
typedef T type2;
|
||||
typedef T rtype;
|
||||
T operator()( T a, T ) const { return ~a; }
|
||||
};
|
||||
|
||||
}
|
||||
|
||||
#endif //__OPENCV_HAL_HPP__
|
||||
|
||||
@@ -53,6 +53,7 @@
|
||||
#endif
|
||||
|
||||
#include <limits.h>
|
||||
#include "opencv2/hal/interface.hpp"
|
||||
|
||||
#if defined __ICL
|
||||
# define CV_ICC __ICL
|
||||
@@ -117,9 +118,38 @@
|
||||
|
||||
#define CV_CPU_NEON 100
|
||||
|
||||
// when adding to this list remember to update the enum in core/utility.cpp
|
||||
// when adding to this list remember to update the following enum
|
||||
#define CV_HARDWARE_MAX_FEATURE 255
|
||||
|
||||
/** @brief Available CPU features.
|
||||
*/
|
||||
enum CpuFeatures {
|
||||
CPU_MMX = 1,
|
||||
CPU_SSE = 2,
|
||||
CPU_SSE2 = 3,
|
||||
CPU_SSE3 = 4,
|
||||
CPU_SSSE3 = 5,
|
||||
CPU_SSE4_1 = 6,
|
||||
CPU_SSE4_2 = 7,
|
||||
CPU_POPCNT = 8,
|
||||
|
||||
CPU_AVX = 10,
|
||||
CPU_AVX2 = 11,
|
||||
CPU_FMA3 = 12,
|
||||
|
||||
CPU_AVX_512F = 13,
|
||||
CPU_AVX_512BW = 14,
|
||||
CPU_AVX_512CD = 15,
|
||||
CPU_AVX_512DQ = 16,
|
||||
CPU_AVX_512ER = 17,
|
||||
CPU_AVX_512IFMA512 = 18,
|
||||
CPU_AVX_512PF = 19,
|
||||
CPU_AVX_512VBMI = 20,
|
||||
CPU_AVX_512VL = 21,
|
||||
|
||||
CPU_NEON = 100
|
||||
};
|
||||
|
||||
// do not include SSE/AVX/NEON headers for NVCC compiler
|
||||
#ifndef __CUDACC__
|
||||
|
||||
@@ -257,49 +287,6 @@
|
||||
# define CV_VFP 0
|
||||
#endif
|
||||
|
||||
/* primitive types */
|
||||
/*
|
||||
schar - signed 1 byte integer
|
||||
uchar - unsigned 1 byte integer
|
||||
short - signed 2 byte integer
|
||||
ushort - unsigned 2 byte integer
|
||||
int - signed 4 byte integer
|
||||
uint - unsigned 4 byte integer
|
||||
int64 - signed 8 byte integer
|
||||
uint64 - unsigned 8 byte integer
|
||||
*/
|
||||
|
||||
#if !defined _MSC_VER && !defined __BORLANDC__
|
||||
# if defined __cplusplus && __cplusplus >= 201103L && !defined __APPLE__
|
||||
# include <cstdint>
|
||||
typedef std::uint32_t uint;
|
||||
# else
|
||||
# include <stdint.h>
|
||||
typedef uint32_t uint;
|
||||
# endif
|
||||
#else
|
||||
typedef unsigned uint;
|
||||
#endif
|
||||
|
||||
typedef signed char schar;
|
||||
|
||||
#ifndef __IPL_H__
|
||||
typedef unsigned char uchar;
|
||||
typedef unsigned short ushort;
|
||||
#endif
|
||||
|
||||
#if defined _MSC_VER || defined __BORLANDC__
|
||||
typedef __int64 int64;
|
||||
typedef unsigned __int64 uint64;
|
||||
# define CV_BIG_INT(n) n##I64
|
||||
# define CV_BIG_UINT(n) n##UI64
|
||||
#else
|
||||
typedef int64_t int64;
|
||||
typedef uint64_t uint64;
|
||||
# define CV_BIG_INT(n) n##LL
|
||||
# define CV_BIG_UINT(n) n##ULL
|
||||
#endif
|
||||
|
||||
/* fundamental constants */
|
||||
#define CV_PI 3.1415926535897932384626433832795
|
||||
#define CV_2PI 6.283185307179586476925286766559
|
||||
@@ -321,6 +308,19 @@ typedef union Cv64suf
|
||||
}
|
||||
Cv64suf;
|
||||
|
||||
namespace cv { namespace hal {
|
||||
|
||||
bool checkHardwareSupport(int feature);
|
||||
void setUseOptimized(bool onoff);
|
||||
bool useOptimized();
|
||||
|
||||
}}
|
||||
|
||||
#define USE_SSE2 (cv::hal::checkHardwareSupport(CV_CPU_SSE))
|
||||
#define USE_SSE4_2 (cv::hal::checkHardwareSupport(CV_CPU_SSE4_2))
|
||||
#define USE_AVX (cv::hal::checkHardwareSupport(CV_CPU_AVX))
|
||||
#define USE_AVX2 (cv::hal::checkHardwareSupport(CV_CPU_AVX2))
|
||||
|
||||
|
||||
/****************************************************************************************\
|
||||
* fast math *
|
||||
|
||||
91
modules/hal/include/opencv2/hal/interface.hpp
Normal file
91
modules/hal/include/opencv2/hal/interface.hpp
Normal file
@@ -0,0 +1,91 @@
|
||||
#ifndef _HAL_INTERFACE_HPP_INCLUDED_
|
||||
#define _HAL_INTERFACE_HPP_INCLUDED_
|
||||
|
||||
#define CV_HAL_ERROR_OK 0
|
||||
#define CV_HAL_ERROR_NI 1
|
||||
#define CV_HAL_ERROR_UNKNOWN -1
|
||||
|
||||
#define CV_HAL_CMP_EQ 0
|
||||
#define CV_HAL_CMP_GT 1
|
||||
#define CV_HAL_CMP_GE 2
|
||||
#define CV_HAL_CMP_LT 3
|
||||
#define CV_HAL_CMP_LE 4
|
||||
#define CV_HAL_CMP_NE 5
|
||||
|
||||
#ifdef __cplusplus
|
||||
namespace cv { namespace hal {
|
||||
|
||||
namespace Error {
|
||||
|
||||
enum
|
||||
{
|
||||
Ok = 0,
|
||||
NotImplemented = 1,
|
||||
Unknown = -1
|
||||
};
|
||||
|
||||
}
|
||||
|
||||
enum
|
||||
{
|
||||
CMP_EQ = 0,
|
||||
CMP_GT = 1,
|
||||
CMP_GE = 2,
|
||||
CMP_LT = 3,
|
||||
CMP_LE = 4,
|
||||
CMP_NE = 5
|
||||
};
|
||||
|
||||
}}
|
||||
#endif
|
||||
|
||||
#ifdef __cplusplus
|
||||
#include <cstddef>
|
||||
#else
|
||||
#include <stddef.h>
|
||||
#endif
|
||||
|
||||
/* primitive types */
|
||||
/*
|
||||
schar - signed 1 byte integer
|
||||
uchar - unsigned 1 byte integer
|
||||
short - signed 2 byte integer
|
||||
ushort - unsigned 2 byte integer
|
||||
int - signed 4 byte integer
|
||||
uint - unsigned 4 byte integer
|
||||
int64 - signed 8 byte integer
|
||||
uint64 - unsigned 8 byte integer
|
||||
*/
|
||||
|
||||
#if !defined _MSC_VER && !defined __BORLANDC__
|
||||
# if defined __cplusplus && __cplusplus >= 201103L && !defined __APPLE__
|
||||
# include <cstdint>
|
||||
typedef std::uint32_t uint;
|
||||
# else
|
||||
# include <stdint.h>
|
||||
typedef uint32_t uint;
|
||||
# endif
|
||||
#else
|
||||
typedef unsigned uint;
|
||||
#endif
|
||||
|
||||
typedef signed char schar;
|
||||
|
||||
#ifndef __IPL_H__
|
||||
typedef unsigned char uchar;
|
||||
typedef unsigned short ushort;
|
||||
#endif
|
||||
|
||||
#if defined _MSC_VER || defined __BORLANDC__
|
||||
typedef __int64 int64;
|
||||
typedef unsigned __int64 uint64;
|
||||
# define CV_BIG_INT(n) n##I64
|
||||
# define CV_BIG_UINT(n) n##UI64
|
||||
#else
|
||||
typedef int64_t int64;
|
||||
typedef uint64_t uint64;
|
||||
# define CV_BIG_INT(n) n##LL
|
||||
# define CV_BIG_UINT(n) n##ULL
|
||||
#endif
|
||||
|
||||
#endif
|
||||
647
modules/hal/include/opencv2/hal/sse_utils.hpp
Normal file
647
modules/hal/include/opencv2/hal/sse_utils.hpp
Normal file
@@ -0,0 +1,647 @@
|
||||
/*M///////////////////////////////////////////////////////////////////////////////////////
|
||||
//
|
||||
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
|
||||
//
|
||||
// By downloading, copying, installing or using the software you agree to this license.
|
||||
// If you do not agree to this license, do not download, install,
|
||||
// copy or use the software.
|
||||
//
|
||||
//
|
||||
// License Agreement
|
||||
// For Open Source Computer Vision Library
|
||||
//
|
||||
// Copyright (C) 2015, Itseez Inc., all rights reserved.
|
||||
// Third party copyrights are property of their respective owners.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without modification,
|
||||
// are permitted provided that the following conditions are met:
|
||||
//
|
||||
// * Redistribution's of source code must retain the above copyright notice,
|
||||
// this list of conditions and the following disclaimer.
|
||||
//
|
||||
// * Redistribution's in binary form must reproduce the above copyright notice,
|
||||
// this list of conditions and the following disclaimer in the documentation
|
||||
// and/or other materials provided with the distribution.
|
||||
//
|
||||
// * The name of the copyright holders may not be used to endorse or promote products
|
||||
// derived from this software without specific prior written permission.
|
||||
//
|
||||
// This software is provided by the copyright holders and contributors "as is" and
|
||||
// any express or implied warranties, including, but not limited to, the implied
|
||||
// warranties of merchantability and fitness for a particular purpose are disclaimed.
|
||||
// In no event shall the Intel Corporation or contributors be liable for any direct,
|
||||
// indirect, incidental, special, exemplary, or consequential damages
|
||||
// (including, but not limited to, procurement of substitute goods or services;
|
||||
// loss of use, data, or profits; or business interruption) however caused
|
||||
// and on any theory of liability, whether in contract, strict liability,
|
||||
// or tort (including negligence or otherwise) arising in any way out of
|
||||
// the use of this software, even if advised of the possibility of such damage.
|
||||
//
|
||||
//M*/
|
||||
|
||||
#ifndef __OPENCV_CORE_SSE_UTILS_HPP__
|
||||
#define __OPENCV_CORE_SSE_UTILS_HPP__
|
||||
|
||||
#ifndef __cplusplus
|
||||
# error sse_utils.hpp header must be compiled as C++
|
||||
#endif
|
||||
|
||||
#include "opencv2/hal/defs.h"
|
||||
|
||||
#if CV_SSE2
|
||||
|
||||
inline void _mm_deinterleave_epi8(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0, __m128i & v_g1)
|
||||
{
|
||||
__m128i layer1_chunk0 = _mm_unpacklo_epi8(v_r0, v_g0);
|
||||
__m128i layer1_chunk1 = _mm_unpackhi_epi8(v_r0, v_g0);
|
||||
__m128i layer1_chunk2 = _mm_unpacklo_epi8(v_r1, v_g1);
|
||||
__m128i layer1_chunk3 = _mm_unpackhi_epi8(v_r1, v_g1);
|
||||
|
||||
__m128i layer2_chunk0 = _mm_unpacklo_epi8(layer1_chunk0, layer1_chunk2);
|
||||
__m128i layer2_chunk1 = _mm_unpackhi_epi8(layer1_chunk0, layer1_chunk2);
|
||||
__m128i layer2_chunk2 = _mm_unpacklo_epi8(layer1_chunk1, layer1_chunk3);
|
||||
__m128i layer2_chunk3 = _mm_unpackhi_epi8(layer1_chunk1, layer1_chunk3);
|
||||
|
||||
__m128i layer3_chunk0 = _mm_unpacklo_epi8(layer2_chunk0, layer2_chunk2);
|
||||
__m128i layer3_chunk1 = _mm_unpackhi_epi8(layer2_chunk0, layer2_chunk2);
|
||||
__m128i layer3_chunk2 = _mm_unpacklo_epi8(layer2_chunk1, layer2_chunk3);
|
||||
__m128i layer3_chunk3 = _mm_unpackhi_epi8(layer2_chunk1, layer2_chunk3);
|
||||
|
||||
__m128i layer4_chunk0 = _mm_unpacklo_epi8(layer3_chunk0, layer3_chunk2);
|
||||
__m128i layer4_chunk1 = _mm_unpackhi_epi8(layer3_chunk0, layer3_chunk2);
|
||||
__m128i layer4_chunk2 = _mm_unpacklo_epi8(layer3_chunk1, layer3_chunk3);
|
||||
__m128i layer4_chunk3 = _mm_unpackhi_epi8(layer3_chunk1, layer3_chunk3);
|
||||
|
||||
v_r0 = _mm_unpacklo_epi8(layer4_chunk0, layer4_chunk2);
|
||||
v_r1 = _mm_unpackhi_epi8(layer4_chunk0, layer4_chunk2);
|
||||
v_g0 = _mm_unpacklo_epi8(layer4_chunk1, layer4_chunk3);
|
||||
v_g1 = _mm_unpackhi_epi8(layer4_chunk1, layer4_chunk3);
|
||||
}
|
||||
|
||||
inline void _mm_deinterleave_epi8(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0,
|
||||
__m128i & v_g1, __m128i & v_b0, __m128i & v_b1)
|
||||
{
|
||||
__m128i layer1_chunk0 = _mm_unpacklo_epi8(v_r0, v_g1);
|
||||
__m128i layer1_chunk1 = _mm_unpackhi_epi8(v_r0, v_g1);
|
||||
__m128i layer1_chunk2 = _mm_unpacklo_epi8(v_r1, v_b0);
|
||||
__m128i layer1_chunk3 = _mm_unpackhi_epi8(v_r1, v_b0);
|
||||
__m128i layer1_chunk4 = _mm_unpacklo_epi8(v_g0, v_b1);
|
||||
__m128i layer1_chunk5 = _mm_unpackhi_epi8(v_g0, v_b1);
|
||||
|
||||
__m128i layer2_chunk0 = _mm_unpacklo_epi8(layer1_chunk0, layer1_chunk3);
|
||||
__m128i layer2_chunk1 = _mm_unpackhi_epi8(layer1_chunk0, layer1_chunk3);
|
||||
__m128i layer2_chunk2 = _mm_unpacklo_epi8(layer1_chunk1, layer1_chunk4);
|
||||
__m128i layer2_chunk3 = _mm_unpackhi_epi8(layer1_chunk1, layer1_chunk4);
|
||||
__m128i layer2_chunk4 = _mm_unpacklo_epi8(layer1_chunk2, layer1_chunk5);
|
||||
__m128i layer2_chunk5 = _mm_unpackhi_epi8(layer1_chunk2, layer1_chunk5);
|
||||
|
||||
__m128i layer3_chunk0 = _mm_unpacklo_epi8(layer2_chunk0, layer2_chunk3);
|
||||
__m128i layer3_chunk1 = _mm_unpackhi_epi8(layer2_chunk0, layer2_chunk3);
|
||||
__m128i layer3_chunk2 = _mm_unpacklo_epi8(layer2_chunk1, layer2_chunk4);
|
||||
__m128i layer3_chunk3 = _mm_unpackhi_epi8(layer2_chunk1, layer2_chunk4);
|
||||
__m128i layer3_chunk4 = _mm_unpacklo_epi8(layer2_chunk2, layer2_chunk5);
|
||||
__m128i layer3_chunk5 = _mm_unpackhi_epi8(layer2_chunk2, layer2_chunk5);
|
||||
|
||||
__m128i layer4_chunk0 = _mm_unpacklo_epi8(layer3_chunk0, layer3_chunk3);
|
||||
__m128i layer4_chunk1 = _mm_unpackhi_epi8(layer3_chunk0, layer3_chunk3);
|
||||
__m128i layer4_chunk2 = _mm_unpacklo_epi8(layer3_chunk1, layer3_chunk4);
|
||||
__m128i layer4_chunk3 = _mm_unpackhi_epi8(layer3_chunk1, layer3_chunk4);
|
||||
__m128i layer4_chunk4 = _mm_unpacklo_epi8(layer3_chunk2, layer3_chunk5);
|
||||
__m128i layer4_chunk5 = _mm_unpackhi_epi8(layer3_chunk2, layer3_chunk5);
|
||||
|
||||
v_r0 = _mm_unpacklo_epi8(layer4_chunk0, layer4_chunk3);
|
||||
v_r1 = _mm_unpackhi_epi8(layer4_chunk0, layer4_chunk3);
|
||||
v_g0 = _mm_unpacklo_epi8(layer4_chunk1, layer4_chunk4);
|
||||
v_g1 = _mm_unpackhi_epi8(layer4_chunk1, layer4_chunk4);
|
||||
v_b0 = _mm_unpacklo_epi8(layer4_chunk2, layer4_chunk5);
|
||||
v_b1 = _mm_unpackhi_epi8(layer4_chunk2, layer4_chunk5);
|
||||
}
|
||||
|
||||
inline void _mm_deinterleave_epi8(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0, __m128i & v_g1,
|
||||
__m128i & v_b0, __m128i & v_b1, __m128i & v_a0, __m128i & v_a1)
|
||||
{
|
||||
__m128i layer1_chunk0 = _mm_unpacklo_epi8(v_r0, v_b0);
|
||||
__m128i layer1_chunk1 = _mm_unpackhi_epi8(v_r0, v_b0);
|
||||
__m128i layer1_chunk2 = _mm_unpacklo_epi8(v_r1, v_b1);
|
||||
__m128i layer1_chunk3 = _mm_unpackhi_epi8(v_r1, v_b1);
|
||||
__m128i layer1_chunk4 = _mm_unpacklo_epi8(v_g0, v_a0);
|
||||
__m128i layer1_chunk5 = _mm_unpackhi_epi8(v_g0, v_a0);
|
||||
__m128i layer1_chunk6 = _mm_unpacklo_epi8(v_g1, v_a1);
|
||||
__m128i layer1_chunk7 = _mm_unpackhi_epi8(v_g1, v_a1);
|
||||
|
||||
__m128i layer2_chunk0 = _mm_unpacklo_epi8(layer1_chunk0, layer1_chunk4);
|
||||
__m128i layer2_chunk1 = _mm_unpackhi_epi8(layer1_chunk0, layer1_chunk4);
|
||||
__m128i layer2_chunk2 = _mm_unpacklo_epi8(layer1_chunk1, layer1_chunk5);
|
||||
__m128i layer2_chunk3 = _mm_unpackhi_epi8(layer1_chunk1, layer1_chunk5);
|
||||
__m128i layer2_chunk4 = _mm_unpacklo_epi8(layer1_chunk2, layer1_chunk6);
|
||||
__m128i layer2_chunk5 = _mm_unpackhi_epi8(layer1_chunk2, layer1_chunk6);
|
||||
__m128i layer2_chunk6 = _mm_unpacklo_epi8(layer1_chunk3, layer1_chunk7);
|
||||
__m128i layer2_chunk7 = _mm_unpackhi_epi8(layer1_chunk3, layer1_chunk7);
|
||||
|
||||
__m128i layer3_chunk0 = _mm_unpacklo_epi8(layer2_chunk0, layer2_chunk4);
|
||||
__m128i layer3_chunk1 = _mm_unpackhi_epi8(layer2_chunk0, layer2_chunk4);
|
||||
__m128i layer3_chunk2 = _mm_unpacklo_epi8(layer2_chunk1, layer2_chunk5);
|
||||
__m128i layer3_chunk3 = _mm_unpackhi_epi8(layer2_chunk1, layer2_chunk5);
|
||||
__m128i layer3_chunk4 = _mm_unpacklo_epi8(layer2_chunk2, layer2_chunk6);
|
||||
__m128i layer3_chunk5 = _mm_unpackhi_epi8(layer2_chunk2, layer2_chunk6);
|
||||
__m128i layer3_chunk6 = _mm_unpacklo_epi8(layer2_chunk3, layer2_chunk7);
|
||||
__m128i layer3_chunk7 = _mm_unpackhi_epi8(layer2_chunk3, layer2_chunk7);
|
||||
|
||||
__m128i layer4_chunk0 = _mm_unpacklo_epi8(layer3_chunk0, layer3_chunk4);
|
||||
__m128i layer4_chunk1 = _mm_unpackhi_epi8(layer3_chunk0, layer3_chunk4);
|
||||
__m128i layer4_chunk2 = _mm_unpacklo_epi8(layer3_chunk1, layer3_chunk5);
|
||||
__m128i layer4_chunk3 = _mm_unpackhi_epi8(layer3_chunk1, layer3_chunk5);
|
||||
__m128i layer4_chunk4 = _mm_unpacklo_epi8(layer3_chunk2, layer3_chunk6);
|
||||
__m128i layer4_chunk5 = _mm_unpackhi_epi8(layer3_chunk2, layer3_chunk6);
|
||||
__m128i layer4_chunk6 = _mm_unpacklo_epi8(layer3_chunk3, layer3_chunk7);
|
||||
__m128i layer4_chunk7 = _mm_unpackhi_epi8(layer3_chunk3, layer3_chunk7);
|
||||
|
||||
v_r0 = _mm_unpacklo_epi8(layer4_chunk0, layer4_chunk4);
|
||||
v_r1 = _mm_unpackhi_epi8(layer4_chunk0, layer4_chunk4);
|
||||
v_g0 = _mm_unpacklo_epi8(layer4_chunk1, layer4_chunk5);
|
||||
v_g1 = _mm_unpackhi_epi8(layer4_chunk1, layer4_chunk5);
|
||||
v_b0 = _mm_unpacklo_epi8(layer4_chunk2, layer4_chunk6);
|
||||
v_b1 = _mm_unpackhi_epi8(layer4_chunk2, layer4_chunk6);
|
||||
v_a0 = _mm_unpacklo_epi8(layer4_chunk3, layer4_chunk7);
|
||||
v_a1 = _mm_unpackhi_epi8(layer4_chunk3, layer4_chunk7);
|
||||
}
|
||||
|
||||
inline void _mm_interleave_epi8(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0, __m128i & v_g1)
|
||||
{
|
||||
__m128i v_mask = _mm_set1_epi16(0x00ff);
|
||||
|
||||
__m128i layer4_chunk0 = _mm_packus_epi16(_mm_and_si128(v_r0, v_mask), _mm_and_si128(v_r1, v_mask));
|
||||
__m128i layer4_chunk2 = _mm_packus_epi16(_mm_srli_epi16(v_r0, 8), _mm_srli_epi16(v_r1, 8));
|
||||
__m128i layer4_chunk1 = _mm_packus_epi16(_mm_and_si128(v_g0, v_mask), _mm_and_si128(v_g1, v_mask));
|
||||
__m128i layer4_chunk3 = _mm_packus_epi16(_mm_srli_epi16(v_g0, 8), _mm_srli_epi16(v_g1, 8));
|
||||
|
||||
__m128i layer3_chunk0 = _mm_packus_epi16(_mm_and_si128(layer4_chunk0, v_mask), _mm_and_si128(layer4_chunk1, v_mask));
|
||||
__m128i layer3_chunk2 = _mm_packus_epi16(_mm_srli_epi16(layer4_chunk0, 8), _mm_srli_epi16(layer4_chunk1, 8));
|
||||
__m128i layer3_chunk1 = _mm_packus_epi16(_mm_and_si128(layer4_chunk2, v_mask), _mm_and_si128(layer4_chunk3, v_mask));
|
||||
__m128i layer3_chunk3 = _mm_packus_epi16(_mm_srli_epi16(layer4_chunk2, 8), _mm_srli_epi16(layer4_chunk3, 8));
|
||||
|
||||
__m128i layer2_chunk0 = _mm_packus_epi16(_mm_and_si128(layer3_chunk0, v_mask), _mm_and_si128(layer3_chunk1, v_mask));
|
||||
__m128i layer2_chunk2 = _mm_packus_epi16(_mm_srli_epi16(layer3_chunk0, 8), _mm_srli_epi16(layer3_chunk1, 8));
|
||||
__m128i layer2_chunk1 = _mm_packus_epi16(_mm_and_si128(layer3_chunk2, v_mask), _mm_and_si128(layer3_chunk3, v_mask));
|
||||
__m128i layer2_chunk3 = _mm_packus_epi16(_mm_srli_epi16(layer3_chunk2, 8), _mm_srli_epi16(layer3_chunk3, 8));
|
||||
|
||||
__m128i layer1_chunk0 = _mm_packus_epi16(_mm_and_si128(layer2_chunk0, v_mask), _mm_and_si128(layer2_chunk1, v_mask));
|
||||
__m128i layer1_chunk2 = _mm_packus_epi16(_mm_srli_epi16(layer2_chunk0, 8), _mm_srli_epi16(layer2_chunk1, 8));
|
||||
__m128i layer1_chunk1 = _mm_packus_epi16(_mm_and_si128(layer2_chunk2, v_mask), _mm_and_si128(layer2_chunk3, v_mask));
|
||||
__m128i layer1_chunk3 = _mm_packus_epi16(_mm_srli_epi16(layer2_chunk2, 8), _mm_srli_epi16(layer2_chunk3, 8));
|
||||
|
||||
v_r0 = _mm_packus_epi16(_mm_and_si128(layer1_chunk0, v_mask), _mm_and_si128(layer1_chunk1, v_mask));
|
||||
v_g0 = _mm_packus_epi16(_mm_srli_epi16(layer1_chunk0, 8), _mm_srli_epi16(layer1_chunk1, 8));
|
||||
v_r1 = _mm_packus_epi16(_mm_and_si128(layer1_chunk2, v_mask), _mm_and_si128(layer1_chunk3, v_mask));
|
||||
v_g1 = _mm_packus_epi16(_mm_srli_epi16(layer1_chunk2, 8), _mm_srli_epi16(layer1_chunk3, 8));
|
||||
}
|
||||
|
||||
inline void _mm_interleave_epi8(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0,
|
||||
__m128i & v_g1, __m128i & v_b0, __m128i & v_b1)
|
||||
{
|
||||
__m128i v_mask = _mm_set1_epi16(0x00ff);
|
||||
|
||||
__m128i layer4_chunk0 = _mm_packus_epi16(_mm_and_si128(v_r0, v_mask), _mm_and_si128(v_r1, v_mask));
|
||||
__m128i layer4_chunk3 = _mm_packus_epi16(_mm_srli_epi16(v_r0, 8), _mm_srli_epi16(v_r1, 8));
|
||||
__m128i layer4_chunk1 = _mm_packus_epi16(_mm_and_si128(v_g0, v_mask), _mm_and_si128(v_g1, v_mask));
|
||||
__m128i layer4_chunk4 = _mm_packus_epi16(_mm_srli_epi16(v_g0, 8), _mm_srli_epi16(v_g1, 8));
|
||||
__m128i layer4_chunk2 = _mm_packus_epi16(_mm_and_si128(v_b0, v_mask), _mm_and_si128(v_b1, v_mask));
|
||||
__m128i layer4_chunk5 = _mm_packus_epi16(_mm_srli_epi16(v_b0, 8), _mm_srli_epi16(v_b1, 8));
|
||||
|
||||
__m128i layer3_chunk0 = _mm_packus_epi16(_mm_and_si128(layer4_chunk0, v_mask), _mm_and_si128(layer4_chunk1, v_mask));
|
||||
__m128i layer3_chunk3 = _mm_packus_epi16(_mm_srli_epi16(layer4_chunk0, 8), _mm_srli_epi16(layer4_chunk1, 8));
|
||||
__m128i layer3_chunk1 = _mm_packus_epi16(_mm_and_si128(layer4_chunk2, v_mask), _mm_and_si128(layer4_chunk3, v_mask));
|
||||
__m128i layer3_chunk4 = _mm_packus_epi16(_mm_srli_epi16(layer4_chunk2, 8), _mm_srli_epi16(layer4_chunk3, 8));
|
||||
__m128i layer3_chunk2 = _mm_packus_epi16(_mm_and_si128(layer4_chunk4, v_mask), _mm_and_si128(layer4_chunk5, v_mask));
|
||||
__m128i layer3_chunk5 = _mm_packus_epi16(_mm_srli_epi16(layer4_chunk4, 8), _mm_srli_epi16(layer4_chunk5, 8));
|
||||
|
||||
__m128i layer2_chunk0 = _mm_packus_epi16(_mm_and_si128(layer3_chunk0, v_mask), _mm_and_si128(layer3_chunk1, v_mask));
|
||||
__m128i layer2_chunk3 = _mm_packus_epi16(_mm_srli_epi16(layer3_chunk0, 8), _mm_srli_epi16(layer3_chunk1, 8));
|
||||
__m128i layer2_chunk1 = _mm_packus_epi16(_mm_and_si128(layer3_chunk2, v_mask), _mm_and_si128(layer3_chunk3, v_mask));
|
||||
__m128i layer2_chunk4 = _mm_packus_epi16(_mm_srli_epi16(layer3_chunk2, 8), _mm_srli_epi16(layer3_chunk3, 8));
|
||||
__m128i layer2_chunk2 = _mm_packus_epi16(_mm_and_si128(layer3_chunk4, v_mask), _mm_and_si128(layer3_chunk5, v_mask));
|
||||
__m128i layer2_chunk5 = _mm_packus_epi16(_mm_srli_epi16(layer3_chunk4, 8), _mm_srli_epi16(layer3_chunk5, 8));
|
||||
|
||||
__m128i layer1_chunk0 = _mm_packus_epi16(_mm_and_si128(layer2_chunk0, v_mask), _mm_and_si128(layer2_chunk1, v_mask));
|
||||
__m128i layer1_chunk3 = _mm_packus_epi16(_mm_srli_epi16(layer2_chunk0, 8), _mm_srli_epi16(layer2_chunk1, 8));
|
||||
__m128i layer1_chunk1 = _mm_packus_epi16(_mm_and_si128(layer2_chunk2, v_mask), _mm_and_si128(layer2_chunk3, v_mask));
|
||||
__m128i layer1_chunk4 = _mm_packus_epi16(_mm_srli_epi16(layer2_chunk2, 8), _mm_srli_epi16(layer2_chunk3, 8));
|
||||
__m128i layer1_chunk2 = _mm_packus_epi16(_mm_and_si128(layer2_chunk4, v_mask), _mm_and_si128(layer2_chunk5, v_mask));
|
||||
__m128i layer1_chunk5 = _mm_packus_epi16(_mm_srli_epi16(layer2_chunk4, 8), _mm_srli_epi16(layer2_chunk5, 8));
|
||||
|
||||
v_r0 = _mm_packus_epi16(_mm_and_si128(layer1_chunk0, v_mask), _mm_and_si128(layer1_chunk1, v_mask));
|
||||
v_g1 = _mm_packus_epi16(_mm_srli_epi16(layer1_chunk0, 8), _mm_srli_epi16(layer1_chunk1, 8));
|
||||
v_r1 = _mm_packus_epi16(_mm_and_si128(layer1_chunk2, v_mask), _mm_and_si128(layer1_chunk3, v_mask));
|
||||
v_b0 = _mm_packus_epi16(_mm_srli_epi16(layer1_chunk2, 8), _mm_srli_epi16(layer1_chunk3, 8));
|
||||
v_g0 = _mm_packus_epi16(_mm_and_si128(layer1_chunk4, v_mask), _mm_and_si128(layer1_chunk5, v_mask));
|
||||
v_b1 = _mm_packus_epi16(_mm_srli_epi16(layer1_chunk4, 8), _mm_srli_epi16(layer1_chunk5, 8));
|
||||
}
|
||||
|
||||
inline void _mm_interleave_epi8(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0, __m128i & v_g1,
|
||||
__m128i & v_b0, __m128i & v_b1, __m128i & v_a0, __m128i & v_a1)
|
||||
{
|
||||
__m128i v_mask = _mm_set1_epi16(0x00ff);
|
||||
|
||||
__m128i layer4_chunk0 = _mm_packus_epi16(_mm_and_si128(v_r0, v_mask), _mm_and_si128(v_r1, v_mask));
|
||||
__m128i layer4_chunk4 = _mm_packus_epi16(_mm_srli_epi16(v_r0, 8), _mm_srli_epi16(v_r1, 8));
|
||||
__m128i layer4_chunk1 = _mm_packus_epi16(_mm_and_si128(v_g0, v_mask), _mm_and_si128(v_g1, v_mask));
|
||||
__m128i layer4_chunk5 = _mm_packus_epi16(_mm_srli_epi16(v_g0, 8), _mm_srli_epi16(v_g1, 8));
|
||||
__m128i layer4_chunk2 = _mm_packus_epi16(_mm_and_si128(v_b0, v_mask), _mm_and_si128(v_b1, v_mask));
|
||||
__m128i layer4_chunk6 = _mm_packus_epi16(_mm_srli_epi16(v_b0, 8), _mm_srli_epi16(v_b1, 8));
|
||||
__m128i layer4_chunk3 = _mm_packus_epi16(_mm_and_si128(v_a0, v_mask), _mm_and_si128(v_a1, v_mask));
|
||||
__m128i layer4_chunk7 = _mm_packus_epi16(_mm_srli_epi16(v_a0, 8), _mm_srli_epi16(v_a1, 8));
|
||||
|
||||
__m128i layer3_chunk0 = _mm_packus_epi16(_mm_and_si128(layer4_chunk0, v_mask), _mm_and_si128(layer4_chunk1, v_mask));
|
||||
__m128i layer3_chunk4 = _mm_packus_epi16(_mm_srli_epi16(layer4_chunk0, 8), _mm_srli_epi16(layer4_chunk1, 8));
|
||||
__m128i layer3_chunk1 = _mm_packus_epi16(_mm_and_si128(layer4_chunk2, v_mask), _mm_and_si128(layer4_chunk3, v_mask));
|
||||
__m128i layer3_chunk5 = _mm_packus_epi16(_mm_srli_epi16(layer4_chunk2, 8), _mm_srli_epi16(layer4_chunk3, 8));
|
||||
__m128i layer3_chunk2 = _mm_packus_epi16(_mm_and_si128(layer4_chunk4, v_mask), _mm_and_si128(layer4_chunk5, v_mask));
|
||||
__m128i layer3_chunk6 = _mm_packus_epi16(_mm_srli_epi16(layer4_chunk4, 8), _mm_srli_epi16(layer4_chunk5, 8));
|
||||
__m128i layer3_chunk3 = _mm_packus_epi16(_mm_and_si128(layer4_chunk6, v_mask), _mm_and_si128(layer4_chunk7, v_mask));
|
||||
__m128i layer3_chunk7 = _mm_packus_epi16(_mm_srli_epi16(layer4_chunk6, 8), _mm_srli_epi16(layer4_chunk7, 8));
|
||||
|
||||
__m128i layer2_chunk0 = _mm_packus_epi16(_mm_and_si128(layer3_chunk0, v_mask), _mm_and_si128(layer3_chunk1, v_mask));
|
||||
__m128i layer2_chunk4 = _mm_packus_epi16(_mm_srli_epi16(layer3_chunk0, 8), _mm_srli_epi16(layer3_chunk1, 8));
|
||||
__m128i layer2_chunk1 = _mm_packus_epi16(_mm_and_si128(layer3_chunk2, v_mask), _mm_and_si128(layer3_chunk3, v_mask));
|
||||
__m128i layer2_chunk5 = _mm_packus_epi16(_mm_srli_epi16(layer3_chunk2, 8), _mm_srli_epi16(layer3_chunk3, 8));
|
||||
__m128i layer2_chunk2 = _mm_packus_epi16(_mm_and_si128(layer3_chunk4, v_mask), _mm_and_si128(layer3_chunk5, v_mask));
|
||||
__m128i layer2_chunk6 = _mm_packus_epi16(_mm_srli_epi16(layer3_chunk4, 8), _mm_srli_epi16(layer3_chunk5, 8));
|
||||
__m128i layer2_chunk3 = _mm_packus_epi16(_mm_and_si128(layer3_chunk6, v_mask), _mm_and_si128(layer3_chunk7, v_mask));
|
||||
__m128i layer2_chunk7 = _mm_packus_epi16(_mm_srli_epi16(layer3_chunk6, 8), _mm_srli_epi16(layer3_chunk7, 8));
|
||||
|
||||
__m128i layer1_chunk0 = _mm_packus_epi16(_mm_and_si128(layer2_chunk0, v_mask), _mm_and_si128(layer2_chunk1, v_mask));
|
||||
__m128i layer1_chunk4 = _mm_packus_epi16(_mm_srli_epi16(layer2_chunk0, 8), _mm_srli_epi16(layer2_chunk1, 8));
|
||||
__m128i layer1_chunk1 = _mm_packus_epi16(_mm_and_si128(layer2_chunk2, v_mask), _mm_and_si128(layer2_chunk3, v_mask));
|
||||
__m128i layer1_chunk5 = _mm_packus_epi16(_mm_srli_epi16(layer2_chunk2, 8), _mm_srli_epi16(layer2_chunk3, 8));
|
||||
__m128i layer1_chunk2 = _mm_packus_epi16(_mm_and_si128(layer2_chunk4, v_mask), _mm_and_si128(layer2_chunk5, v_mask));
|
||||
__m128i layer1_chunk6 = _mm_packus_epi16(_mm_srli_epi16(layer2_chunk4, 8), _mm_srli_epi16(layer2_chunk5, 8));
|
||||
__m128i layer1_chunk3 = _mm_packus_epi16(_mm_and_si128(layer2_chunk6, v_mask), _mm_and_si128(layer2_chunk7, v_mask));
|
||||
__m128i layer1_chunk7 = _mm_packus_epi16(_mm_srli_epi16(layer2_chunk6, 8), _mm_srli_epi16(layer2_chunk7, 8));
|
||||
|
||||
v_r0 = _mm_packus_epi16(_mm_and_si128(layer1_chunk0, v_mask), _mm_and_si128(layer1_chunk1, v_mask));
|
||||
v_b0 = _mm_packus_epi16(_mm_srli_epi16(layer1_chunk0, 8), _mm_srli_epi16(layer1_chunk1, 8));
|
||||
v_r1 = _mm_packus_epi16(_mm_and_si128(layer1_chunk2, v_mask), _mm_and_si128(layer1_chunk3, v_mask));
|
||||
v_b1 = _mm_packus_epi16(_mm_srli_epi16(layer1_chunk2, 8), _mm_srli_epi16(layer1_chunk3, 8));
|
||||
v_g0 = _mm_packus_epi16(_mm_and_si128(layer1_chunk4, v_mask), _mm_and_si128(layer1_chunk5, v_mask));
|
||||
v_a0 = _mm_packus_epi16(_mm_srli_epi16(layer1_chunk4, 8), _mm_srli_epi16(layer1_chunk5, 8));
|
||||
v_g1 = _mm_packus_epi16(_mm_and_si128(layer1_chunk6, v_mask), _mm_and_si128(layer1_chunk7, v_mask));
|
||||
v_a1 = _mm_packus_epi16(_mm_srli_epi16(layer1_chunk6, 8), _mm_srli_epi16(layer1_chunk7, 8));
|
||||
}
|
||||
|
||||
inline void _mm_deinterleave_epi16(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0, __m128i & v_g1)
|
||||
{
|
||||
__m128i layer1_chunk0 = _mm_unpacklo_epi16(v_r0, v_g0);
|
||||
__m128i layer1_chunk1 = _mm_unpackhi_epi16(v_r0, v_g0);
|
||||
__m128i layer1_chunk2 = _mm_unpacklo_epi16(v_r1, v_g1);
|
||||
__m128i layer1_chunk3 = _mm_unpackhi_epi16(v_r1, v_g1);
|
||||
|
||||
__m128i layer2_chunk0 = _mm_unpacklo_epi16(layer1_chunk0, layer1_chunk2);
|
||||
__m128i layer2_chunk1 = _mm_unpackhi_epi16(layer1_chunk0, layer1_chunk2);
|
||||
__m128i layer2_chunk2 = _mm_unpacklo_epi16(layer1_chunk1, layer1_chunk3);
|
||||
__m128i layer2_chunk3 = _mm_unpackhi_epi16(layer1_chunk1, layer1_chunk3);
|
||||
|
||||
__m128i layer3_chunk0 = _mm_unpacklo_epi16(layer2_chunk0, layer2_chunk2);
|
||||
__m128i layer3_chunk1 = _mm_unpackhi_epi16(layer2_chunk0, layer2_chunk2);
|
||||
__m128i layer3_chunk2 = _mm_unpacklo_epi16(layer2_chunk1, layer2_chunk3);
|
||||
__m128i layer3_chunk3 = _mm_unpackhi_epi16(layer2_chunk1, layer2_chunk3);
|
||||
|
||||
v_r0 = _mm_unpacklo_epi16(layer3_chunk0, layer3_chunk2);
|
||||
v_r1 = _mm_unpackhi_epi16(layer3_chunk0, layer3_chunk2);
|
||||
v_g0 = _mm_unpacklo_epi16(layer3_chunk1, layer3_chunk3);
|
||||
v_g1 = _mm_unpackhi_epi16(layer3_chunk1, layer3_chunk3);
|
||||
}
|
||||
|
||||
inline void _mm_deinterleave_epi16(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0,
|
||||
__m128i & v_g1, __m128i & v_b0, __m128i & v_b1)
|
||||
{
|
||||
__m128i layer1_chunk0 = _mm_unpacklo_epi16(v_r0, v_g1);
|
||||
__m128i layer1_chunk1 = _mm_unpackhi_epi16(v_r0, v_g1);
|
||||
__m128i layer1_chunk2 = _mm_unpacklo_epi16(v_r1, v_b0);
|
||||
__m128i layer1_chunk3 = _mm_unpackhi_epi16(v_r1, v_b0);
|
||||
__m128i layer1_chunk4 = _mm_unpacklo_epi16(v_g0, v_b1);
|
||||
__m128i layer1_chunk5 = _mm_unpackhi_epi16(v_g0, v_b1);
|
||||
|
||||
__m128i layer2_chunk0 = _mm_unpacklo_epi16(layer1_chunk0, layer1_chunk3);
|
||||
__m128i layer2_chunk1 = _mm_unpackhi_epi16(layer1_chunk0, layer1_chunk3);
|
||||
__m128i layer2_chunk2 = _mm_unpacklo_epi16(layer1_chunk1, layer1_chunk4);
|
||||
__m128i layer2_chunk3 = _mm_unpackhi_epi16(layer1_chunk1, layer1_chunk4);
|
||||
__m128i layer2_chunk4 = _mm_unpacklo_epi16(layer1_chunk2, layer1_chunk5);
|
||||
__m128i layer2_chunk5 = _mm_unpackhi_epi16(layer1_chunk2, layer1_chunk5);
|
||||
|
||||
__m128i layer3_chunk0 = _mm_unpacklo_epi16(layer2_chunk0, layer2_chunk3);
|
||||
__m128i layer3_chunk1 = _mm_unpackhi_epi16(layer2_chunk0, layer2_chunk3);
|
||||
__m128i layer3_chunk2 = _mm_unpacklo_epi16(layer2_chunk1, layer2_chunk4);
|
||||
__m128i layer3_chunk3 = _mm_unpackhi_epi16(layer2_chunk1, layer2_chunk4);
|
||||
__m128i layer3_chunk4 = _mm_unpacklo_epi16(layer2_chunk2, layer2_chunk5);
|
||||
__m128i layer3_chunk5 = _mm_unpackhi_epi16(layer2_chunk2, layer2_chunk5);
|
||||
|
||||
v_r0 = _mm_unpacklo_epi16(layer3_chunk0, layer3_chunk3);
|
||||
v_r1 = _mm_unpackhi_epi16(layer3_chunk0, layer3_chunk3);
|
||||
v_g0 = _mm_unpacklo_epi16(layer3_chunk1, layer3_chunk4);
|
||||
v_g1 = _mm_unpackhi_epi16(layer3_chunk1, layer3_chunk4);
|
||||
v_b0 = _mm_unpacklo_epi16(layer3_chunk2, layer3_chunk5);
|
||||
v_b1 = _mm_unpackhi_epi16(layer3_chunk2, layer3_chunk5);
|
||||
}
|
||||
|
||||
inline void _mm_deinterleave_epi16(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0, __m128i & v_g1,
|
||||
__m128i & v_b0, __m128i & v_b1, __m128i & v_a0, __m128i & v_a1)
|
||||
{
|
||||
__m128i layer1_chunk0 = _mm_unpacklo_epi16(v_r0, v_b0);
|
||||
__m128i layer1_chunk1 = _mm_unpackhi_epi16(v_r0, v_b0);
|
||||
__m128i layer1_chunk2 = _mm_unpacklo_epi16(v_r1, v_b1);
|
||||
__m128i layer1_chunk3 = _mm_unpackhi_epi16(v_r1, v_b1);
|
||||
__m128i layer1_chunk4 = _mm_unpacklo_epi16(v_g0, v_a0);
|
||||
__m128i layer1_chunk5 = _mm_unpackhi_epi16(v_g0, v_a0);
|
||||
__m128i layer1_chunk6 = _mm_unpacklo_epi16(v_g1, v_a1);
|
||||
__m128i layer1_chunk7 = _mm_unpackhi_epi16(v_g1, v_a1);
|
||||
|
||||
__m128i layer2_chunk0 = _mm_unpacklo_epi16(layer1_chunk0, layer1_chunk4);
|
||||
__m128i layer2_chunk1 = _mm_unpackhi_epi16(layer1_chunk0, layer1_chunk4);
|
||||
__m128i layer2_chunk2 = _mm_unpacklo_epi16(layer1_chunk1, layer1_chunk5);
|
||||
__m128i layer2_chunk3 = _mm_unpackhi_epi16(layer1_chunk1, layer1_chunk5);
|
||||
__m128i layer2_chunk4 = _mm_unpacklo_epi16(layer1_chunk2, layer1_chunk6);
|
||||
__m128i layer2_chunk5 = _mm_unpackhi_epi16(layer1_chunk2, layer1_chunk6);
|
||||
__m128i layer2_chunk6 = _mm_unpacklo_epi16(layer1_chunk3, layer1_chunk7);
|
||||
__m128i layer2_chunk7 = _mm_unpackhi_epi16(layer1_chunk3, layer1_chunk7);
|
||||
|
||||
__m128i layer3_chunk0 = _mm_unpacklo_epi16(layer2_chunk0, layer2_chunk4);
|
||||
__m128i layer3_chunk1 = _mm_unpackhi_epi16(layer2_chunk0, layer2_chunk4);
|
||||
__m128i layer3_chunk2 = _mm_unpacklo_epi16(layer2_chunk1, layer2_chunk5);
|
||||
__m128i layer3_chunk3 = _mm_unpackhi_epi16(layer2_chunk1, layer2_chunk5);
|
||||
__m128i layer3_chunk4 = _mm_unpacklo_epi16(layer2_chunk2, layer2_chunk6);
|
||||
__m128i layer3_chunk5 = _mm_unpackhi_epi16(layer2_chunk2, layer2_chunk6);
|
||||
__m128i layer3_chunk6 = _mm_unpacklo_epi16(layer2_chunk3, layer2_chunk7);
|
||||
__m128i layer3_chunk7 = _mm_unpackhi_epi16(layer2_chunk3, layer2_chunk7);
|
||||
|
||||
v_r0 = _mm_unpacklo_epi16(layer3_chunk0, layer3_chunk4);
|
||||
v_r1 = _mm_unpackhi_epi16(layer3_chunk0, layer3_chunk4);
|
||||
v_g0 = _mm_unpacklo_epi16(layer3_chunk1, layer3_chunk5);
|
||||
v_g1 = _mm_unpackhi_epi16(layer3_chunk1, layer3_chunk5);
|
||||
v_b0 = _mm_unpacklo_epi16(layer3_chunk2, layer3_chunk6);
|
||||
v_b1 = _mm_unpackhi_epi16(layer3_chunk2, layer3_chunk6);
|
||||
v_a0 = _mm_unpacklo_epi16(layer3_chunk3, layer3_chunk7);
|
||||
v_a1 = _mm_unpackhi_epi16(layer3_chunk3, layer3_chunk7);
|
||||
}
|
||||
|
||||
#if CV_SSE4_1
|
||||
|
||||
inline void _mm_interleave_epi16(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0, __m128i & v_g1)
|
||||
{
|
||||
__m128i v_mask = _mm_set1_epi32(0x0000ffff);
|
||||
|
||||
__m128i layer3_chunk0 = _mm_packus_epi32(_mm_and_si128(v_r0, v_mask), _mm_and_si128(v_r1, v_mask));
|
||||
__m128i layer3_chunk2 = _mm_packus_epi32(_mm_srli_epi32(v_r0, 16), _mm_srli_epi32(v_r1, 16));
|
||||
__m128i layer3_chunk1 = _mm_packus_epi32(_mm_and_si128(v_g0, v_mask), _mm_and_si128(v_g1, v_mask));
|
||||
__m128i layer3_chunk3 = _mm_packus_epi32(_mm_srli_epi32(v_g0, 16), _mm_srli_epi32(v_g1, 16));
|
||||
|
||||
__m128i layer2_chunk0 = _mm_packus_epi32(_mm_and_si128(layer3_chunk0, v_mask), _mm_and_si128(layer3_chunk1, v_mask));
|
||||
__m128i layer2_chunk2 = _mm_packus_epi32(_mm_srli_epi32(layer3_chunk0, 16), _mm_srli_epi32(layer3_chunk1, 16));
|
||||
__m128i layer2_chunk1 = _mm_packus_epi32(_mm_and_si128(layer3_chunk2, v_mask), _mm_and_si128(layer3_chunk3, v_mask));
|
||||
__m128i layer2_chunk3 = _mm_packus_epi32(_mm_srli_epi32(layer3_chunk2, 16), _mm_srli_epi32(layer3_chunk3, 16));
|
||||
|
||||
__m128i layer1_chunk0 = _mm_packus_epi32(_mm_and_si128(layer2_chunk0, v_mask), _mm_and_si128(layer2_chunk1, v_mask));
|
||||
__m128i layer1_chunk2 = _mm_packus_epi32(_mm_srli_epi32(layer2_chunk0, 16), _mm_srli_epi32(layer2_chunk1, 16));
|
||||
__m128i layer1_chunk1 = _mm_packus_epi32(_mm_and_si128(layer2_chunk2, v_mask), _mm_and_si128(layer2_chunk3, v_mask));
|
||||
__m128i layer1_chunk3 = _mm_packus_epi32(_mm_srli_epi32(layer2_chunk2, 16), _mm_srli_epi32(layer2_chunk3, 16));
|
||||
|
||||
v_r0 = _mm_packus_epi32(_mm_and_si128(layer1_chunk0, v_mask), _mm_and_si128(layer1_chunk1, v_mask));
|
||||
v_g0 = _mm_packus_epi32(_mm_srli_epi32(layer1_chunk0, 16), _mm_srli_epi32(layer1_chunk1, 16));
|
||||
v_r1 = _mm_packus_epi32(_mm_and_si128(layer1_chunk2, v_mask), _mm_and_si128(layer1_chunk3, v_mask));
|
||||
v_g1 = _mm_packus_epi32(_mm_srli_epi32(layer1_chunk2, 16), _mm_srli_epi32(layer1_chunk3, 16));
|
||||
}
|
||||
|
||||
inline void _mm_interleave_epi16(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0,
|
||||
__m128i & v_g1, __m128i & v_b0, __m128i & v_b1)
|
||||
{
|
||||
__m128i v_mask = _mm_set1_epi32(0x0000ffff);
|
||||
|
||||
__m128i layer3_chunk0 = _mm_packus_epi32(_mm_and_si128(v_r0, v_mask), _mm_and_si128(v_r1, v_mask));
|
||||
__m128i layer3_chunk3 = _mm_packus_epi32(_mm_srli_epi32(v_r0, 16), _mm_srli_epi32(v_r1, 16));
|
||||
__m128i layer3_chunk1 = _mm_packus_epi32(_mm_and_si128(v_g0, v_mask), _mm_and_si128(v_g1, v_mask));
|
||||
__m128i layer3_chunk4 = _mm_packus_epi32(_mm_srli_epi32(v_g0, 16), _mm_srli_epi32(v_g1, 16));
|
||||
__m128i layer3_chunk2 = _mm_packus_epi32(_mm_and_si128(v_b0, v_mask), _mm_and_si128(v_b1, v_mask));
|
||||
__m128i layer3_chunk5 = _mm_packus_epi32(_mm_srli_epi32(v_b0, 16), _mm_srli_epi32(v_b1, 16));
|
||||
|
||||
__m128i layer2_chunk0 = _mm_packus_epi32(_mm_and_si128(layer3_chunk0, v_mask), _mm_and_si128(layer3_chunk1, v_mask));
|
||||
__m128i layer2_chunk3 = _mm_packus_epi32(_mm_srli_epi32(layer3_chunk0, 16), _mm_srli_epi32(layer3_chunk1, 16));
|
||||
__m128i layer2_chunk1 = _mm_packus_epi32(_mm_and_si128(layer3_chunk2, v_mask), _mm_and_si128(layer3_chunk3, v_mask));
|
||||
__m128i layer2_chunk4 = _mm_packus_epi32(_mm_srli_epi32(layer3_chunk2, 16), _mm_srli_epi32(layer3_chunk3, 16));
|
||||
__m128i layer2_chunk2 = _mm_packus_epi32(_mm_and_si128(layer3_chunk4, v_mask), _mm_and_si128(layer3_chunk5, v_mask));
|
||||
__m128i layer2_chunk5 = _mm_packus_epi32(_mm_srli_epi32(layer3_chunk4, 16), _mm_srli_epi32(layer3_chunk5, 16));
|
||||
|
||||
__m128i layer1_chunk0 = _mm_packus_epi32(_mm_and_si128(layer2_chunk0, v_mask), _mm_and_si128(layer2_chunk1, v_mask));
|
||||
__m128i layer1_chunk3 = _mm_packus_epi32(_mm_srli_epi32(layer2_chunk0, 16), _mm_srli_epi32(layer2_chunk1, 16));
|
||||
__m128i layer1_chunk1 = _mm_packus_epi32(_mm_and_si128(layer2_chunk2, v_mask), _mm_and_si128(layer2_chunk3, v_mask));
|
||||
__m128i layer1_chunk4 = _mm_packus_epi32(_mm_srli_epi32(layer2_chunk2, 16), _mm_srli_epi32(layer2_chunk3, 16));
|
||||
__m128i layer1_chunk2 = _mm_packus_epi32(_mm_and_si128(layer2_chunk4, v_mask), _mm_and_si128(layer2_chunk5, v_mask));
|
||||
__m128i layer1_chunk5 = _mm_packus_epi32(_mm_srli_epi32(layer2_chunk4, 16), _mm_srli_epi32(layer2_chunk5, 16));
|
||||
|
||||
v_r0 = _mm_packus_epi32(_mm_and_si128(layer1_chunk0, v_mask), _mm_and_si128(layer1_chunk1, v_mask));
|
||||
v_g1 = _mm_packus_epi32(_mm_srli_epi32(layer1_chunk0, 16), _mm_srli_epi32(layer1_chunk1, 16));
|
||||
v_r1 = _mm_packus_epi32(_mm_and_si128(layer1_chunk2, v_mask), _mm_and_si128(layer1_chunk3, v_mask));
|
||||
v_b0 = _mm_packus_epi32(_mm_srli_epi32(layer1_chunk2, 16), _mm_srli_epi32(layer1_chunk3, 16));
|
||||
v_g0 = _mm_packus_epi32(_mm_and_si128(layer1_chunk4, v_mask), _mm_and_si128(layer1_chunk5, v_mask));
|
||||
v_b1 = _mm_packus_epi32(_mm_srli_epi32(layer1_chunk4, 16), _mm_srli_epi32(layer1_chunk5, 16));
|
||||
}
|
||||
|
||||
inline void _mm_interleave_epi16(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0, __m128i & v_g1,
|
||||
__m128i & v_b0, __m128i & v_b1, __m128i & v_a0, __m128i & v_a1)
|
||||
{
|
||||
__m128i v_mask = _mm_set1_epi32(0x0000ffff);
|
||||
|
||||
__m128i layer3_chunk0 = _mm_packus_epi32(_mm_and_si128(v_r0, v_mask), _mm_and_si128(v_r1, v_mask));
|
||||
__m128i layer3_chunk4 = _mm_packus_epi32(_mm_srli_epi32(v_r0, 16), _mm_srli_epi32(v_r1, 16));
|
||||
__m128i layer3_chunk1 = _mm_packus_epi32(_mm_and_si128(v_g0, v_mask), _mm_and_si128(v_g1, v_mask));
|
||||
__m128i layer3_chunk5 = _mm_packus_epi32(_mm_srli_epi32(v_g0, 16), _mm_srli_epi32(v_g1, 16));
|
||||
__m128i layer3_chunk2 = _mm_packus_epi32(_mm_and_si128(v_b0, v_mask), _mm_and_si128(v_b1, v_mask));
|
||||
__m128i layer3_chunk6 = _mm_packus_epi32(_mm_srli_epi32(v_b0, 16), _mm_srli_epi32(v_b1, 16));
|
||||
__m128i layer3_chunk3 = _mm_packus_epi32(_mm_and_si128(v_a0, v_mask), _mm_and_si128(v_a1, v_mask));
|
||||
__m128i layer3_chunk7 = _mm_packus_epi32(_mm_srli_epi32(v_a0, 16), _mm_srli_epi32(v_a1, 16));
|
||||
|
||||
__m128i layer2_chunk0 = _mm_packus_epi32(_mm_and_si128(layer3_chunk0, v_mask), _mm_and_si128(layer3_chunk1, v_mask));
|
||||
__m128i layer2_chunk4 = _mm_packus_epi32(_mm_srli_epi32(layer3_chunk0, 16), _mm_srli_epi32(layer3_chunk1, 16));
|
||||
__m128i layer2_chunk1 = _mm_packus_epi32(_mm_and_si128(layer3_chunk2, v_mask), _mm_and_si128(layer3_chunk3, v_mask));
|
||||
__m128i layer2_chunk5 = _mm_packus_epi32(_mm_srli_epi32(layer3_chunk2, 16), _mm_srli_epi32(layer3_chunk3, 16));
|
||||
__m128i layer2_chunk2 = _mm_packus_epi32(_mm_and_si128(layer3_chunk4, v_mask), _mm_and_si128(layer3_chunk5, v_mask));
|
||||
__m128i layer2_chunk6 = _mm_packus_epi32(_mm_srli_epi32(layer3_chunk4, 16), _mm_srli_epi32(layer3_chunk5, 16));
|
||||
__m128i layer2_chunk3 = _mm_packus_epi32(_mm_and_si128(layer3_chunk6, v_mask), _mm_and_si128(layer3_chunk7, v_mask));
|
||||
__m128i layer2_chunk7 = _mm_packus_epi32(_mm_srli_epi32(layer3_chunk6, 16), _mm_srli_epi32(layer3_chunk7, 16));
|
||||
|
||||
__m128i layer1_chunk0 = _mm_packus_epi32(_mm_and_si128(layer2_chunk0, v_mask), _mm_and_si128(layer2_chunk1, v_mask));
|
||||
__m128i layer1_chunk4 = _mm_packus_epi32(_mm_srli_epi32(layer2_chunk0, 16), _mm_srli_epi32(layer2_chunk1, 16));
|
||||
__m128i layer1_chunk1 = _mm_packus_epi32(_mm_and_si128(layer2_chunk2, v_mask), _mm_and_si128(layer2_chunk3, v_mask));
|
||||
__m128i layer1_chunk5 = _mm_packus_epi32(_mm_srli_epi32(layer2_chunk2, 16), _mm_srli_epi32(layer2_chunk3, 16));
|
||||
__m128i layer1_chunk2 = _mm_packus_epi32(_mm_and_si128(layer2_chunk4, v_mask), _mm_and_si128(layer2_chunk5, v_mask));
|
||||
__m128i layer1_chunk6 = _mm_packus_epi32(_mm_srli_epi32(layer2_chunk4, 16), _mm_srli_epi32(layer2_chunk5, 16));
|
||||
__m128i layer1_chunk3 = _mm_packus_epi32(_mm_and_si128(layer2_chunk6, v_mask), _mm_and_si128(layer2_chunk7, v_mask));
|
||||
__m128i layer1_chunk7 = _mm_packus_epi32(_mm_srli_epi32(layer2_chunk6, 16), _mm_srli_epi32(layer2_chunk7, 16));
|
||||
|
||||
v_r0 = _mm_packus_epi32(_mm_and_si128(layer1_chunk0, v_mask), _mm_and_si128(layer1_chunk1, v_mask));
|
||||
v_b0 = _mm_packus_epi32(_mm_srli_epi32(layer1_chunk0, 16), _mm_srli_epi32(layer1_chunk1, 16));
|
||||
v_r1 = _mm_packus_epi32(_mm_and_si128(layer1_chunk2, v_mask), _mm_and_si128(layer1_chunk3, v_mask));
|
||||
v_b1 = _mm_packus_epi32(_mm_srli_epi32(layer1_chunk2, 16), _mm_srli_epi32(layer1_chunk3, 16));
|
||||
v_g0 = _mm_packus_epi32(_mm_and_si128(layer1_chunk4, v_mask), _mm_and_si128(layer1_chunk5, v_mask));
|
||||
v_a0 = _mm_packus_epi32(_mm_srli_epi32(layer1_chunk4, 16), _mm_srli_epi32(layer1_chunk5, 16));
|
||||
v_g1 = _mm_packus_epi32(_mm_and_si128(layer1_chunk6, v_mask), _mm_and_si128(layer1_chunk7, v_mask));
|
||||
v_a1 = _mm_packus_epi32(_mm_srli_epi32(layer1_chunk6, 16), _mm_srli_epi32(layer1_chunk7, 16));
|
||||
}
|
||||
|
||||
#endif // CV_SSE4_1
|
||||
|
||||
inline void _mm_deinterleave_ps(__m128 & v_r0, __m128 & v_r1, __m128 & v_g0, __m128 & v_g1)
|
||||
{
|
||||
__m128 layer1_chunk0 = _mm_unpacklo_ps(v_r0, v_g0);
|
||||
__m128 layer1_chunk1 = _mm_unpackhi_ps(v_r0, v_g0);
|
||||
__m128 layer1_chunk2 = _mm_unpacklo_ps(v_r1, v_g1);
|
||||
__m128 layer1_chunk3 = _mm_unpackhi_ps(v_r1, v_g1);
|
||||
|
||||
__m128 layer2_chunk0 = _mm_unpacklo_ps(layer1_chunk0, layer1_chunk2);
|
||||
__m128 layer2_chunk1 = _mm_unpackhi_ps(layer1_chunk0, layer1_chunk2);
|
||||
__m128 layer2_chunk2 = _mm_unpacklo_ps(layer1_chunk1, layer1_chunk3);
|
||||
__m128 layer2_chunk3 = _mm_unpackhi_ps(layer1_chunk1, layer1_chunk3);
|
||||
|
||||
v_r0 = _mm_unpacklo_ps(layer2_chunk0, layer2_chunk2);
|
||||
v_r1 = _mm_unpackhi_ps(layer2_chunk0, layer2_chunk2);
|
||||
v_g0 = _mm_unpacklo_ps(layer2_chunk1, layer2_chunk3);
|
||||
v_g1 = _mm_unpackhi_ps(layer2_chunk1, layer2_chunk3);
|
||||
}
|
||||
|
||||
inline void _mm_deinterleave_ps(__m128 & v_r0, __m128 & v_r1, __m128 & v_g0,
|
||||
__m128 & v_g1, __m128 & v_b0, __m128 & v_b1)
|
||||
{
|
||||
__m128 layer1_chunk0 = _mm_unpacklo_ps(v_r0, v_g1);
|
||||
__m128 layer1_chunk1 = _mm_unpackhi_ps(v_r0, v_g1);
|
||||
__m128 layer1_chunk2 = _mm_unpacklo_ps(v_r1, v_b0);
|
||||
__m128 layer1_chunk3 = _mm_unpackhi_ps(v_r1, v_b0);
|
||||
__m128 layer1_chunk4 = _mm_unpacklo_ps(v_g0, v_b1);
|
||||
__m128 layer1_chunk5 = _mm_unpackhi_ps(v_g0, v_b1);
|
||||
|
||||
__m128 layer2_chunk0 = _mm_unpacklo_ps(layer1_chunk0, layer1_chunk3);
|
||||
__m128 layer2_chunk1 = _mm_unpackhi_ps(layer1_chunk0, layer1_chunk3);
|
||||
__m128 layer2_chunk2 = _mm_unpacklo_ps(layer1_chunk1, layer1_chunk4);
|
||||
__m128 layer2_chunk3 = _mm_unpackhi_ps(layer1_chunk1, layer1_chunk4);
|
||||
__m128 layer2_chunk4 = _mm_unpacklo_ps(layer1_chunk2, layer1_chunk5);
|
||||
__m128 layer2_chunk5 = _mm_unpackhi_ps(layer1_chunk2, layer1_chunk5);
|
||||
|
||||
v_r0 = _mm_unpacklo_ps(layer2_chunk0, layer2_chunk3);
|
||||
v_r1 = _mm_unpackhi_ps(layer2_chunk0, layer2_chunk3);
|
||||
v_g0 = _mm_unpacklo_ps(layer2_chunk1, layer2_chunk4);
|
||||
v_g1 = _mm_unpackhi_ps(layer2_chunk1, layer2_chunk4);
|
||||
v_b0 = _mm_unpacklo_ps(layer2_chunk2, layer2_chunk5);
|
||||
v_b1 = _mm_unpackhi_ps(layer2_chunk2, layer2_chunk5);
|
||||
}
|
||||
|
||||
inline void _mm_deinterleave_ps(__m128 & v_r0, __m128 & v_r1, __m128 & v_g0, __m128 & v_g1,
|
||||
__m128 & v_b0, __m128 & v_b1, __m128 & v_a0, __m128 & v_a1)
|
||||
{
|
||||
__m128 layer1_chunk0 = _mm_unpacklo_ps(v_r0, v_b0);
|
||||
__m128 layer1_chunk1 = _mm_unpackhi_ps(v_r0, v_b0);
|
||||
__m128 layer1_chunk2 = _mm_unpacklo_ps(v_r1, v_b1);
|
||||
__m128 layer1_chunk3 = _mm_unpackhi_ps(v_r1, v_b1);
|
||||
__m128 layer1_chunk4 = _mm_unpacklo_ps(v_g0, v_a0);
|
||||
__m128 layer1_chunk5 = _mm_unpackhi_ps(v_g0, v_a0);
|
||||
__m128 layer1_chunk6 = _mm_unpacklo_ps(v_g1, v_a1);
|
||||
__m128 layer1_chunk7 = _mm_unpackhi_ps(v_g1, v_a1);
|
||||
|
||||
__m128 layer2_chunk0 = _mm_unpacklo_ps(layer1_chunk0, layer1_chunk4);
|
||||
__m128 layer2_chunk1 = _mm_unpackhi_ps(layer1_chunk0, layer1_chunk4);
|
||||
__m128 layer2_chunk2 = _mm_unpacklo_ps(layer1_chunk1, layer1_chunk5);
|
||||
__m128 layer2_chunk3 = _mm_unpackhi_ps(layer1_chunk1, layer1_chunk5);
|
||||
__m128 layer2_chunk4 = _mm_unpacklo_ps(layer1_chunk2, layer1_chunk6);
|
||||
__m128 layer2_chunk5 = _mm_unpackhi_ps(layer1_chunk2, layer1_chunk6);
|
||||
__m128 layer2_chunk6 = _mm_unpacklo_ps(layer1_chunk3, layer1_chunk7);
|
||||
__m128 layer2_chunk7 = _mm_unpackhi_ps(layer1_chunk3, layer1_chunk7);
|
||||
|
||||
v_r0 = _mm_unpacklo_ps(layer2_chunk0, layer2_chunk4);
|
||||
v_r1 = _mm_unpackhi_ps(layer2_chunk0, layer2_chunk4);
|
||||
v_g0 = _mm_unpacklo_ps(layer2_chunk1, layer2_chunk5);
|
||||
v_g1 = _mm_unpackhi_ps(layer2_chunk1, layer2_chunk5);
|
||||
v_b0 = _mm_unpacklo_ps(layer2_chunk2, layer2_chunk6);
|
||||
v_b1 = _mm_unpackhi_ps(layer2_chunk2, layer2_chunk6);
|
||||
v_a0 = _mm_unpacklo_ps(layer2_chunk3, layer2_chunk7);
|
||||
v_a1 = _mm_unpackhi_ps(layer2_chunk3, layer2_chunk7);
|
||||
}
|
||||
|
||||
inline void _mm_interleave_ps(__m128 & v_r0, __m128 & v_r1, __m128 & v_g0, __m128 & v_g1)
|
||||
{
|
||||
const int mask_lo = _MM_SHUFFLE(2, 0, 2, 0), mask_hi = _MM_SHUFFLE(3, 1, 3, 1);
|
||||
|
||||
__m128 layer2_chunk0 = _mm_shuffle_ps(v_r0, v_r1, mask_lo);
|
||||
__m128 layer2_chunk2 = _mm_shuffle_ps(v_r0, v_r1, mask_hi);
|
||||
__m128 layer2_chunk1 = _mm_shuffle_ps(v_g0, v_g1, mask_lo);
|
||||
__m128 layer2_chunk3 = _mm_shuffle_ps(v_g0, v_g1, mask_hi);
|
||||
|
||||
__m128 layer1_chunk0 = _mm_shuffle_ps(layer2_chunk0, layer2_chunk1, mask_lo);
|
||||
__m128 layer1_chunk2 = _mm_shuffle_ps(layer2_chunk0, layer2_chunk1, mask_hi);
|
||||
__m128 layer1_chunk1 = _mm_shuffle_ps(layer2_chunk2, layer2_chunk3, mask_lo);
|
||||
__m128 layer1_chunk3 = _mm_shuffle_ps(layer2_chunk2, layer2_chunk3, mask_hi);
|
||||
|
||||
v_r0 = _mm_shuffle_ps(layer1_chunk0, layer1_chunk1, mask_lo);
|
||||
v_g0 = _mm_shuffle_ps(layer1_chunk0, layer1_chunk1, mask_hi);
|
||||
v_r1 = _mm_shuffle_ps(layer1_chunk2, layer1_chunk3, mask_lo);
|
||||
v_g1 = _mm_shuffle_ps(layer1_chunk2, layer1_chunk3, mask_hi);
|
||||
}
|
||||
|
||||
inline void _mm_interleave_ps(__m128 & v_r0, __m128 & v_r1, __m128 & v_g0,
|
||||
__m128 & v_g1, __m128 & v_b0, __m128 & v_b1)
|
||||
{
|
||||
const int mask_lo = _MM_SHUFFLE(2, 0, 2, 0), mask_hi = _MM_SHUFFLE(3, 1, 3, 1);
|
||||
|
||||
__m128 layer2_chunk0 = _mm_shuffle_ps(v_r0, v_r1, mask_lo);
|
||||
__m128 layer2_chunk3 = _mm_shuffle_ps(v_r0, v_r1, mask_hi);
|
||||
__m128 layer2_chunk1 = _mm_shuffle_ps(v_g0, v_g1, mask_lo);
|
||||
__m128 layer2_chunk4 = _mm_shuffle_ps(v_g0, v_g1, mask_hi);
|
||||
__m128 layer2_chunk2 = _mm_shuffle_ps(v_b0, v_b1, mask_lo);
|
||||
__m128 layer2_chunk5 = _mm_shuffle_ps(v_b0, v_b1, mask_hi);
|
||||
|
||||
__m128 layer1_chunk0 = _mm_shuffle_ps(layer2_chunk0, layer2_chunk1, mask_lo);
|
||||
__m128 layer1_chunk3 = _mm_shuffle_ps(layer2_chunk0, layer2_chunk1, mask_hi);
|
||||
__m128 layer1_chunk1 = _mm_shuffle_ps(layer2_chunk2, layer2_chunk3, mask_lo);
|
||||
__m128 layer1_chunk4 = _mm_shuffle_ps(layer2_chunk2, layer2_chunk3, mask_hi);
|
||||
__m128 layer1_chunk2 = _mm_shuffle_ps(layer2_chunk4, layer2_chunk5, mask_lo);
|
||||
__m128 layer1_chunk5 = _mm_shuffle_ps(layer2_chunk4, layer2_chunk5, mask_hi);
|
||||
|
||||
v_r0 = _mm_shuffle_ps(layer1_chunk0, layer1_chunk1, mask_lo);
|
||||
v_g1 = _mm_shuffle_ps(layer1_chunk0, layer1_chunk1, mask_hi);
|
||||
v_r1 = _mm_shuffle_ps(layer1_chunk2, layer1_chunk3, mask_lo);
|
||||
v_b0 = _mm_shuffle_ps(layer1_chunk2, layer1_chunk3, mask_hi);
|
||||
v_g0 = _mm_shuffle_ps(layer1_chunk4, layer1_chunk5, mask_lo);
|
||||
v_b1 = _mm_shuffle_ps(layer1_chunk4, layer1_chunk5, mask_hi);
|
||||
}
|
||||
|
||||
inline void _mm_interleave_ps(__m128 & v_r0, __m128 & v_r1, __m128 & v_g0, __m128 & v_g1,
|
||||
__m128 & v_b0, __m128 & v_b1, __m128 & v_a0, __m128 & v_a1)
|
||||
{
|
||||
const int mask_lo = _MM_SHUFFLE(2, 0, 2, 0), mask_hi = _MM_SHUFFLE(3, 1, 3, 1);
|
||||
|
||||
__m128 layer2_chunk0 = _mm_shuffle_ps(v_r0, v_r1, mask_lo);
|
||||
__m128 layer2_chunk4 = _mm_shuffle_ps(v_r0, v_r1, mask_hi);
|
||||
__m128 layer2_chunk1 = _mm_shuffle_ps(v_g0, v_g1, mask_lo);
|
||||
__m128 layer2_chunk5 = _mm_shuffle_ps(v_g0, v_g1, mask_hi);
|
||||
__m128 layer2_chunk2 = _mm_shuffle_ps(v_b0, v_b1, mask_lo);
|
||||
__m128 layer2_chunk6 = _mm_shuffle_ps(v_b0, v_b1, mask_hi);
|
||||
__m128 layer2_chunk3 = _mm_shuffle_ps(v_a0, v_a1, mask_lo);
|
||||
__m128 layer2_chunk7 = _mm_shuffle_ps(v_a0, v_a1, mask_hi);
|
||||
|
||||
__m128 layer1_chunk0 = _mm_shuffle_ps(layer2_chunk0, layer2_chunk1, mask_lo);
|
||||
__m128 layer1_chunk4 = _mm_shuffle_ps(layer2_chunk0, layer2_chunk1, mask_hi);
|
||||
__m128 layer1_chunk1 = _mm_shuffle_ps(layer2_chunk2, layer2_chunk3, mask_lo);
|
||||
__m128 layer1_chunk5 = _mm_shuffle_ps(layer2_chunk2, layer2_chunk3, mask_hi);
|
||||
__m128 layer1_chunk2 = _mm_shuffle_ps(layer2_chunk4, layer2_chunk5, mask_lo);
|
||||
__m128 layer1_chunk6 = _mm_shuffle_ps(layer2_chunk4, layer2_chunk5, mask_hi);
|
||||
__m128 layer1_chunk3 = _mm_shuffle_ps(layer2_chunk6, layer2_chunk7, mask_lo);
|
||||
__m128 layer1_chunk7 = _mm_shuffle_ps(layer2_chunk6, layer2_chunk7, mask_hi);
|
||||
|
||||
v_r0 = _mm_shuffle_ps(layer1_chunk0, layer1_chunk1, mask_lo);
|
||||
v_b0 = _mm_shuffle_ps(layer1_chunk0, layer1_chunk1, mask_hi);
|
||||
v_r1 = _mm_shuffle_ps(layer1_chunk2, layer1_chunk3, mask_lo);
|
||||
v_b1 = _mm_shuffle_ps(layer1_chunk2, layer1_chunk3, mask_hi);
|
||||
v_g0 = _mm_shuffle_ps(layer1_chunk4, layer1_chunk5, mask_lo);
|
||||
v_a0 = _mm_shuffle_ps(layer1_chunk4, layer1_chunk5, mask_hi);
|
||||
v_g1 = _mm_shuffle_ps(layer1_chunk6, layer1_chunk7, mask_lo);
|
||||
v_a1 = _mm_shuffle_ps(layer1_chunk6, layer1_chunk7, mask_hi);
|
||||
}
|
||||
|
||||
#endif // CV_SSE2
|
||||
|
||||
#endif //__OPENCV_CORE_SSE_UTILS_HPP__
|
||||
Reference in New Issue
Block a user