used popcnt
This commit is contained in:
parent
31827d8dfe
commit
fc0869735d
@ -221,9 +221,9 @@ OCV_OPTION(ENABLE_SSE3 "Enable SSE3 instructions"
|
||||
OCV_OPTION(ENABLE_SSSE3 "Enable SSSE3 instructions" OFF IF (CMAKE_COMPILER_IS_GNUCXX AND (X86 OR X86_64)) )
|
||||
OCV_OPTION(ENABLE_SSE41 "Enable SSE4.1 instructions" OFF IF ((CV_ICC OR CMAKE_COMPILER_IS_GNUCXX) AND (X86 OR X86_64)) )
|
||||
OCV_OPTION(ENABLE_SSE42 "Enable SSE4.2 instructions" OFF IF (CMAKE_COMPILER_IS_GNUCXX AND (X86 OR X86_64)) )
|
||||
OCV_OPTION(ENABLE_FMA3 "Enable FMA3 instructions" ON IF (CMAKE_COMPILER_IS_GNUCXX AND (X86 OR X86_64)) )
|
||||
OCV_OPTION(ENABLE_AVX "Enable AVX instructions" OFF IF ((MSVC OR CMAKE_COMPILER_IS_GNUCXX) AND (X86 OR X86_64)) )
|
||||
OCV_OPTION(ENABLE_AVX2 "Enable AVX2 instructions" OFF IF ((MSVC OR CMAKE_COMPILER_IS_GNUCXX) AND (X86 OR X86_64)) )
|
||||
OCV_OPTION(ENABLE_FMA3 "Enable FMA3 instructions" OFF IF ((MSVC OR CMAKE_COMPILER_IS_GNUCXX) AND (X86 OR X86_64)) )
|
||||
OCV_OPTION(ENABLE_NEON "Enable NEON instructions" OFF IF CMAKE_COMPILER_IS_GNUCXX AND (ARM OR IOS) )
|
||||
OCV_OPTION(ENABLE_VFPV3 "Enable VFPv3-D32 instructions" OFF IF CMAKE_COMPILER_IS_GNUCXX AND (ARM OR IOS) )
|
||||
OCV_OPTION(ENABLE_NOISY_WARNINGS "Show all warnings even if they are too noisy" OFF )
|
||||
|
@ -145,6 +145,10 @@ if(CMAKE_COMPILER_IS_GNUCXX)
|
||||
endif()
|
||||
if(ENABLE_AVX2)
|
||||
add_extra_compiler_option(-mavx2)
|
||||
|
||||
if(ENABLE_FMA3)
|
||||
add_extra_compiler_option(-mfma)
|
||||
endif()
|
||||
endif()
|
||||
|
||||
# GCC depresses SSEx instructions when -mavx is used. Instead, it generates new AVX instructions or AVX equivalence for all SSEx instructions when needed.
|
||||
@ -165,10 +169,6 @@ if(CMAKE_COMPILER_IS_GNUCXX)
|
||||
add_extra_compiler_option(-msse4.2)
|
||||
endif()
|
||||
endif()
|
||||
|
||||
if(ENABLE_FMA3)
|
||||
add_extra_compiler_option(-mfma)
|
||||
endif()
|
||||
endif(NOT MINGW)
|
||||
|
||||
if(X86 OR X86_64)
|
||||
|
@ -13,6 +13,7 @@
|
||||
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
|
||||
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
|
||||
// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
|
||||
// Copyright (C) 2014, Itseez Inc., all rights reserved.
|
||||
// Third party copyrights are property of their respective owners.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without modification,
|
||||
|
@ -13,6 +13,7 @@
|
||||
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
|
||||
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
|
||||
// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
|
||||
// Copyright (C) 2015, Itseez Inc., all rights reserved.
|
||||
// Third party copyrights are property of their respective owners.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without modification,
|
||||
@ -157,15 +158,11 @@
|
||||
# include <nmmintrin.h>
|
||||
# define CV_SSE4_2 1
|
||||
# endif
|
||||
# if defined __FMA__ || (defined _MSC_VER && _MSC_VER >= 1500)
|
||||
# include <immintrin.h>
|
||||
# define CV_FMA3 1
|
||||
# endif
|
||||
# if defined __POPCNT__ || (defined _MSC_VER && _MSC_VER >= 1500)
|
||||
# include <popcntintrin.h>
|
||||
# define CV_POPCNT 1
|
||||
# endif
|
||||
# if defined __AVX__ || defined __AVX2__ || (defined _MSC_FULL_VER && _MSC_FULL_VER >= 160040219)
|
||||
# if defined __AVX__ || (defined _MSC_FULL_VER && _MSC_FULL_VER >= 160040219)
|
||||
// MS Visual Studio 2010 (2012?) has no macro pre-defined to identify the use of /arch:AVX
|
||||
// See: http://connect.microsoft.com/VisualStudio/feedback/details/605858/arch-avx-should-define-a-predefined-macro-in-x64-and-set-a-unique-value-for-m-ix86-fp-in-win32
|
||||
# include <immintrin.h>
|
||||
@ -179,6 +176,9 @@
|
||||
# if defined __AVX2__ || (defined _MSC_FULL_VER && _MSC_FULL_VER >= 160040219)
|
||||
# include <immintrin.h>
|
||||
# define CV_AVX2 1
|
||||
# if defined __FMA__
|
||||
# define CV_FMA3 1
|
||||
# endif
|
||||
# endif
|
||||
#endif
|
||||
|
||||
@ -194,6 +194,9 @@
|
||||
|
||||
#endif // __CUDACC__
|
||||
|
||||
#ifndef CV_POPCNT
|
||||
#define CV_POPCNT 0
|
||||
#endif
|
||||
#ifndef CV_MMX
|
||||
# define CV_MMX 0
|
||||
#endif
|
||||
@ -221,9 +224,6 @@
|
||||
#ifndef CV_AVX2
|
||||
# define CV_AVX2 0
|
||||
#endif
|
||||
#ifndef CV_POPCNT
|
||||
#define CV_POPCNT 0
|
||||
#endif
|
||||
#ifndef CV_FMA3
|
||||
# define CV_FMA3 0
|
||||
#endif
|
||||
|
@ -10,7 +10,7 @@
|
||||
// License Agreement
|
||||
// For Open Source Computer Vision Library
|
||||
//
|
||||
// Copyright (C) 2015, OpenCV Foundation, all rights reserved.
|
||||
// Copyright (C) 2015, Itseez Inc., all rights reserved.
|
||||
// Third party copyrights are property of their respective owners.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without modification,
|
||||
@ -48,6 +48,34 @@
|
||||
|
||||
#if CV_SSE2
|
||||
|
||||
inline void _mm_deinterleave_epi8(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0, __m128i & v_g1)
|
||||
{
|
||||
__m128i layer1_chunk0 = _mm_unpacklo_epi8(v_r0, v_g0);
|
||||
__m128i layer1_chunk1 = _mm_unpackhi_epi8(v_r0, v_g0);
|
||||
__m128i layer1_chunk2 = _mm_unpacklo_epi8(v_r1, v_g1);
|
||||
__m128i layer1_chunk3 = _mm_unpackhi_epi8(v_r1, v_g1);
|
||||
|
||||
__m128i layer2_chunk0 = _mm_unpacklo_epi8(layer1_chunk0, layer1_chunk2);
|
||||
__m128i layer2_chunk1 = _mm_unpackhi_epi8(layer1_chunk0, layer1_chunk2);
|
||||
__m128i layer2_chunk2 = _mm_unpacklo_epi8(layer1_chunk1, layer1_chunk3);
|
||||
__m128i layer2_chunk3 = _mm_unpackhi_epi8(layer1_chunk1, layer1_chunk3);
|
||||
|
||||
__m128i layer3_chunk0 = _mm_unpacklo_epi8(layer2_chunk0, layer2_chunk2);
|
||||
__m128i layer3_chunk1 = _mm_unpackhi_epi8(layer2_chunk0, layer2_chunk2);
|
||||
__m128i layer3_chunk2 = _mm_unpacklo_epi8(layer2_chunk1, layer2_chunk3);
|
||||
__m128i layer3_chunk3 = _mm_unpackhi_epi8(layer2_chunk1, layer2_chunk3);
|
||||
|
||||
__m128i layer4_chunk0 = _mm_unpacklo_epi8(layer3_chunk0, layer3_chunk2);
|
||||
__m128i layer4_chunk1 = _mm_unpackhi_epi8(layer3_chunk0, layer3_chunk2);
|
||||
__m128i layer4_chunk2 = _mm_unpacklo_epi8(layer3_chunk1, layer3_chunk3);
|
||||
__m128i layer4_chunk3 = _mm_unpackhi_epi8(layer3_chunk1, layer3_chunk3);
|
||||
|
||||
v_r0 = _mm_unpacklo_epi8(layer4_chunk0, layer4_chunk2);
|
||||
v_r1 = _mm_unpackhi_epi8(layer4_chunk0, layer4_chunk2);
|
||||
v_g0 = _mm_unpacklo_epi8(layer4_chunk1, layer4_chunk3);
|
||||
v_g1 = _mm_unpackhi_epi8(layer4_chunk1, layer4_chunk3);
|
||||
}
|
||||
|
||||
inline void _mm_deinterleave_epi8(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0,
|
||||
__m128i & v_g1, __m128i & v_b0, __m128i & v_b1)
|
||||
{
|
||||
@ -228,6 +256,29 @@ inline void _mm_interleavee_epi8(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0,
|
||||
v_a1 = _mm_packus_epi16(_mm_srli_epi16(layer1_chunk6, 8), _mm_srli_epi16(layer1_chunk7, 8));
|
||||
}
|
||||
|
||||
inline void _mm_deinterleave_epi16(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0, __m128i & v_g1)
|
||||
{
|
||||
__m128i layer1_chunk0 = _mm_unpacklo_epi16(v_r0, v_g0);
|
||||
__m128i layer1_chunk1 = _mm_unpackhi_epi16(v_r0, v_g0);
|
||||
__m128i layer1_chunk2 = _mm_unpacklo_epi16(v_r1, v_g1);
|
||||
__m128i layer1_chunk3 = _mm_unpackhi_epi16(v_r1, v_g1);
|
||||
|
||||
__m128i layer2_chunk0 = _mm_unpacklo_epi16(layer1_chunk0, layer1_chunk2);
|
||||
__m128i layer2_chunk1 = _mm_unpackhi_epi16(layer1_chunk0, layer1_chunk2);
|
||||
__m128i layer2_chunk2 = _mm_unpacklo_epi16(layer1_chunk1, layer1_chunk3);
|
||||
__m128i layer2_chunk3 = _mm_unpackhi_epi16(layer1_chunk1, layer1_chunk3);
|
||||
|
||||
__m128i layer3_chunk0 = _mm_unpacklo_epi16(layer2_chunk0, layer2_chunk2);
|
||||
__m128i layer3_chunk1 = _mm_unpackhi_epi16(layer2_chunk0, layer2_chunk2);
|
||||
__m128i layer3_chunk2 = _mm_unpacklo_epi16(layer2_chunk1, layer2_chunk3);
|
||||
__m128i layer3_chunk3 = _mm_unpackhi_epi16(layer2_chunk1, layer2_chunk3);
|
||||
|
||||
v_r0 = _mm_unpacklo_epi16(layer3_chunk0, layer3_chunk2);
|
||||
v_r1 = _mm_unpackhi_epi16(layer3_chunk0, layer3_chunk2);
|
||||
v_g0 = _mm_unpacklo_epi16(layer3_chunk1, layer3_chunk3);
|
||||
v_g1 = _mm_unpackhi_epi16(layer3_chunk1, layer3_chunk3);
|
||||
}
|
||||
|
||||
inline void _mm_deinterleave_epi16(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0,
|
||||
__m128i & v_g1, __m128i & v_b0, __m128i & v_b1)
|
||||
{
|
||||
@ -300,6 +351,8 @@ inline void _mm_deinterleave_epi16(__m128i & v_r0, __m128i & v_r1, __m128i & v_g
|
||||
v_a1 = _mm_unpackhi_epi16(layer3_chunk3, layer3_chunk7);
|
||||
}
|
||||
|
||||
#if CV_SSE4_1
|
||||
|
||||
inline void _mm_interleave_epi16(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0,
|
||||
__m128i & v_g1, __m128i & v_b0, __m128i & v_b1)
|
||||
{
|
||||
@ -376,6 +429,26 @@ inline void _mm_interleave_epi16(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0,
|
||||
v_a1 = _mm_packus_epi32(_mm_srli_epi32(layer1_chunk6, 16), _mm_srli_epi32(layer1_chunk7, 16));
|
||||
}
|
||||
|
||||
#endif // CV_SSE4_1
|
||||
|
||||
inline void _mm_deinterleave_ps(__m128 & v_r0, __m128 & v_r1, __m128 & v_g0, __m128 & v_g1)
|
||||
{
|
||||
__m128 layer1_chunk0 = _mm_unpacklo_ps(v_r0, v_g0);
|
||||
__m128 layer1_chunk1 = _mm_unpackhi_ps(v_r0, v_g0);
|
||||
__m128 layer1_chunk2 = _mm_unpacklo_ps(v_r1, v_g1);
|
||||
__m128 layer1_chunk3 = _mm_unpackhi_ps(v_r1, v_g1);
|
||||
|
||||
__m128 layer2_chunk0 = _mm_unpacklo_ps(layer1_chunk0, layer1_chunk2);
|
||||
__m128 layer2_chunk1 = _mm_unpackhi_ps(layer1_chunk0, layer1_chunk2);
|
||||
__m128 layer2_chunk2 = _mm_unpacklo_ps(layer1_chunk1, layer1_chunk3);
|
||||
__m128 layer2_chunk3 = _mm_unpackhi_ps(layer1_chunk1, layer1_chunk3);
|
||||
|
||||
v_r0 = _mm_unpacklo_ps(layer2_chunk0, layer2_chunk2);
|
||||
v_r1 = _mm_unpackhi_ps(layer2_chunk0, layer2_chunk2);
|
||||
v_g0 = _mm_unpacklo_ps(layer2_chunk1, layer2_chunk3);
|
||||
v_g1 = _mm_unpackhi_ps(layer2_chunk1, layer2_chunk3);
|
||||
}
|
||||
|
||||
inline void _mm_deinterleave_ps(__m128 & v_r0, __m128 & v_r1, __m128 & v_g0,
|
||||
__m128 & v_g1, __m128 & v_b0, __m128 & v_b1)
|
||||
{
|
||||
@ -492,6 +565,6 @@ inline void _mm_interleave_ps(__m128 & v_r0, __m128 & v_r1, __m128 & v_g0, __m12
|
||||
v_a1 = _mm_shuffle_ps(layer1_chunk6, layer1_chunk7, mask_hi);
|
||||
}
|
||||
|
||||
#endif
|
||||
#endif // CV_SSE2
|
||||
|
||||
#endif //__OPENCV_CORE_SSE_UTILS_HPP__
|
||||
|
@ -13,6 +13,7 @@
|
||||
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
|
||||
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
|
||||
// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
|
||||
// Copyright (C) 2015, Itseez Inc., all rights reserved.
|
||||
// Third party copyrights are property of their respective owners.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without modification,
|
||||
|
@ -12,6 +12,7 @@
|
||||
//
|
||||
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
|
||||
// Copyright (C) 2009-2011, Willow Garage Inc., all rights reserved.
|
||||
// Copyright (C) 2014-2015, Itseez Inc., all rights reserved.
|
||||
// Third party copyrights are property of their respective owners.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without modification,
|
||||
|
@ -12,6 +12,7 @@
|
||||
//
|
||||
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
|
||||
// Copyright (C) 2009-2011, Willow Garage Inc., all rights reserved.
|
||||
// Copyright (C) 2014-2015, Itseez Inc., all rights reserved.
|
||||
// Third party copyrights are property of their respective owners.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without modification,
|
||||
@ -62,8 +63,11 @@ template<typename T> struct VSplit4;
|
||||
|
||||
#define SPLIT2_KERNEL_TEMPLATE(name, data_type, reg_type, load_func, store_func) \
|
||||
template<> \
|
||||
struct name<data_type>{ \
|
||||
void operator()(const data_type* src, data_type* dst0, data_type* dst1){ \
|
||||
struct name<data_type> \
|
||||
{ \
|
||||
void operator()(const data_type* src, data_type* dst0, \
|
||||
data_type* dst1) const \
|
||||
{ \
|
||||
reg_type r = load_func(src); \
|
||||
store_func(dst0, r.val[0]); \
|
||||
store_func(dst1, r.val[1]); \
|
||||
@ -72,9 +76,11 @@ template<typename T> struct VSplit4;
|
||||
|
||||
#define SPLIT3_KERNEL_TEMPLATE(name, data_type, reg_type, load_func, store_func) \
|
||||
template<> \
|
||||
struct name<data_type>{ \
|
||||
struct name<data_type> \
|
||||
{ \
|
||||
void operator()(const data_type* src, data_type* dst0, data_type* dst1, \
|
||||
data_type* dst2){ \
|
||||
data_type* dst2) const \
|
||||
{ \
|
||||
reg_type r = load_func(src); \
|
||||
store_func(dst0, r.val[0]); \
|
||||
store_func(dst1, r.val[1]); \
|
||||
@ -84,9 +90,11 @@ template<typename T> struct VSplit4;
|
||||
|
||||
#define SPLIT4_KERNEL_TEMPLATE(name, data_type, reg_type, load_func, store_func) \
|
||||
template<> \
|
||||
struct name<data_type>{ \
|
||||
struct name<data_type> \
|
||||
{ \
|
||||
void operator()(const data_type* src, data_type* dst0, data_type* dst1, \
|
||||
data_type* dst2, data_type* dst3){ \
|
||||
data_type* dst2, data_type* dst3) const \
|
||||
{ \
|
||||
reg_type r = load_func(src); \
|
||||
store_func(dst0, r.val[0]); \
|
||||
store_func(dst1, r.val[1]); \
|
||||
@ -96,28 +104,174 @@ template<typename T> struct VSplit4;
|
||||
}
|
||||
|
||||
SPLIT2_KERNEL_TEMPLATE(VSplit2, uchar , uint8x16x2_t, vld2q_u8 , vst1q_u8 );
|
||||
SPLIT2_KERNEL_TEMPLATE(VSplit2, schar , int8x16x2_t, vld2q_s8 , vst1q_s8 );
|
||||
SPLIT2_KERNEL_TEMPLATE(VSplit2, ushort, uint16x8x2_t, vld2q_u16, vst1q_u16);
|
||||
SPLIT2_KERNEL_TEMPLATE(VSplit2, short , int16x8x2_t, vld2q_s16, vst1q_s16);
|
||||
SPLIT2_KERNEL_TEMPLATE(VSplit2, int , int32x4x2_t, vld2q_s32, vst1q_s32);
|
||||
SPLIT2_KERNEL_TEMPLATE(VSplit2, float , float32x4x2_t, vld2q_f32, vst1q_f32);
|
||||
SPLIT2_KERNEL_TEMPLATE(VSplit2, int64 , int64x1x2_t, vld2_s64 , vst1_s64 );
|
||||
|
||||
SPLIT3_KERNEL_TEMPLATE(VSplit3, uchar , uint8x16x3_t, vld3q_u8 , vst1q_u8 );
|
||||
SPLIT3_KERNEL_TEMPLATE(VSplit3, schar , int8x16x3_t, vld3q_s8 , vst1q_s8 );
|
||||
SPLIT3_KERNEL_TEMPLATE(VSplit3, ushort, uint16x8x3_t, vld3q_u16, vst1q_u16);
|
||||
SPLIT3_KERNEL_TEMPLATE(VSplit3, short , int16x8x3_t, vld3q_s16, vst1q_s16);
|
||||
SPLIT3_KERNEL_TEMPLATE(VSplit3, int , int32x4x3_t, vld3q_s32, vst1q_s32);
|
||||
SPLIT3_KERNEL_TEMPLATE(VSplit3, float , float32x4x3_t, vld3q_f32, vst1q_f32);
|
||||
SPLIT3_KERNEL_TEMPLATE(VSplit3, int64 , int64x1x3_t, vld3_s64 , vst1_s64 );
|
||||
|
||||
SPLIT4_KERNEL_TEMPLATE(VSplit4, uchar , uint8x16x4_t, vld4q_u8 , vst1q_u8 );
|
||||
SPLIT4_KERNEL_TEMPLATE(VSplit4, schar , int8x16x4_t, vld4q_s8 , vst1q_s8 );
|
||||
SPLIT4_KERNEL_TEMPLATE(VSplit4, ushort, uint16x8x4_t, vld4q_u16, vst1q_u16);
|
||||
SPLIT4_KERNEL_TEMPLATE(VSplit4, short , int16x8x4_t, vld4q_s16, vst1q_s16);
|
||||
SPLIT4_KERNEL_TEMPLATE(VSplit4, int , int32x4x4_t, vld4q_s32, vst1q_s32);
|
||||
SPLIT4_KERNEL_TEMPLATE(VSplit4, float , float32x4x4_t, vld4q_f32, vst1q_f32);
|
||||
SPLIT4_KERNEL_TEMPLATE(VSplit4, int64 , int64x1x4_t, vld4_s64 , vst1_s64 );
|
||||
|
||||
#elif CV_SSE2
|
||||
|
||||
template <typename T>
|
||||
struct VSplit2
|
||||
{
|
||||
VSplit2() : support(false) { }
|
||||
void operator()(const T *, T *, T *) const { }
|
||||
|
||||
bool support;
|
||||
};
|
||||
|
||||
template <typename T>
|
||||
struct VSplit3
|
||||
{
|
||||
VSplit3() : support(false) { }
|
||||
void operator()(const T *, T *, T *, T *) const { }
|
||||
|
||||
bool support;
|
||||
};
|
||||
|
||||
template <typename T>
|
||||
struct VSplit4
|
||||
{
|
||||
VSplit4() : support(false) { }
|
||||
void operator()(const T *, T *, T *, T *, T *) const { }
|
||||
|
||||
bool support;
|
||||
};
|
||||
|
||||
#define SPLIT2_KERNEL_TEMPLATE(data_type, reg_type, cast_type, _mm_deinterleave, flavor) \
|
||||
template <> \
|
||||
struct VSplit2<data_type> \
|
||||
{ \
|
||||
enum \
|
||||
{ \
|
||||
ELEMS_IN_VEC = 16 / sizeof(data_type) \
|
||||
}; \
|
||||
\
|
||||
VSplit2() \
|
||||
{ \
|
||||
support = true; \
|
||||
} \
|
||||
\
|
||||
void operator()(const data_type * src, \
|
||||
data_type * dst0, data_type * dst1) const \
|
||||
{ \
|
||||
reg_type v_src0 = _mm_loadu_##flavor((cast_type const *)(src)); \
|
||||
reg_type v_src1 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC)); \
|
||||
reg_type v_src2 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 2)); \
|
||||
reg_type v_src3 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 3)); \
|
||||
\
|
||||
_mm_deinterleave(v_src0, v_src1, v_src2, v_src3); \
|
||||
\
|
||||
_mm_storeu_##flavor((cast_type *)(dst0), v_src0); \
|
||||
_mm_storeu_##flavor((cast_type *)(dst0 + ELEMS_IN_VEC), v_src1); \
|
||||
_mm_storeu_##flavor((cast_type *)(dst1), v_src2); \
|
||||
_mm_storeu_##flavor((cast_type *)(dst1 + ELEMS_IN_VEC), v_src3); \
|
||||
} \
|
||||
\
|
||||
bool support; \
|
||||
}
|
||||
|
||||
#define SPLIT3_KERNEL_TEMPLATE(data_type, reg_type, cast_type, _mm_deinterleave, flavor) \
|
||||
template <> \
|
||||
struct VSplit3<data_type> \
|
||||
{ \
|
||||
enum \
|
||||
{ \
|
||||
ELEMS_IN_VEC = 16 / sizeof(data_type) \
|
||||
}; \
|
||||
\
|
||||
VSplit3() \
|
||||
{ \
|
||||
support = true; \
|
||||
} \
|
||||
\
|
||||
void operator()(const data_type * src, \
|
||||
data_type * dst0, data_type * dst1, data_type * dst2) const \
|
||||
{ \
|
||||
reg_type v_src0 = _mm_loadu_##flavor((cast_type const *)(src)); \
|
||||
reg_type v_src1 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC)); \
|
||||
reg_type v_src2 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 2)); \
|
||||
reg_type v_src3 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 3)); \
|
||||
reg_type v_src4 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 4)); \
|
||||
reg_type v_src5 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 5)); \
|
||||
\
|
||||
_mm_deinterleave(v_src0, v_src1, v_src2, \
|
||||
v_src3, v_src4, v_src5); \
|
||||
\
|
||||
_mm_storeu_##flavor((cast_type *)(dst0), v_src0); \
|
||||
_mm_storeu_##flavor((cast_type *)(dst0 + ELEMS_IN_VEC), v_src1); \
|
||||
_mm_storeu_##flavor((cast_type *)(dst1), v_src2); \
|
||||
_mm_storeu_##flavor((cast_type *)(dst1 + ELEMS_IN_VEC), v_src3); \
|
||||
_mm_storeu_##flavor((cast_type *)(dst2), v_src4); \
|
||||
_mm_storeu_##flavor((cast_type *)(dst2 + ELEMS_IN_VEC), v_src5); \
|
||||
} \
|
||||
\
|
||||
bool support; \
|
||||
}
|
||||
|
||||
#define SPLIT4_KERNEL_TEMPLATE(data_type, reg_type, cast_type, _mm_deinterleave, flavor) \
|
||||
template <> \
|
||||
struct VSplit4<data_type> \
|
||||
{ \
|
||||
enum \
|
||||
{ \
|
||||
ELEMS_IN_VEC = 16 / sizeof(data_type) \
|
||||
}; \
|
||||
\
|
||||
VSplit4() \
|
||||
{ \
|
||||
support = true; \
|
||||
} \
|
||||
\
|
||||
void operator()(const data_type * src, data_type * dst0, data_type * dst1, \
|
||||
data_type * dst2, data_type * dst3) const \
|
||||
{ \
|
||||
reg_type v_src0 = _mm_loadu_##flavor((cast_type const *)(src)); \
|
||||
reg_type v_src1 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC)); \
|
||||
reg_type v_src2 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 2)); \
|
||||
reg_type v_src3 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 3)); \
|
||||
reg_type v_src4 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 4)); \
|
||||
reg_type v_src5 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 5)); \
|
||||
reg_type v_src6 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 6)); \
|
||||
reg_type v_src7 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 7)); \
|
||||
\
|
||||
_mm_deinterleave(v_src0, v_src1, v_src2, v_src3, \
|
||||
v_src4, v_src5, v_src6, v_src7); \
|
||||
\
|
||||
_mm_storeu_##flavor((cast_type *)(dst0), v_src0); \
|
||||
_mm_storeu_##flavor((cast_type *)(dst0 + ELEMS_IN_VEC), v_src1); \
|
||||
_mm_storeu_##flavor((cast_type *)(dst1), v_src2); \
|
||||
_mm_storeu_##flavor((cast_type *)(dst1 + ELEMS_IN_VEC), v_src3); \
|
||||
_mm_storeu_##flavor((cast_type *)(dst2), v_src4); \
|
||||
_mm_storeu_##flavor((cast_type *)(dst2 + ELEMS_IN_VEC), v_src5); \
|
||||
_mm_storeu_##flavor((cast_type *)(dst3), v_src6); \
|
||||
_mm_storeu_##flavor((cast_type *)(dst3 + ELEMS_IN_VEC), v_src7); \
|
||||
} \
|
||||
\
|
||||
bool support; \
|
||||
}
|
||||
|
||||
SPLIT2_KERNEL_TEMPLATE( uchar, __m128i, __m128i, _mm_deinterleave_epi8, si128);
|
||||
SPLIT2_KERNEL_TEMPLATE(ushort, __m128i, __m128i, _mm_deinterleave_epi16, si128);
|
||||
SPLIT2_KERNEL_TEMPLATE( int, __m128, float, _mm_deinterleave_ps, ps);
|
||||
|
||||
SPLIT3_KERNEL_TEMPLATE( uchar, __m128i, __m128i, _mm_deinterleave_epi8, si128);
|
||||
SPLIT3_KERNEL_TEMPLATE(ushort, __m128i, __m128i, _mm_deinterleave_epi16, si128);
|
||||
SPLIT3_KERNEL_TEMPLATE( int, __m128, float, _mm_deinterleave_ps, ps);
|
||||
|
||||
SPLIT4_KERNEL_TEMPLATE( uchar, __m128i, __m128i, _mm_deinterleave_epi8, si128);
|
||||
SPLIT4_KERNEL_TEMPLATE(ushort, __m128i, __m128i, _mm_deinterleave_epi16, si128);
|
||||
SPLIT4_KERNEL_TEMPLATE( int, __m128, float, _mm_deinterleave_ps, ps);
|
||||
|
||||
#endif
|
||||
|
||||
template<typename T> static void
|
||||
@ -154,6 +308,19 @@ split_( const T* src, T** dst, int len, int cn )
|
||||
for( ; i < len - inc_i; i += inc_i, j += inc_j)
|
||||
vsplit(src + j, dst0 + i, dst1 + i);
|
||||
}
|
||||
#elif CV_SSE2
|
||||
if (cn == 2)
|
||||
{
|
||||
int inc_i = 32/sizeof(T);
|
||||
int inc_j = 2 * inc_i;
|
||||
|
||||
VSplit2<T> vsplit;
|
||||
if (vsplit.support)
|
||||
{
|
||||
for( ; i <= len - inc_i; i += inc_i, j += inc_j)
|
||||
vsplit(src + j, dst0 + i, dst1 + i);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
for( ; i < len; i++, j += cn )
|
||||
{
|
||||
@ -176,6 +343,20 @@ split_( const T* src, T** dst, int len, int cn )
|
||||
for( ; i <= len - inc_i; i += inc_i, j += inc_j)
|
||||
vsplit(src + j, dst0 + i, dst1 + i, dst2 + i);
|
||||
}
|
||||
#elif CV_SSE2
|
||||
if (cn == 3)
|
||||
{
|
||||
int inc_i = 32/sizeof(T);
|
||||
int inc_j = 3 * inc_i;
|
||||
|
||||
VSplit3<T> vsplit;
|
||||
|
||||
if (vsplit.support)
|
||||
{
|
||||
for( ; i <= len - inc_i; i += inc_i, j += inc_j)
|
||||
vsplit(src + j, dst0 + i, dst1 + i, dst2 + i);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
for( ; i < len; i++, j += cn )
|
||||
{
|
||||
@ -199,6 +380,19 @@ split_( const T* src, T** dst, int len, int cn )
|
||||
for( ; i <= len - inc_i; i += inc_i, j += inc_j)
|
||||
vsplit(src + j, dst0 + i, dst1 + i, dst2 + i, dst3 + i);
|
||||
}
|
||||
#elif CV_SSE2
|
||||
if (cn == 4)
|
||||
{
|
||||
int inc_i = 32/sizeof(T);
|
||||
int inc_j = 4 * inc_i;
|
||||
|
||||
VSplit4<T> vsplit;
|
||||
if (vsplit.support)
|
||||
{
|
||||
for( ; i <= len - inc_i; i += inc_i, j += inc_j)
|
||||
vsplit(src + j, dst0 + i, dst1 + i, dst2 + i, dst3 + i);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
for( ; i < len; i++, j += cn )
|
||||
{
|
||||
@ -265,27 +459,18 @@ template<typename T> struct VMerge4;
|
||||
}
|
||||
|
||||
MERGE2_KERNEL_TEMPLATE(VMerge2, uchar , uint8x16x2_t, vld1q_u8 , vst2q_u8 );
|
||||
MERGE2_KERNEL_TEMPLATE(VMerge2, schar , int8x16x2_t, vld1q_s8 , vst2q_s8 );
|
||||
MERGE2_KERNEL_TEMPLATE(VMerge2, ushort, uint16x8x2_t, vld1q_u16, vst2q_u16);
|
||||
MERGE2_KERNEL_TEMPLATE(VMerge2, short , int16x8x2_t, vld1q_s16, vst2q_s16);
|
||||
MERGE2_KERNEL_TEMPLATE(VMerge2, int , int32x4x2_t, vld1q_s32, vst2q_s32);
|
||||
MERGE2_KERNEL_TEMPLATE(VMerge2, float , float32x4x2_t, vld1q_f32, vst2q_f32);
|
||||
MERGE2_KERNEL_TEMPLATE(VMerge2, int64 , int64x1x2_t, vld1_s64 , vst2_s64 );
|
||||
|
||||
MERGE3_KERNEL_TEMPLATE(VMerge3, uchar , uint8x16x3_t, vld1q_u8 , vst3q_u8 );
|
||||
MERGE3_KERNEL_TEMPLATE(VMerge3, schar , int8x16x3_t, vld1q_s8 , vst3q_s8 );
|
||||
MERGE3_KERNEL_TEMPLATE(VMerge3, ushort, uint16x8x3_t, vld1q_u16, vst3q_u16);
|
||||
MERGE3_KERNEL_TEMPLATE(VMerge3, short , int16x8x3_t, vld1q_s16, vst3q_s16);
|
||||
MERGE3_KERNEL_TEMPLATE(VMerge3, int , int32x4x3_t, vld1q_s32, vst3q_s32);
|
||||
MERGE3_KERNEL_TEMPLATE(VMerge3, float , float32x4x3_t, vld1q_f32, vst3q_f32);
|
||||
MERGE3_KERNEL_TEMPLATE(VMerge3, int64 , int64x1x3_t, vld1_s64 , vst3_s64 );
|
||||
|
||||
MERGE4_KERNEL_TEMPLATE(VMerge4, uchar , uint8x16x4_t, vld1q_u8 , vst4q_u8 );
|
||||
MERGE4_KERNEL_TEMPLATE(VMerge4, schar , int8x16x4_t, vld1q_s8 , vst4q_s8 );
|
||||
MERGE4_KERNEL_TEMPLATE(VMerge4, ushort, uint16x8x4_t, vld1q_u16, vst4q_u16);
|
||||
MERGE4_KERNEL_TEMPLATE(VMerge4, short , int16x8x4_t, vld1q_s16, vst4q_s16);
|
||||
MERGE4_KERNEL_TEMPLATE(VMerge4, int , int32x4x4_t, vld1q_s32, vst4q_s32);
|
||||
MERGE4_KERNEL_TEMPLATE(VMerge4, float , float32x4x4_t, vld1q_f32, vst4q_f32);
|
||||
MERGE4_KERNEL_TEMPLATE(VMerge4, int64 , int64x1x4_t, vld1_s64 , vst4_s64 );
|
||||
#endif
|
||||
|
||||
|
@ -11,6 +11,7 @@
|
||||
//
|
||||
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
|
||||
// Copyright (C) 2009-2011, Willow Garage Inc., all rights reserved.
|
||||
// Copyright (C) 2014, Itseez Inc., all rights reserved.
|
||||
// Third party copyrights are property of their respective owners.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without modification,
|
||||
|
@ -12,6 +12,7 @@
|
||||
//
|
||||
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
|
||||
// Copyright (C) 2009-2011, Willow Garage Inc., all rights reserved.
|
||||
// Copyright (C) 2014-2015, Itseez Inc., all rights reserved.
|
||||
// Third party copyrights are property of their respective owners.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without modification,
|
||||
|
@ -12,6 +12,7 @@
|
||||
//
|
||||
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
|
||||
// Copyright (C) 2009-2011, Willow Garage Inc., all rights reserved.
|
||||
// Copyright (C) 2014-2015, Itseez Inc., all rights reserved.
|
||||
// Third party copyrights are property of their respective owners.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without modification,
|
||||
|
@ -12,6 +12,7 @@
|
||||
//
|
||||
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
|
||||
// Copyright (C) 2009-2011, Willow Garage Inc., all rights reserved.
|
||||
// Copyright (C) 2014-2015, Itseez Inc., all rights reserved.
|
||||
// Third party copyrights are property of their respective owners.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without modification,
|
||||
@ -404,13 +405,20 @@ static const uchar * initPopcountTable()
|
||||
{
|
||||
// we compute inverse popcount table,
|
||||
// since we pass (img[x] == 0) mask as index in the table.
|
||||
for( int j = 0; j < 256; j++ )
|
||||
unsigned int j = 0u;
|
||||
#if CV_POPCNT
|
||||
if (checkHardwareSupport(CV_CPU_POPCNT))
|
||||
for( ; j < 256u; j++ )
|
||||
tab[j] = (uchar)(8 - _mm_popcnt_u32(j));
|
||||
#else
|
||||
for( ; j < 256u; j++ )
|
||||
{
|
||||
int val = 0;
|
||||
for( int mask = 1; mask < 256; mask += mask )
|
||||
val += (j & mask) == 0;
|
||||
tab[j] = (uchar)val;
|
||||
}
|
||||
#endif
|
||||
initialized = true;
|
||||
}
|
||||
|
||||
|
@ -12,6 +12,7 @@
|
||||
//
|
||||
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
|
||||
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
|
||||
// Copyright (C) 2015, Itseez Inc., all rights reserved.
|
||||
// Third party copyrights are property of their respective owners.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without modification,
|
||||
|
@ -10,8 +10,7 @@
|
||||
// License Agreement
|
||||
// For Open Source Computer Vision Library
|
||||
//
|
||||
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
|
||||
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
|
||||
// Copyright (C) 2014, Itseez Inc., all rights reserved.
|
||||
// Third party copyrights are property of their respective owners.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without modification,
|
||||
|
@ -12,6 +12,7 @@
|
||||
//
|
||||
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
|
||||
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
|
||||
// Copyright (C) 2014, Itseez Inc., all rights reserved.
|
||||
// Third party copyrights are property of their respective owners.
|
||||
/
|
||||
// Redistribution and use in source and binary forms, with or without modification,
|
||||
|
@ -11,6 +11,7 @@
|
||||
// For Open Source Computer Vision Library
|
||||
//
|
||||
// Copyright (C) 2000, Intel Corporation, all rights reserved.
|
||||
// Copyright (C) 2014, Itseez Inc., all rights reserved.
|
||||
// Third party copyrights are property of their respective owners.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without modification,
|
||||
|
@ -11,6 +11,7 @@
|
||||
// For Open Source Computer Vision Library
|
||||
//
|
||||
// Copyright (C) 2013, NVIDIA Corporation, all rights reserved.
|
||||
// Copyright (C) 2014, Itseez Inc., all rights reserved.
|
||||
// Third party copyrights are property of their respective owners.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without modification,
|
||||
|
@ -12,6 +12,7 @@
|
||||
//
|
||||
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
|
||||
// Copyright (C) 2009-2010, Willow Garage Inc., all rights reserved.
|
||||
// Copyright (C) 2014-2015, Itseez Inc., all rights reserved.
|
||||
// Third party copyrights are property of their respective owners.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without modification,
|
||||
|
@ -12,6 +12,7 @@
|
||||
//
|
||||
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
|
||||
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
|
||||
// Copyright (C) 2014, Itseez Inc., all rights reserved.
|
||||
// Third party copyrights are property of their respective owners.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without modification,
|
||||
|
@ -12,6 +12,7 @@
|
||||
//
|
||||
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
|
||||
// Copyright (C) 2009-2010, Willow Garage Inc., all rights reserved.
|
||||
// Copyright (C) 2014, Itseez Inc., all rights reserved.
|
||||
// Third party copyrights are property of their respective owners.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without modification,
|
||||
|
@ -12,6 +12,7 @@
|
||||
//
|
||||
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
|
||||
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
|
||||
// Copyright (C) 2014-2015, Itseez Inc., all rights reserved.
|
||||
// Third party copyrights are property of their respective owners.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without modification,
|
||||
|
@ -12,6 +12,7 @@
|
||||
//
|
||||
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
|
||||
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
|
||||
// Copyright (C) 2014-2015, Itseez Inc., all rights reserved.
|
||||
// Third party copyrights are property of their respective owners.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without modification,
|
||||
|
@ -12,6 +12,7 @@
|
||||
//
|
||||
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
|
||||
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
|
||||
// Copyright (C) 2014-2015, Itseez Inc., all rights reserved.
|
||||
// Third party copyrights are property of their respective owners.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without modification,
|
||||
|
@ -12,6 +12,7 @@
|
||||
//
|
||||
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
|
||||
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
|
||||
// Copyright (C) 2014, Itseez Inc., all rights reserved.
|
||||
// Third party copyrights are property of their respective owners.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without modification,
|
||||
|
@ -2998,12 +2998,12 @@ void printVersionInfo(bool useStdOut)
|
||||
|
||||
std::string cpu_features;
|
||||
|
||||
#if CV_MMX
|
||||
if (checkHardwareSupport(CV_CPU_MMX)) cpu_features += " mmx";
|
||||
#endif
|
||||
#if CV_POPCNT
|
||||
if (checkHardwareSupport(CV_CPU_POPCNT)) cpu_features += " popcnt";
|
||||
#endif
|
||||
#if CV_MMX
|
||||
if (checkHardwareSupport(CV_CPU_MMX)) cpu_features += " mmx";
|
||||
#endif
|
||||
#if CV_SSE
|
||||
if (checkHardwareSupport(CV_CPU_SSE)) cpu_features += " sse";
|
||||
#endif
|
||||
|
Loading…
x
Reference in New Issue
Block a user