used popcnt

2015-01-12 10:59:30 +03:00
parent 31827d8dfe
commit fc0869735d
25 changed files with 327 additions and 45 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -221,9 +221,9 @@ OCV_OPTION(ENABLE_SSE3                "Enable SSE3 instructions"
 OCV_OPTION(ENABLE_SSSE3               "Enable SSSE3 instructions"                                OFF  IF (CMAKE_COMPILER_IS_GNUCXX AND (X86 OR X86_64)) )
 OCV_OPTION(ENABLE_SSE41               "Enable SSE4.1 instructions"                               OFF  IF ((CV_ICC OR CMAKE_COMPILER_IS_GNUCXX) AND (X86 OR X86_64)) )
 OCV_OPTION(ENABLE_SSE42               "Enable SSE4.2 instructions"                               OFF  IF (CMAKE_COMPILER_IS_GNUCXX AND (X86 OR X86_64)) )
-OCV_OPTION(ENABLE_FMA3                "Enable FMA3 instructions"                                 ON   IF (CMAKE_COMPILER_IS_GNUCXX AND (X86 OR X86_64)) )
 OCV_OPTION(ENABLE_AVX                 "Enable AVX instructions"                                  OFF  IF ((MSVC OR CMAKE_COMPILER_IS_GNUCXX) AND (X86 OR X86_64)) )
 OCV_OPTION(ENABLE_AVX2                "Enable AVX2 instructions"                                 OFF  IF ((MSVC OR CMAKE_COMPILER_IS_GNUCXX) AND (X86 OR X86_64)) )
+OCV_OPTION(ENABLE_FMA3                "Enable FMA3 instructions"                                 OFF  IF ((MSVC OR CMAKE_COMPILER_IS_GNUCXX) AND (X86 OR X86_64)) )
 OCV_OPTION(ENABLE_NEON                "Enable NEON instructions"                                 OFF  IF CMAKE_COMPILER_IS_GNUCXX AND (ARM OR IOS) )
 OCV_OPTION(ENABLE_VFPV3               "Enable VFPv3-D32 instructions"                            OFF  IF CMAKE_COMPILER_IS_GNUCXX AND (ARM OR IOS) )
 OCV_OPTION(ENABLE_NOISY_WARNINGS      "Show all warnings even if they are too noisy"             OFF )
--- a/cmake/OpenCVCompilerOptions.cmake
+++ b/cmake/OpenCVCompilerOptions.cmake
@@ -145,6 +145,10 @@ if(CMAKE_COMPILER_IS_GNUCXX)
    endif()
    if(ENABLE_AVX2)
      add_extra_compiler_option(-mavx2)
+
+      if(ENABLE_FMA3)
+        add_extra_compiler_option(-mfma)
+      endif()
    endif()

    # GCC depresses SSEx instructions when -mavx is used. Instead, it generates new AVX instructions or AVX equivalence for all SSEx instructions when needed.
@@ -165,10 +169,6 @@ if(CMAKE_COMPILER_IS_GNUCXX)
        add_extra_compiler_option(-msse4.2)
      endif()
    endif()
-
-    if(ENABLE_FMA3)
-      add_extra_compiler_option(-mfma)
-    endif()
  endif(NOT MINGW)

  if(X86 OR X86_64)
--- a/modules/core/include/opencv2/core/base.hpp
+++ b/modules/core/include/opencv2/core/base.hpp
@@ -13,6 +13,7 @@
 // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
 // Copyright (C) 2009, Willow Garage Inc., all rights reserved.
 // Copyright (C) 2013, OpenCV Foundation, all rights reserved.
+// Copyright (C) 2014, Itseez Inc., all rights reserved.
 // Third party copyrights are property of their respective owners.
 //
 // Redistribution and use in source and binary forms, with or without modification,
--- a/modules/core/include/opencv2/core/cvdef.h
+++ b/modules/core/include/opencv2/core/cvdef.h
@@ -13,6 +13,7 @@
 // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
 // Copyright (C) 2009, Willow Garage Inc., all rights reserved.
 // Copyright (C) 2013, OpenCV Foundation, all rights reserved.
+// Copyright (C) 2015, Itseez Inc., all rights reserved.
 // Third party copyrights are property of their respective owners.
 //
 // Redistribution and use in source and binary forms, with or without modification,
@@ -157,15 +158,11 @@
 #    include <nmmintrin.h>
 #    define CV_SSE4_2 1
 #  endif
-#  if defined __FMA__ || (defined _MSC_VER && _MSC_VER >= 1500)
-#    include <immintrin.h>
-#    define CV_FMA3 1
-#  endif
 #  if defined __POPCNT__ || (defined _MSC_VER && _MSC_VER >= 1500)
 #    include <popcntintrin.h>
 #    define CV_POPCNT 1
 #  endif
-#  if defined __AVX__ || defined __AVX2__ || (defined _MSC_FULL_VER && _MSC_FULL_VER >= 160040219)
+#  if defined __AVX__ || (defined _MSC_FULL_VER && _MSC_FULL_VER >= 160040219)
 // MS Visual Studio 2010 (2012?) has no macro pre-defined to identify the use of /arch:AVX
 // See: http://connect.microsoft.com/VisualStudio/feedback/details/605858/arch-avx-should-define-a-predefined-macro-in-x64-and-set-a-unique-value-for-m-ix86-fp-in-win32
 #    include <immintrin.h>
@@ -179,6 +176,9 @@
 #  if defined __AVX2__ || (defined _MSC_FULL_VER && _MSC_FULL_VER >= 160040219)
 #    include <immintrin.h>
 #    define CV_AVX2 1
+#    if defined __FMA__
+#      define CV_FMA3 1
+#    endif
 #  endif
 #endif

@@ -194,6 +194,9 @@

 #endif // __CUDACC__

+#ifndef CV_POPCNT
+#define CV_POPCNT 0
+#endif
 #ifndef CV_MMX
 #  define CV_MMX 0
 #endif
@@ -221,9 +224,6 @@
 #ifndef CV_AVX2
 #  define CV_AVX2 0
 #endif
-#ifndef CV_POPCNT
-#define CV_POPCNT 0
-#endif
 #ifndef CV_FMA3
 #  define CV_FMA3 0
 #endif
--- a/modules/core/include/opencv2/core/sse_utils.hpp
+++ b/modules/core/include/opencv2/core/sse_utils.hpp
@@ -10,7 +10,7 @@
 //                          License Agreement
 //                For Open Source Computer Vision Library
 //
-// Copyright (C) 2015, OpenCV Foundation, all rights reserved.
+// Copyright (C) 2015, Itseez Inc., all rights reserved.
 // Third party copyrights are property of their respective owners.
 //
 // Redistribution and use in source and binary forms, with or without modification,
@@ -48,6 +48,34 @@

 #if CV_SSE2

+inline void _mm_deinterleave_epi8(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0, __m128i & v_g1)
+{
+    __m128i layer1_chunk0 = _mm_unpacklo_epi8(v_r0, v_g0);
+    __m128i layer1_chunk1 = _mm_unpackhi_epi8(v_r0, v_g0);
+    __m128i layer1_chunk2 = _mm_unpacklo_epi8(v_r1, v_g1);
+    __m128i layer1_chunk3 = _mm_unpackhi_epi8(v_r1, v_g1);
+
+    __m128i layer2_chunk0 = _mm_unpacklo_epi8(layer1_chunk0, layer1_chunk2);
+    __m128i layer2_chunk1 = _mm_unpackhi_epi8(layer1_chunk0, layer1_chunk2);
+    __m128i layer2_chunk2 = _mm_unpacklo_epi8(layer1_chunk1, layer1_chunk3);
+    __m128i layer2_chunk3 = _mm_unpackhi_epi8(layer1_chunk1, layer1_chunk3);
+
+    __m128i layer3_chunk0 = _mm_unpacklo_epi8(layer2_chunk0, layer2_chunk2);
+    __m128i layer3_chunk1 = _mm_unpackhi_epi8(layer2_chunk0, layer2_chunk2);
+    __m128i layer3_chunk2 = _mm_unpacklo_epi8(layer2_chunk1, layer2_chunk3);
+    __m128i layer3_chunk3 = _mm_unpackhi_epi8(layer2_chunk1, layer2_chunk3);
+
+    __m128i layer4_chunk0 = _mm_unpacklo_epi8(layer3_chunk0, layer3_chunk2);
+    __m128i layer4_chunk1 = _mm_unpackhi_epi8(layer3_chunk0, layer3_chunk2);
+    __m128i layer4_chunk2 = _mm_unpacklo_epi8(layer3_chunk1, layer3_chunk3);
+    __m128i layer4_chunk3 = _mm_unpackhi_epi8(layer3_chunk1, layer3_chunk3);
+
+    v_r0 = _mm_unpacklo_epi8(layer4_chunk0, layer4_chunk2);
+    v_r1 = _mm_unpackhi_epi8(layer4_chunk0, layer4_chunk2);
+    v_g0 = _mm_unpacklo_epi8(layer4_chunk1, layer4_chunk3);
+    v_g1 = _mm_unpackhi_epi8(layer4_chunk1, layer4_chunk3);
+}
+
 inline void _mm_deinterleave_epi8(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0,
                                  __m128i & v_g1, __m128i & v_b0, __m128i & v_b1)
 {
@@ -228,6 +256,29 @@ inline void _mm_interleavee_epi8(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0,
    v_a1 = _mm_packus_epi16(_mm_srli_epi16(layer1_chunk6, 8), _mm_srli_epi16(layer1_chunk7, 8));
 }

+inline void _mm_deinterleave_epi16(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0, __m128i & v_g1) 
+{                                                                                      
+    __m128i layer1_chunk0 = _mm_unpacklo_epi16(v_r0, v_g0);
+    __m128i layer1_chunk1 = _mm_unpackhi_epi16(v_r0, v_g0);
+    __m128i layer1_chunk2 = _mm_unpacklo_epi16(v_r1, v_g1);
+    __m128i layer1_chunk3 = _mm_unpackhi_epi16(v_r1, v_g1);
+
+    __m128i layer2_chunk0 = _mm_unpacklo_epi16(layer1_chunk0, layer1_chunk2);
+    __m128i layer2_chunk1 = _mm_unpackhi_epi16(layer1_chunk0, layer1_chunk2);
+    __m128i layer2_chunk2 = _mm_unpacklo_epi16(layer1_chunk1, layer1_chunk3);
+    __m128i layer2_chunk3 = _mm_unpackhi_epi16(layer1_chunk1, layer1_chunk3);
+
+    __m128i layer3_chunk0 = _mm_unpacklo_epi16(layer2_chunk0, layer2_chunk2);
+    __m128i layer3_chunk1 = _mm_unpackhi_epi16(layer2_chunk0, layer2_chunk2);
+    __m128i layer3_chunk2 = _mm_unpacklo_epi16(layer2_chunk1, layer2_chunk3);
+    __m128i layer3_chunk3 = _mm_unpackhi_epi16(layer2_chunk1, layer2_chunk3);
+
+    v_r0 = _mm_unpacklo_epi16(layer3_chunk0, layer3_chunk2);
+    v_r1 = _mm_unpackhi_epi16(layer3_chunk0, layer3_chunk2);
+    v_g0 = _mm_unpacklo_epi16(layer3_chunk1, layer3_chunk3);
+    v_g1 = _mm_unpackhi_epi16(layer3_chunk1, layer3_chunk3);
+}
+
 inline void _mm_deinterleave_epi16(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0,
                                   __m128i & v_g1, __m128i & v_b0, __m128i & v_b1) 
 {                                                                                      
@@ -300,6 +351,8 @@ inline void _mm_deinterleave_epi16(__m128i & v_r0, __m128i & v_r1, __m128i & v_g
    v_a1 = _mm_unpackhi_epi16(layer3_chunk3, layer3_chunk7);
 }

+#if CV_SSE4_1
+
 inline void _mm_interleave_epi16(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0,
                                 __m128i & v_g1, __m128i & v_b0, __m128i & v_b1)
 {
@@ -376,6 +429,26 @@ inline void _mm_interleave_epi16(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0,
    v_a1 = _mm_packus_epi32(_mm_srli_epi32(layer1_chunk6, 16), _mm_srli_epi32(layer1_chunk7, 16));
 }

+#endif // CV_SSE4_1
+
+inline void _mm_deinterleave_ps(__m128 & v_r0, __m128 & v_r1, __m128 & v_g0, __m128 & v_g1)
+{                                                                      
+    __m128 layer1_chunk0 = _mm_unpacklo_ps(v_r0, v_g0);
+    __m128 layer1_chunk1 = _mm_unpackhi_ps(v_r0, v_g0);
+    __m128 layer1_chunk2 = _mm_unpacklo_ps(v_r1, v_g1);
+    __m128 layer1_chunk3 = _mm_unpackhi_ps(v_r1, v_g1);
+                                                                          
+    __m128 layer2_chunk0 = _mm_unpacklo_ps(layer1_chunk0, layer1_chunk2);
+    __m128 layer2_chunk1 = _mm_unpackhi_ps(layer1_chunk0, layer1_chunk2);
+    __m128 layer2_chunk2 = _mm_unpacklo_ps(layer1_chunk1, layer1_chunk3);
+    __m128 layer2_chunk3 = _mm_unpackhi_ps(layer1_chunk1, layer1_chunk3);
+
+    v_r0 = _mm_unpacklo_ps(layer2_chunk0, layer2_chunk2);
+    v_r1 = _mm_unpackhi_ps(layer2_chunk0, layer2_chunk2);
+    v_g0 = _mm_unpacklo_ps(layer2_chunk1, layer2_chunk3);
+    v_g1 = _mm_unpackhi_ps(layer2_chunk1, layer2_chunk3);
+}
+
 inline void _mm_deinterleave_ps(__m128 & v_r0, __m128 & v_r1, __m128 & v_g0,
                                __m128 & v_g1, __m128 & v_b0, __m128 & v_b1)
 {                                                                      
@@ -492,6 +565,6 @@ inline void _mm_interleave_ps(__m128 & v_r0, __m128 & v_r1, __m128 & v_g0, __m12
    v_a1 = _mm_shuffle_ps(layer1_chunk6, layer1_chunk7, mask_hi);
 }

-#endif
+#endif // CV_SSE2

 #endif //__OPENCV_CORE_SSE_UTILS_HPP__
--- a/modules/core/include/opencv2/core/utility.hpp
+++ b/modules/core/include/opencv2/core/utility.hpp
@@ -13,6 +13,7 @@
 // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
 // Copyright (C) 2009, Willow Garage Inc., all rights reserved.
 // Copyright (C) 2013, OpenCV Foundation, all rights reserved.
+// Copyright (C) 2015, Itseez Inc., all rights reserved.
 // Third party copyrights are property of their respective owners.
 //
 // Redistribution and use in source and binary forms, with or without modification,
--- a/modules/core/src/arithm.cpp
+++ b/modules/core/src/arithm.cpp
@@ -12,6 +12,7 @@
 //
 // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
 // Copyright (C) 2009-2011, Willow Garage Inc., all rights reserved.
+// Copyright (C) 2014-2015, Itseez Inc., all rights reserved.
 // Third party copyrights are property of their respective owners.
 //
 // Redistribution and use in source and binary forms, with or without modification,
--- a/modules/core/src/convert.cpp
+++ b/modules/core/src/convert.cpp
@@ -12,6 +12,7 @@
 //
 // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
 // Copyright (C) 2009-2011, Willow Garage Inc., all rights reserved.
+// Copyright (C) 2014-2015, Itseez Inc., all rights reserved.
 // Third party copyrights are property of their respective owners.
 //
 // Redistribution and use in source and binary forms, with or without modification,
@@ -62,8 +63,11 @@ template<typename T> struct VSplit4;

 #define SPLIT2_KERNEL_TEMPLATE(name, data_type, reg_type, load_func, store_func)  \
    template<>                                                                    \
-    struct name<data_type>{                                                       \
-        void operator()(const data_type* src, data_type* dst0, data_type* dst1){  \
+    struct name<data_type>                                                        \
+    {                                                                             \
+        void operator()(const data_type* src, data_type* dst0,                    \
+                        data_type* dst1) const                                    \
+        {                                                                         \
            reg_type r = load_func(src);                                          \
            store_func(dst0, r.val[0]);                                           \
            store_func(dst1, r.val[1]);                                           \
@@ -72,9 +76,11 @@ template<typename T> struct VSplit4;

 #define SPLIT3_KERNEL_TEMPLATE(name, data_type, reg_type, load_func, store_func)  \
    template<>                                                                    \
-    struct name<data_type>{                                                       \
+    struct name<data_type>                                                        \
+    {                                                                             \
        void operator()(const data_type* src, data_type* dst0, data_type* dst1,   \
-                        data_type* dst2){                                         \
+                        data_type* dst2) const                                    \
+        {                                                                         \
            reg_type r = load_func(src);                                          \
            store_func(dst0, r.val[0]);                                           \
            store_func(dst1, r.val[1]);                                           \
@@ -84,9 +90,11 @@ template<typename T> struct VSplit4;

 #define SPLIT4_KERNEL_TEMPLATE(name, data_type, reg_type, load_func, store_func)  \
    template<>                                                                    \
-    struct name<data_type>{                                                       \
+    struct name<data_type>                                                        \
+    {                                                                             \
        void operator()(const data_type* src, data_type* dst0, data_type* dst1,   \
-                        data_type* dst2, data_type* dst3){                        \
+                        data_type* dst2, data_type* dst3) const                   \
+        {                                                                         \
            reg_type r = load_func(src);                                          \
            store_func(dst0, r.val[0]);                                           \
            store_func(dst1, r.val[1]);                                           \
@@ -96,28 +104,174 @@ template<typename T> struct VSplit4;
    }

 SPLIT2_KERNEL_TEMPLATE(VSplit2, uchar ,  uint8x16x2_t, vld2q_u8 , vst1q_u8 );
-SPLIT2_KERNEL_TEMPLATE(VSplit2, schar ,   int8x16x2_t, vld2q_s8 , vst1q_s8 );
 SPLIT2_KERNEL_TEMPLATE(VSplit2, ushort,  uint16x8x2_t, vld2q_u16, vst1q_u16);
-SPLIT2_KERNEL_TEMPLATE(VSplit2, short ,   int16x8x2_t, vld2q_s16, vst1q_s16);
 SPLIT2_KERNEL_TEMPLATE(VSplit2, int   ,   int32x4x2_t, vld2q_s32, vst1q_s32);
-SPLIT2_KERNEL_TEMPLATE(VSplit2, float , float32x4x2_t, vld2q_f32, vst1q_f32);
 SPLIT2_KERNEL_TEMPLATE(VSplit2, int64 ,   int64x1x2_t, vld2_s64 , vst1_s64 );

 SPLIT3_KERNEL_TEMPLATE(VSplit3, uchar ,  uint8x16x3_t, vld3q_u8 , vst1q_u8 );
-SPLIT3_KERNEL_TEMPLATE(VSplit3, schar ,   int8x16x3_t, vld3q_s8 , vst1q_s8 );
 SPLIT3_KERNEL_TEMPLATE(VSplit3, ushort,  uint16x8x3_t, vld3q_u16, vst1q_u16);
-SPLIT3_KERNEL_TEMPLATE(VSplit3, short ,   int16x8x3_t, vld3q_s16, vst1q_s16);
 SPLIT3_KERNEL_TEMPLATE(VSplit3, int   ,   int32x4x3_t, vld3q_s32, vst1q_s32);
-SPLIT3_KERNEL_TEMPLATE(VSplit3, float , float32x4x3_t, vld3q_f32, vst1q_f32);
 SPLIT3_KERNEL_TEMPLATE(VSplit3, int64 ,   int64x1x3_t, vld3_s64 , vst1_s64 );

 SPLIT4_KERNEL_TEMPLATE(VSplit4, uchar ,  uint8x16x4_t, vld4q_u8 , vst1q_u8 );
-SPLIT4_KERNEL_TEMPLATE(VSplit4, schar ,   int8x16x4_t, vld4q_s8 , vst1q_s8 );
 SPLIT4_KERNEL_TEMPLATE(VSplit4, ushort,  uint16x8x4_t, vld4q_u16, vst1q_u16);
-SPLIT4_KERNEL_TEMPLATE(VSplit4, short ,   int16x8x4_t, vld4q_s16, vst1q_s16);
 SPLIT4_KERNEL_TEMPLATE(VSplit4, int   ,   int32x4x4_t, vld4q_s32, vst1q_s32);
-SPLIT4_KERNEL_TEMPLATE(VSplit4, float , float32x4x4_t, vld4q_f32, vst1q_f32);
 SPLIT4_KERNEL_TEMPLATE(VSplit4, int64 ,   int64x1x4_t, vld4_s64 , vst1_s64 );
+
+#elif CV_SSE2
+
+template <typename T>
+struct VSplit2
+{
+    VSplit2() : support(false) { }
+    void operator()(const T *, T *, T *) const { }
+
+    bool support;
+};
+
+template <typename T>
+struct VSplit3
+{
+    VSplit3() : support(false) { }
+    void operator()(const T *, T *, T *, T *) const { }
+
+    bool support;
+};
+
+template <typename T>
+struct VSplit4
+{
+    VSplit4() : support(false) { }
+    void operator()(const T *, T *, T *, T *, T *) const { }
+
+    bool support;
+};
+
+#define SPLIT2_KERNEL_TEMPLATE(data_type, reg_type, cast_type, _mm_deinterleave, flavor)   \
+template <>                                                                                \
+struct VSplit2<data_type>                                                                  \
+{                                                                                          \
+    enum                                                                                   \
+    {                                                                                      \
+        ELEMS_IN_VEC = 16 / sizeof(data_type)                                              \
+    };                                                                                     \
+                                                                                           \
+    VSplit2()                                                                              \
+    {                                                                                      \
+        support = true;                                                                    \
+    }                                                                                      \
+                                                                                           \
+    void operator()(const data_type * src,                                                 \
+                    data_type * dst0, data_type * dst1) const                              \
+    {                                                                                      \
+        reg_type v_src0 = _mm_loadu_##flavor((cast_type const *)(src));                    \
+        reg_type v_src1 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC));     \
+        reg_type v_src2 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 2)); \
+        reg_type v_src3 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 3)); \
+                                                                                           \
+        _mm_deinterleave(v_src0, v_src1, v_src2, v_src3);                                  \
+                                                                                           \
+        _mm_storeu_##flavor((cast_type *)(dst0), v_src0);                                  \
+        _mm_storeu_##flavor((cast_type *)(dst0 + ELEMS_IN_VEC), v_src1);                   \
+        _mm_storeu_##flavor((cast_type *)(dst1), v_src2);                                  \
+        _mm_storeu_##flavor((cast_type *)(dst1 + ELEMS_IN_VEC), v_src3);                   \
+    }                                                                                      \
+                                                                                           \
+    bool support;                                                                          \
+}
+
+#define SPLIT3_KERNEL_TEMPLATE(data_type, reg_type, cast_type, _mm_deinterleave, flavor)   \
+template <>                                                                                \
+struct VSplit3<data_type>                                                                  \
+{                                                                                          \
+    enum                                                                                   \
+    {                                                                                      \
+        ELEMS_IN_VEC = 16 / sizeof(data_type)                                              \
+    };                                                                                     \
+                                                                                           \
+    VSplit3()                                                                              \
+    {                                                                                      \
+        support = true;                                                                    \
+    }                                                                                      \
+                                                                                           \
+    void operator()(const data_type * src,                                                 \
+                    data_type * dst0, data_type * dst1, data_type * dst2) const            \
+    {                                                                                      \
+        reg_type v_src0 = _mm_loadu_##flavor((cast_type const *)(src));                    \
+        reg_type v_src1 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC));     \
+        reg_type v_src2 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 2)); \
+        reg_type v_src3 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 3)); \
+        reg_type v_src4 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 4)); \
+        reg_type v_src5 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 5)); \
+                                                                                           \
+        _mm_deinterleave(v_src0, v_src1, v_src2,                                           \
+                         v_src3, v_src4, v_src5);                                          \
+                                                                                           \
+        _mm_storeu_##flavor((cast_type *)(dst0), v_src0);                                  \
+        _mm_storeu_##flavor((cast_type *)(dst0 + ELEMS_IN_VEC), v_src1);                   \
+        _mm_storeu_##flavor((cast_type *)(dst1), v_src2);                                  \
+        _mm_storeu_##flavor((cast_type *)(dst1 + ELEMS_IN_VEC), v_src3);                   \
+        _mm_storeu_##flavor((cast_type *)(dst2), v_src4);                                  \
+        _mm_storeu_##flavor((cast_type *)(dst2 + ELEMS_IN_VEC), v_src5);                   \
+    }                                                                                      \
+                                                                                           \
+    bool support;                                                                          \
+}
+
+#define SPLIT4_KERNEL_TEMPLATE(data_type, reg_type, cast_type, _mm_deinterleave, flavor)   \
+template <>                                                                                \
+struct VSplit4<data_type>                                                                  \
+{                                                                                          \
+    enum                                                                                   \
+    {                                                                                      \
+        ELEMS_IN_VEC = 16 / sizeof(data_type)                                              \
+    };                                                                                     \
+                                                                                           \
+    VSplit4()                                                                              \
+    {                                                                                      \
+        support = true;                                                                    \
+    }                                                                                      \
+                                                                                           \
+    void operator()(const data_type * src, data_type * dst0, data_type * dst1,             \
+                    data_type * dst2, data_type * dst3) const                              \
+    {                                                                                      \
+        reg_type v_src0 = _mm_loadu_##flavor((cast_type const *)(src));                    \
+        reg_type v_src1 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC));     \
+        reg_type v_src2 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 2)); \
+        reg_type v_src3 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 3)); \
+        reg_type v_src4 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 4)); \
+        reg_type v_src5 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 5)); \
+        reg_type v_src6 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 6)); \
+        reg_type v_src7 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 7)); \
+                                                                                           \
+        _mm_deinterleave(v_src0, v_src1, v_src2, v_src3,                                   \
+                         v_src4, v_src5, v_src6, v_src7);                                  \
+                                                                                           \
+        _mm_storeu_##flavor((cast_type *)(dst0), v_src0);                                  \
+        _mm_storeu_##flavor((cast_type *)(dst0 + ELEMS_IN_VEC), v_src1);                   \
+        _mm_storeu_##flavor((cast_type *)(dst1), v_src2);                                  \
+        _mm_storeu_##flavor((cast_type *)(dst1 + ELEMS_IN_VEC), v_src3);                   \
+        _mm_storeu_##flavor((cast_type *)(dst2), v_src4);                                  \
+        _mm_storeu_##flavor((cast_type *)(dst2 + ELEMS_IN_VEC), v_src5);                   \
+        _mm_storeu_##flavor((cast_type *)(dst3), v_src6);                                  \
+        _mm_storeu_##flavor((cast_type *)(dst3 + ELEMS_IN_VEC), v_src7);                   \
+    }                                                                                      \
+                                                                                           \
+    bool support;                                                                          \
+}
+
+SPLIT2_KERNEL_TEMPLATE( uchar, __m128i, __m128i, _mm_deinterleave_epi8, si128);
+SPLIT2_KERNEL_TEMPLATE(ushort, __m128i, __m128i, _mm_deinterleave_epi16, si128);
+SPLIT2_KERNEL_TEMPLATE(   int,  __m128,   float, _mm_deinterleave_ps, ps);
+
+SPLIT3_KERNEL_TEMPLATE( uchar, __m128i, __m128i, _mm_deinterleave_epi8, si128);
+SPLIT3_KERNEL_TEMPLATE(ushort, __m128i, __m128i, _mm_deinterleave_epi16, si128);
+SPLIT3_KERNEL_TEMPLATE(   int,  __m128,   float, _mm_deinterleave_ps, ps);
+
+SPLIT4_KERNEL_TEMPLATE( uchar, __m128i, __m128i, _mm_deinterleave_epi8, si128);
+SPLIT4_KERNEL_TEMPLATE(ushort, __m128i, __m128i, _mm_deinterleave_epi16, si128);
+SPLIT4_KERNEL_TEMPLATE(   int,  __m128,   float, _mm_deinterleave_ps, ps);
+
 #endif

 template<typename T> static void
@@ -154,6 +308,19 @@ split_( const T* src, T** dst, int len, int cn )
            for( ; i < len - inc_i; i += inc_i, j += inc_j)
                vsplit(src + j, dst0 + i, dst1 + i);
        }
+#elif CV_SSE2
+        if (cn == 2)
+        {
+            int inc_i = 32/sizeof(T);
+            int inc_j = 2 * inc_i;
+
+            VSplit2<T> vsplit;
+            if (vsplit.support)
+            {
+                for( ; i <= len - inc_i; i += inc_i, j += inc_j)
+                    vsplit(src + j, dst0 + i, dst1 + i);
+            }
+        }
 #endif
        for( ; i < len; i++, j += cn )
        {
@@ -176,6 +343,20 @@ split_( const T* src, T** dst, int len, int cn )
            for( ; i <= len - inc_i; i += inc_i, j += inc_j)
                vsplit(src + j, dst0 + i, dst1 + i, dst2 + i);
        }
+#elif CV_SSE2
+        if (cn == 3)
+        {
+            int inc_i = 32/sizeof(T);
+            int inc_j = 3 * inc_i;
+
+            VSplit3<T> vsplit;
+
+            if (vsplit.support)
+            {
+                for( ; i <= len - inc_i; i += inc_i, j += inc_j)
+                    vsplit(src + j, dst0 + i, dst1 + i, dst2 + i);
+            }
+        }
 #endif
        for( ; i < len; i++, j += cn )
        {
@@ -199,6 +380,19 @@ split_( const T* src, T** dst, int len, int cn )
            for( ; i <= len - inc_i; i += inc_i, j += inc_j)
                vsplit(src + j, dst0 + i, dst1 + i, dst2 + i, dst3 + i);
        }
+#elif CV_SSE2
+        if (cn == 4)
+        {
+            int inc_i = 32/sizeof(T);
+            int inc_j = 4 * inc_i;
+
+            VSplit4<T> vsplit;
+            if (vsplit.support)
+            {
+                for( ; i <= len - inc_i; i += inc_i, j += inc_j)
+                    vsplit(src + j, dst0 + i, dst1 + i, dst2 + i, dst3 + i);
+            }
+        }
 #endif
        for( ; i < len; i++, j += cn )
        {
@@ -265,27 +459,18 @@ template<typename T> struct VMerge4;
    }

 MERGE2_KERNEL_TEMPLATE(VMerge2, uchar ,  uint8x16x2_t, vld1q_u8 , vst2q_u8 );
-MERGE2_KERNEL_TEMPLATE(VMerge2, schar ,   int8x16x2_t, vld1q_s8 , vst2q_s8 );
 MERGE2_KERNEL_TEMPLATE(VMerge2, ushort,  uint16x8x2_t, vld1q_u16, vst2q_u16);
-MERGE2_KERNEL_TEMPLATE(VMerge2, short ,   int16x8x2_t, vld1q_s16, vst2q_s16);
 MERGE2_KERNEL_TEMPLATE(VMerge2, int   ,   int32x4x2_t, vld1q_s32, vst2q_s32);
-MERGE2_KERNEL_TEMPLATE(VMerge2, float , float32x4x2_t, vld1q_f32, vst2q_f32);
 MERGE2_KERNEL_TEMPLATE(VMerge2, int64 ,   int64x1x2_t, vld1_s64 , vst2_s64 );

 MERGE3_KERNEL_TEMPLATE(VMerge3, uchar ,  uint8x16x3_t, vld1q_u8 , vst3q_u8 );
-MERGE3_KERNEL_TEMPLATE(VMerge3, schar ,   int8x16x3_t, vld1q_s8 , vst3q_s8 );
 MERGE3_KERNEL_TEMPLATE(VMerge3, ushort,  uint16x8x3_t, vld1q_u16, vst3q_u16);
-MERGE3_KERNEL_TEMPLATE(VMerge3, short ,   int16x8x3_t, vld1q_s16, vst3q_s16);
 MERGE3_KERNEL_TEMPLATE(VMerge3, int   ,   int32x4x3_t, vld1q_s32, vst3q_s32);
-MERGE3_KERNEL_TEMPLATE(VMerge3, float , float32x4x3_t, vld1q_f32, vst3q_f32);
 MERGE3_KERNEL_TEMPLATE(VMerge3, int64 ,   int64x1x3_t, vld1_s64 , vst3_s64 );

 MERGE4_KERNEL_TEMPLATE(VMerge4, uchar ,  uint8x16x4_t, vld1q_u8 , vst4q_u8 );
-MERGE4_KERNEL_TEMPLATE(VMerge4, schar ,   int8x16x4_t, vld1q_s8 , vst4q_s8 );
 MERGE4_KERNEL_TEMPLATE(VMerge4, ushort,  uint16x8x4_t, vld1q_u16, vst4q_u16);
-MERGE4_KERNEL_TEMPLATE(VMerge4, short ,   int16x8x4_t, vld1q_s16, vst4q_s16);
 MERGE4_KERNEL_TEMPLATE(VMerge4, int   ,   int32x4x4_t, vld1q_s32, vst4q_s32);
-MERGE4_KERNEL_TEMPLATE(VMerge4, float , float32x4x4_t, vld1q_f32, vst4q_f32);
 MERGE4_KERNEL_TEMPLATE(VMerge4, int64 ,   int64x1x4_t, vld1_s64 , vst4_s64 );
 #endif

--- a/modules/core/src/copy.cpp
+++ b/modules/core/src/copy.cpp
@@ -11,6 +11,7 @@
 //
 // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
 // Copyright (C) 2009-2011, Willow Garage Inc., all rights reserved.
+// Copyright (C) 2014, Itseez Inc., all rights reserved.
 // Third party copyrights are property of their respective owners.
 //
 // Redistribution and use in source and binary forms, with or without modification,
--- a/modules/core/src/mathfuncs.cpp
+++ b/modules/core/src/mathfuncs.cpp
@@ -12,6 +12,7 @@
 //
 // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
 // Copyright (C) 2009-2011, Willow Garage Inc., all rights reserved.
+// Copyright (C) 2014-2015, Itseez Inc., all rights reserved.
 // Third party copyrights are property of their respective owners.
 //
 // Redistribution and use in source and binary forms, with or without modification,
--- a/modules/core/src/matmul.cpp
+++ b/modules/core/src/matmul.cpp
@@ -12,6 +12,7 @@
 //
 // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
 // Copyright (C) 2009-2011, Willow Garage Inc., all rights reserved.
+// Copyright (C) 2014-2015, Itseez Inc., all rights reserved.
 // Third party copyrights are property of their respective owners.
 //
 // Redistribution and use in source and binary forms, with or without modification,
--- a/modules/core/src/stat.cpp
+++ b/modules/core/src/stat.cpp
@@ -12,6 +12,7 @@
 //
 // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
 // Copyright (C) 2009-2011, Willow Garage Inc., all rights reserved.
+// Copyright (C) 2014-2015, Itseez Inc., all rights reserved.
 // Third party copyrights are property of their respective owners.
 //
 // Redistribution and use in source and binary forms, with or without modification,
@@ -404,13 +405,20 @@ static const uchar * initPopcountTable()
    {
        // we compute inverse popcount table,
        // since we pass (img[x] == 0) mask as index in the table.
-        for( int j = 0; j < 256; j++ )
+        unsigned int j = 0u;
+#if CV_POPCNT
+        if (checkHardwareSupport(CV_CPU_POPCNT))
+            for( ; j < 256u; j++ )
+                tab[j] = (uchar)(8 - _mm_popcnt_u32(j));
+#else
+        for( ; j < 256u; j++ )
        {
            int val = 0;
            for( int mask = 1; mask < 256; mask += mask )
                val += (j & mask) == 0;
            tab[j] = (uchar)val;
        }
+#endif
        initialized = true;
    }

--- a/modules/core/src/system.cpp
+++ b/modules/core/src/system.cpp
@@ -12,6 +12,7 @@
 //
 // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
 // Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Copyright (C) 2015, Itseez Inc., all rights reserved.
 // Third party copyrights are property of their respective owners.
 //
 // Redistribution and use in source and binary forms, with or without modification,
--- a/modules/core/src/umatrix.cpp
+++ b/modules/core/src/umatrix.cpp
@@ -10,8 +10,7 @@
 //                           License Agreement
 //                For Open Source Computer Vision Library
 //
-// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
-// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Copyright (C) 2014, Itseez Inc., all rights reserved.
 // Third party copyrights are property of their respective owners.
 //
 // Redistribution and use in source and binary forms, with or without modification,
--- a/modules/imgproc/src/accum.cpp
+++ b/modules/imgproc/src/accum.cpp
@@ -12,6 +12,7 @@
 //
 // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
 // Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Copyright (C) 2014, Itseez Inc., all rights reserved.
 // Third party copyrights are property of their respective owners.
 /
 // Redistribution and use in source and binary forms, with or without modification,
--- a/modules/imgproc/src/canny.cpp
+++ b/modules/imgproc/src/canny.cpp
@@ -11,6 +11,7 @@
 //                For Open Source Computer Vision Library
 //
 // Copyright (C) 2000, Intel Corporation, all rights reserved.
+// Copyright (C) 2014, Itseez Inc., all rights reserved.
 // Third party copyrights are property of their respective owners.
 //
 // Redistribution and use in source and binary forms, with or without modification,
--- a/modules/imgproc/src/clahe.cpp
+++ b/modules/imgproc/src/clahe.cpp
@@ -11,6 +11,7 @@
 //                For Open Source Computer Vision Library
 //
 // Copyright (C) 2013, NVIDIA Corporation, all rights reserved.
+// Copyright (C) 2014, Itseez Inc., all rights reserved.
 // Third party copyrights are property of their respective owners.
 //
 // Redistribution and use in source and binary forms, with or without modification,
--- a/modules/imgproc/src/color.cpp
+++ b/modules/imgproc/src/color.cpp
@@ -12,6 +12,7 @@
 //
 // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
 // Copyright (C) 2009-2010, Willow Garage Inc., all rights reserved.
+// Copyright (C) 2014-2015, Itseez Inc., all rights reserved.
 // Third party copyrights are property of their respective owners.
 //
 // Redistribution and use in source and binary forms, with or without modification,
--- a/modules/imgproc/src/corner.cpp
+++ b/modules/imgproc/src/corner.cpp
@@ -12,6 +12,7 @@
 //
 // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
 // Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Copyright (C) 2014, Itseez Inc., all rights reserved.
 // Third party copyrights are property of their respective owners.
 //
 // Redistribution and use in source and binary forms, with or without modification,
--- a/modules/imgproc/src/demosaicing.cpp
+++ b/modules/imgproc/src/demosaicing.cpp
@@ -12,6 +12,7 @@
 //
 // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
 // Copyright (C) 2009-2010, Willow Garage Inc., all rights reserved.
+// Copyright (C) 2014, Itseez Inc., all rights reserved.
 // Third party copyrights are property of their respective owners.
 //
 // Redistribution and use in source and binary forms, with or without modification,
--- a/modules/imgproc/src/imgwarp.cpp
+++ b/modules/imgproc/src/imgwarp.cpp
@@ -12,6 +12,7 @@
 //
 // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
 // Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Copyright (C) 2014-2015, Itseez Inc., all rights reserved.
 // Third party copyrights are property of their respective owners.
 //
 // Redistribution and use in source and binary forms, with or without modification,
--- a/modules/imgproc/src/pyramids.cpp
+++ b/modules/imgproc/src/pyramids.cpp
@@ -12,6 +12,7 @@
 //
 // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
 // Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Copyright (C) 2014-2015, Itseez Inc., all rights reserved.
 // Third party copyrights are property of their respective owners.
 //
 // Redistribution and use in source and binary forms, with or without modification,
--- a/modules/imgproc/src/smooth.cpp
+++ b/modules/imgproc/src/smooth.cpp
@@ -12,6 +12,7 @@
 //
 // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
 // Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Copyright (C) 2014-2015, Itseez Inc., all rights reserved.
 // Third party copyrights are property of their respective owners.
 //
 // Redistribution and use in source and binary forms, with or without modification,
--- a/modules/imgproc/src/sumpixels.cpp
+++ b/modules/imgproc/src/sumpixels.cpp
@@ -12,6 +12,7 @@
 //
 // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
 // Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Copyright (C) 2014, Itseez Inc., all rights reserved.
 // Third party copyrights are property of their respective owners.
 //
 // Redistribution and use in source and binary forms, with or without modification,
--- a/modules/ts/src/ts_func.cpp
+++ b/modules/ts/src/ts_func.cpp
@@ -2998,12 +2998,12 @@ void printVersionInfo(bool useStdOut)

    std::string cpu_features;

-#if CV_MMX
-    if (checkHardwareSupport(CV_CPU_MMX)) cpu_features += " mmx";
-#endif
 #if CV_POPCNT
    if (checkHardwareSupport(CV_CPU_POPCNT)) cpu_features += " popcnt";
 #endif
+#if CV_MMX
+    if (checkHardwareSupport(CV_CPU_MMX)) cpu_features += " mmx";
+#endif
 #if CV_SSE
    if (checkHardwareSupport(CV_CPU_SSE)) cpu_features += " sse";
 #endif