Merge pull request #3591 from ilya-lavrenov:sse_avx

This commit is contained in:
Vadim Pisarevsky 2015-01-21 10:46:23 +00:00
commit 03fc3d1ceb
31 changed files with 7617 additions and 205 deletions

View File

@ -216,11 +216,14 @@ OCV_OPTION(ENABLE_POWERPC "Enable PowerPC for GCC"
OCV_OPTION(ENABLE_FAST_MATH "Enable -ffast-math (not recommended for GCC 4.6.x)" OFF IF (CMAKE_COMPILER_IS_GNUCXX AND (X86 OR X86_64)) ) OCV_OPTION(ENABLE_FAST_MATH "Enable -ffast-math (not recommended for GCC 4.6.x)" OFF IF (CMAKE_COMPILER_IS_GNUCXX AND (X86 OR X86_64)) )
OCV_OPTION(ENABLE_SSE "Enable SSE instructions" ON IF ((MSVC OR CMAKE_COMPILER_IS_GNUCXX) AND (X86 OR X86_64)) ) OCV_OPTION(ENABLE_SSE "Enable SSE instructions" ON IF ((MSVC OR CMAKE_COMPILER_IS_GNUCXX) AND (X86 OR X86_64)) )
OCV_OPTION(ENABLE_SSE2 "Enable SSE2 instructions" ON IF ((MSVC OR CMAKE_COMPILER_IS_GNUCXX) AND (X86 OR X86_64)) ) OCV_OPTION(ENABLE_SSE2 "Enable SSE2 instructions" ON IF ((MSVC OR CMAKE_COMPILER_IS_GNUCXX) AND (X86 OR X86_64)) )
OCV_OPTION(ENABLE_SSE3 "Enable SSE3 instructions" ON IF ((CV_ICC OR CMAKE_COMPILER_IS_GNUCXX) AND (X86 OR X86_64)) ) OCV_OPTION(ENABLE_SSE3 "Enable SSE3 instructions" ON IF ((MSVC OR CMAKE_COMPILER_IS_GNUCXX OR CV_ICC) AND (X86 OR X86_64)) )
OCV_OPTION(ENABLE_SSSE3 "Enable SSSE3 instructions" OFF IF (CMAKE_COMPILER_IS_GNUCXX AND (X86 OR X86_64)) ) OCV_OPTION(ENABLE_SSSE3 "Enable SSSE3 instructions" OFF IF ((MSVC OR CMAKE_COMPILER_IS_GNUCXX) AND (X86 OR X86_64)) )
OCV_OPTION(ENABLE_SSE41 "Enable SSE4.1 instructions" OFF IF ((CV_ICC OR CMAKE_COMPILER_IS_GNUCXX) AND (X86 OR X86_64)) ) OCV_OPTION(ENABLE_SSE41 "Enable SSE4.1 instructions" OFF IF ((MSVC OR CMAKE_COMPILER_IS_GNUCXX OR CV_ICC) AND (X86 OR X86_64)) )
OCV_OPTION(ENABLE_SSE42 "Enable SSE4.2 instructions" OFF IF (CMAKE_COMPILER_IS_GNUCXX AND (X86 OR X86_64)) ) OCV_OPTION(ENABLE_SSE42 "Enable SSE4.2 instructions" OFF IF ((MSVC OR CMAKE_COMPILER_IS_GNUCXX) AND (X86 OR X86_64)) )
OCV_OPTION(ENABLE_POPCNT "Enable POPCNT instructions" OFF IF ((MSVC OR CMAKE_COMPILER_IS_GNUCXX) AND (X86 OR X86_64)) )
OCV_OPTION(ENABLE_AVX "Enable AVX instructions" OFF IF ((MSVC OR CMAKE_COMPILER_IS_GNUCXX) AND (X86 OR X86_64)) ) OCV_OPTION(ENABLE_AVX "Enable AVX instructions" OFF IF ((MSVC OR CMAKE_COMPILER_IS_GNUCXX) AND (X86 OR X86_64)) )
OCV_OPTION(ENABLE_AVX2 "Enable AVX2 instructions" OFF IF ((MSVC OR CMAKE_COMPILER_IS_GNUCXX) AND (X86 OR X86_64)) )
OCV_OPTION(ENABLE_FMA3 "Enable FMA3 instructions" OFF IF ((MSVC OR CMAKE_COMPILER_IS_GNUCXX) AND (X86 OR X86_64)) )
OCV_OPTION(ENABLE_NEON "Enable NEON instructions" OFF IF CMAKE_COMPILER_IS_GNUCXX AND (ARM OR IOS) ) OCV_OPTION(ENABLE_NEON "Enable NEON instructions" OFF IF CMAKE_COMPILER_IS_GNUCXX AND (ARM OR IOS) )
OCV_OPTION(ENABLE_VFPV3 "Enable VFPv3-D32 instructions" OFF IF CMAKE_COMPILER_IS_GNUCXX AND (ARM OR IOS) ) OCV_OPTION(ENABLE_VFPV3 "Enable VFPv3-D32 instructions" OFF IF CMAKE_COMPILER_IS_GNUCXX AND (ARM OR IOS) )
OCV_OPTION(ENABLE_NOISY_WARNINGS "Show all warnings even if they are too noisy" OFF ) OCV_OPTION(ENABLE_NOISY_WARNINGS "Show all warnings even if they are too noisy" OFF )

View File

@ -128,10 +128,10 @@ if(CMAKE_COMPILER_IS_GNUCXX)
if(ENABLE_SSE2) if(ENABLE_SSE2)
add_extra_compiler_option(-msse2) add_extra_compiler_option(-msse2)
endif() endif()
if (ENABLE_NEON) if(ENABLE_NEON)
add_extra_compiler_option("-mfpu=neon") add_extra_compiler_option("-mfpu=neon")
endif() endif()
if (ENABLE_VFPV3 AND NOT ENABLE_NEON) if(ENABLE_VFPV3 AND NOT ENABLE_NEON)
add_extra_compiler_option("-mfpu=vfpv3") add_extra_compiler_option("-mfpu=vfpv3")
endif() endif()
@ -140,6 +140,13 @@ if(CMAKE_COMPILER_IS_GNUCXX)
if(ENABLE_AVX) if(ENABLE_AVX)
add_extra_compiler_option(-mavx) add_extra_compiler_option(-mavx)
endif() endif()
if(ENABLE_AVX2)
add_extra_compiler_option(-mavx2)
if(ENABLE_FMA3)
add_extra_compiler_option(-mfma)
endif()
endif()
# GCC suppresses SSEx instructions when -mavx is used. Instead, it generates new AVX instructions or AVX equivalents for all SSEx instructions when needed. # GCC suppresses SSEx instructions when -mavx is used. Instead, it generates new AVX instructions or AVX equivalents for all SSEx instructions when needed.
if(NOT OPENCV_EXTRA_CXX_FLAGS MATCHES "-mavx") if(NOT OPENCV_EXTRA_CXX_FLAGS MATCHES "-mavx")
@ -158,6 +165,10 @@ if(CMAKE_COMPILER_IS_GNUCXX)
if(ENABLE_SSE42) if(ENABLE_SSE42)
add_extra_compiler_option(-msse4.2) add_extra_compiler_option(-msse4.2)
endif() endif()
if(ENABLE_POPCNT)
add_extra_compiler_option(-mpopcnt)
endif()
endif() endif()
endif(NOT MINGW) endif(NOT MINGW)
@ -214,7 +225,10 @@ if(MSVC)
set(OPENCV_EXTRA_FLAGS_RELEASE "${OPENCV_EXTRA_FLAGS_RELEASE} /Zi") set(OPENCV_EXTRA_FLAGS_RELEASE "${OPENCV_EXTRA_FLAGS_RELEASE} /Zi")
endif() endif()
if(ENABLE_AVX AND NOT MSVC_VERSION LESS 1600) if(ENABLE_AVX2 AND NOT MSVC_VERSION LESS 1800)
set(OPENCV_EXTRA_FLAGS "${OPENCV_EXTRA_FLAGS} /arch:AVX2")
endif()
if(ENABLE_AVX AND NOT MSVC_VERSION LESS 1600 AND NOT OPENCV_EXTRA_FLAGS MATCHES "/arch:")
set(OPENCV_EXTRA_FLAGS "${OPENCV_EXTRA_FLAGS} /arch:AVX") set(OPENCV_EXTRA_FLAGS "${OPENCV_EXTRA_FLAGS} /arch:AVX")
endif() endif()
@ -236,7 +250,7 @@ if(MSVC)
endif() endif()
endif() endif()
# Append /Oi to the MSVC flags when any of the listed SIMD options is on. The post-change side tests ENABLE_SSE41, the name the SSE4.1 option is declared under (ENABLE_SSE4_1 is never defined as an option, so it was always false).
if(ENABLE_SSE OR ENABLE_SSE2 OR ENABLE_SSE3 OR ENABLE_SSE4_1 OR ENABLE_AVX) if(ENABLE_SSE OR ENABLE_SSE2 OR ENABLE_SSE3 OR ENABLE_SSE41 OR ENABLE_AVX OR ENABLE_AVX2)
set(OPENCV_EXTRA_FLAGS "${OPENCV_EXTRA_FLAGS} /Oi") set(OPENCV_EXTRA_FLAGS "${OPENCV_EXTRA_FLAGS} /Oi")
endif() endif()
@ -308,6 +322,7 @@ if(MSVC)
endforeach() endforeach()
if(NOT ENABLE_NOISY_WARNINGS) if(NOT ENABLE_NOISY_WARNINGS)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /wd4251") #class 'std::XXX' needs to have dll-interface to be used by clients of YYY ocv_warnings_disable(CMAKE_CXX_FLAGS /wd4251) # class 'std::XXX' needs to have dll-interface to be used by clients of YYY
ocv_warnings_disable(CMAKE_CXX_FLAGS /wd4324) # 'struct_name' : structure was padded due to __declspec(align())
endif() endif()
endif() endif()

View File

@ -13,6 +13,7 @@
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved. // Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Copyright (C) 2013, OpenCV Foundation, all rights reserved. // Copyright (C) 2013, OpenCV Foundation, all rights reserved.
// Copyright (C) 2014, Itseez Inc., all rights reserved.
// Third party copyrights are property of their respective owners. // Third party copyrights are property of their respective owners.
// //
// Redistribution and use in source and binary forms, with or without modification, // Redistribution and use in source and binary forms, with or without modification,
@ -813,4 +814,6 @@ inline float32x2_t cv_vsqrt_f32(float32x2_t val)
} // cv } // cv
#include "sse_utils.hpp"
#endif //__OPENCV_CORE_BASE_HPP__ #endif //__OPENCV_CORE_BASE_HPP__

View File

@ -13,6 +13,7 @@
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved. // Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Copyright (C) 2013, OpenCV Foundation, all rights reserved. // Copyright (C) 2013, OpenCV Foundation, all rights reserved.
// Copyright (C) 2015, Itseez Inc., all rights reserved.
// Third party copyrights are property of their respective owners. // Third party copyrights are property of their respective owners.
// //
// Redistribution and use in source and binary forms, with or without modification, // Redistribution and use in source and binary forms, with or without modification,
@ -104,17 +105,32 @@
#endif #endif
/* CPU features and intrinsics support */ /* CPU features and intrinsics support */
#define CV_CPU_NONE 0 #define CV_CPU_NONE 0
#define CV_CPU_MMX 1 #define CV_CPU_MMX 1
#define CV_CPU_SSE 2 #define CV_CPU_SSE 2
#define CV_CPU_SSE2 3 #define CV_CPU_SSE2 3
#define CV_CPU_SSE3 4 #define CV_CPU_SSE3 4
#define CV_CPU_SSSE3 5 #define CV_CPU_SSSE3 5
#define CV_CPU_SSE4_1 6 #define CV_CPU_SSE4_1 6
#define CV_CPU_SSE4_2 7 #define CV_CPU_SSE4_2 7
#define CV_CPU_POPCNT 8 #define CV_CPU_POPCNT 8
#define CV_CPU_AVX 10
#define CV_CPU_NEON 11 #define CV_CPU_AVX 10
#define CV_CPU_AVX2 11
#define CV_CPU_FMA3 12
#define CV_CPU_AVX_512F 13
#define CV_CPU_AVX_512BW 14
#define CV_CPU_AVX_512CD 15
#define CV_CPU_AVX_512DQ 16
#define CV_CPU_AVX_512ER 17
#define CV_CPU_AVX_512IFMA512 18
#define CV_CPU_AVX_512PF 19
#define CV_CPU_AVX_512VBMI 20
#define CV_CPU_AVX_512VL 21
#define CV_CPU_NEON 100
// when adding to this list remember to update the enum in core/utility.cpp // when adding to this list remember to update the enum in core/utility.cpp
#define CV_HARDWARE_MAX_FEATURE 255 #define CV_HARDWARE_MAX_FEATURE 255
@ -123,6 +139,7 @@
#if defined __SSE2__ || defined _M_X64 || (defined _M_IX86_FP && _M_IX86_FP >= 2) #if defined __SSE2__ || defined _M_X64 || (defined _M_IX86_FP && _M_IX86_FP >= 2)
# include <emmintrin.h> # include <emmintrin.h>
# define CV_MMX 1
# define CV_SSE 1 # define CV_SSE 1
# define CV_SSE2 1 # define CV_SSE2 1
# if defined __SSE3__ || (defined _MSC_VER && _MSC_VER >= 1500) # if defined __SSE3__ || (defined _MSC_VER && _MSC_VER >= 1500)
@ -141,7 +158,15 @@
# include <nmmintrin.h> # include <nmmintrin.h>
# define CV_SSE4_2 1 # define CV_SSE4_2 1
# endif # endif
# if defined __AVX__ || (defined _MSC_FULL_VER && _MSC_FULL_VER >= 160040219) # if defined __POPCNT__ || (defined _MSC_VER && _MSC_VER >= 1500)
# ifdef _MSC_VER
# include <nmmintrin.h>
# else
# include <popcntintrin.h>
# endif
# define CV_POPCNT 1
# endif
# if defined __AVX__ || (defined _MSC_VER && _MSC_VER >= 1600 && 0)
// MS Visual Studio 2010 (2012?) has no macro pre-defined to identify the use of /arch:AVX // MS Visual Studio 2010 (2012?) has no macro pre-defined to identify the use of /arch:AVX
// See: http://connect.microsoft.com/VisualStudio/feedback/details/605858/arch-avx-should-define-a-predefined-macro-in-x64-and-set-a-unique-value-for-m-ix86-fp-in-win32 // See: http://connect.microsoft.com/VisualStudio/feedback/details/605858/arch-avx-should-define-a-predefined-macro-in-x64-and-set-a-unique-value-for-m-ix86-fp-in-win32
# include <immintrin.h> # include <immintrin.h>
@ -152,6 +177,13 @@
# define __xgetbv() 0 # define __xgetbv() 0
# endif # endif
# endif # endif
# if defined __AVX2__ || (defined _MSC_VER && _MSC_VER >= 1800 && 0)
# include <immintrin.h>
# define CV_AVX2 1
# if defined __FMA__
# define CV_FMA3 1
# endif
# endif
#endif #endif
#if (defined WIN32 || defined _WIN32) && defined(_M_ARM) #if (defined WIN32 || defined _WIN32) && defined(_M_ARM)
@ -166,6 +198,12 @@
#endif // __CUDACC__ #endif // __CUDACC__
#ifndef CV_POPCNT
#define CV_POPCNT 0
#endif
#ifndef CV_MMX
# define CV_MMX 0
#endif
#ifndef CV_SSE #ifndef CV_SSE
# define CV_SSE 0 # define CV_SSE 0
#endif #endif
@ -187,6 +225,40 @@
#ifndef CV_AVX #ifndef CV_AVX
# define CV_AVX 0 # define CV_AVX 0
#endif #endif
#ifndef CV_AVX2
# define CV_AVX2 0
#endif
#ifndef CV_FMA3
# define CV_FMA3 0
#endif
#ifndef CV_AVX_512F
# define CV_AVX_512F 0
#endif
#ifndef CV_AVX_512BW
# define CV_AVX_512BW 0
#endif
#ifndef CV_AVX_512CD
# define CV_AVX_512CD 0
#endif
#ifndef CV_AVX_512DQ
# define CV_AVX_512DQ 0
#endif
#ifndef CV_AVX_512ER
# define CV_AVX_512ER 0
#endif
#ifndef CV_AVX_512IFMA512
# define CV_AVX_512IFMA512 0
#endif
#ifndef CV_AVX_512PF
# define CV_AVX_512PF 0
#endif
#ifndef CV_AVX_512VBMI
# define CV_AVX_512VBMI 0
#endif
#ifndef CV_AVX_512VL
# define CV_AVX_512VL 0
#endif
#ifndef CV_NEON #ifndef CV_NEON
# define CV_NEON 0 # define CV_NEON 0
#endif #endif

View File

@ -0,0 +1,645 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2015, Itseez Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#ifndef __OPENCV_CORE_SSE_UTILS_HPP__
#define __OPENCV_CORE_SSE_UTILS_HPP__
#ifndef __cplusplus
# error sse_utils.hpp header must be compiled as C++
#endif
#if CV_SSE2
// De-interleaves two 8-bit channels in place.
// On entry the four registers, read in argument order, hold 64 consecutive
// bytes of channel-interleaved data (c0 c1 c0 c1 ...). On exit v_r0/v_r1 hold
// the 32 bytes of the first channel and v_g0/v_g1 the 32 bytes of the second.
inline void _mm_deinterleave_epi8(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0, __m128i & v_g1)
{
    __m128i regs[4] = { v_r0, v_r1, v_g0, v_g1 };

    // Five identical unpack rounds: register k is paired with register k + 2
    // and the low/high unpack results become registers 2k and 2k + 1.
    for (int round = 0; round < 5; ++round)
    {
        __m128i shuffled[4];
        for (int k = 0; k < 2; ++k)
        {
            shuffled[2 * k]     = _mm_unpacklo_epi8(regs[k], regs[k + 2]);
            shuffled[2 * k + 1] = _mm_unpackhi_epi8(regs[k], regs[k + 2]);
        }
        for (int k = 0; k < 4; ++k)
            regs[k] = shuffled[k];
    }

    v_r0 = regs[0];
    v_r1 = regs[1];
    v_g0 = regs[2];
    v_g1 = regs[3];
}
// De-interleaves three 8-bit channels in place.
// On entry the six registers, read in argument order, hold 96 consecutive
// bytes of channel-interleaved data (c0 c1 c2 c0 c1 c2 ...). On exit
// v_r0/v_r1, v_g0/v_g1 and v_b0/v_b1 each hold the 32 bytes of one channel.
inline void _mm_deinterleave_epi8(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0,
                                  __m128i & v_g1, __m128i & v_b0, __m128i & v_b1)
{
    __m128i regs[6] = { v_r0, v_r1, v_g0, v_g1, v_b0, v_b1 };

    // Five identical unpack rounds: register k is paired with register k + 3
    // and the low/high unpack results become registers 2k and 2k + 1.
    for (int round = 0; round < 5; ++round)
    {
        __m128i shuffled[6];
        for (int k = 0; k < 3; ++k)
        {
            shuffled[2 * k]     = _mm_unpacklo_epi8(regs[k], regs[k + 3]);
            shuffled[2 * k + 1] = _mm_unpackhi_epi8(regs[k], regs[k + 3]);
        }
        for (int k = 0; k < 6; ++k)
            regs[k] = shuffled[k];
    }

    v_r0 = regs[0];
    v_r1 = regs[1];
    v_g0 = regs[2];
    v_g1 = regs[3];
    v_b0 = regs[4];
    v_b1 = regs[5];
}
// De-interleaves four 8-bit channels in place.
// On entry the eight registers, read in argument order, hold 128 consecutive
// bytes of channel-interleaved data (c0 c1 c2 c3 c0 ...). On exit each
// register pair (r, g, b, a) holds the 32 bytes of one channel.
inline void _mm_deinterleave_epi8(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0, __m128i & v_g1,
                                  __m128i & v_b0, __m128i & v_b1, __m128i & v_a0, __m128i & v_a1)
{
    __m128i regs[8] = { v_r0, v_r1, v_g0, v_g1, v_b0, v_b1, v_a0, v_a1 };

    // Five identical unpack rounds: register k is paired with register k + 4
    // and the low/high unpack results become registers 2k and 2k + 1.
    for (int round = 0; round < 5; ++round)
    {
        __m128i shuffled[8];
        for (int k = 0; k < 4; ++k)
        {
            shuffled[2 * k]     = _mm_unpacklo_epi8(regs[k], regs[k + 4]);
            shuffled[2 * k + 1] = _mm_unpackhi_epi8(regs[k], regs[k + 4]);
        }
        for (int k = 0; k < 8; ++k)
            regs[k] = shuffled[k];
    }

    v_r0 = regs[0];
    v_r1 = regs[1];
    v_g0 = regs[2];
    v_g1 = regs[3];
    v_b0 = regs[4];
    v_b1 = regs[5];
    v_a0 = regs[6];
    v_a1 = regs[7];
}
// Interleaves two 8-bit channels in place (inverse of the two-channel
// _mm_deinterleave_epi8).
// On entry v_r0/v_r1 hold 32 bytes of the first channel and v_g0/v_g1 hold
// 32 bytes of the second. On exit the four registers, read in argument
// order, hold the 64 bytes as c0 c1 c0 c1 ...
inline void _mm_interleave_epi8(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0, __m128i & v_g1)
{
    const __m128i low_byte = _mm_set1_epi16(0x00ff);
    __m128i regs[4] = { v_r0, v_r1, v_g0, v_g1 };

    // Five identical pack rounds: adjacent registers 2k and 2k + 1 are split
    // into their even bytes (register k) and odd bytes (register k + 2).
    // Masking / shifting first keeps every 16-bit lane <= 255, so the
    // saturating pack never clamps.
    for (int round = 0; round < 5; ++round)
    {
        __m128i packed[4];
        for (int k = 0; k < 2; ++k)
        {
            const __m128i first = regs[2 * k];
            const __m128i second = regs[2 * k + 1];
            packed[k]     = _mm_packus_epi16(_mm_and_si128(first, low_byte), _mm_and_si128(second, low_byte));
            packed[k + 2] = _mm_packus_epi16(_mm_srli_epi16(first, 8), _mm_srli_epi16(second, 8));
        }
        for (int k = 0; k < 4; ++k)
            regs[k] = packed[k];
    }

    v_r0 = regs[0];
    v_r1 = regs[1];
    v_g0 = regs[2];
    v_g1 = regs[3];
}
// Interleaves three 8-bit channels in place (inverse of the three-channel
// _mm_deinterleave_epi8).
// On entry each register pair (r, g, b) holds 32 bytes of one channel. On
// exit the six registers, read in argument order, hold the 96 bytes as
// c0 c1 c2 c0 c1 c2 ...
inline void _mm_interleave_epi8(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0,
                                __m128i & v_g1, __m128i & v_b0, __m128i & v_b1)
{
    const __m128i low_byte = _mm_set1_epi16(0x00ff);
    __m128i regs[6] = { v_r0, v_r1, v_g0, v_g1, v_b0, v_b1 };

    // Five identical pack rounds: adjacent registers 2k and 2k + 1 are split
    // into their even bytes (register k) and odd bytes (register k + 3).
    // Masking / shifting first keeps every 16-bit lane <= 255, so the
    // saturating pack never clamps.
    for (int round = 0; round < 5; ++round)
    {
        __m128i packed[6];
        for (int k = 0; k < 3; ++k)
        {
            const __m128i first = regs[2 * k];
            const __m128i second = regs[2 * k + 1];
            packed[k]     = _mm_packus_epi16(_mm_and_si128(first, low_byte), _mm_and_si128(second, low_byte));
            packed[k + 3] = _mm_packus_epi16(_mm_srli_epi16(first, 8), _mm_srli_epi16(second, 8));
        }
        for (int k = 0; k < 6; ++k)
            regs[k] = packed[k];
    }

    v_r0 = regs[0];
    v_r1 = regs[1];
    v_g0 = regs[2];
    v_g1 = regs[3];
    v_b0 = regs[4];
    v_b1 = regs[5];
}
// Interleaves four 8-bit channels in place (inverse of the four-channel
// _mm_deinterleave_epi8).
// On entry each register pair (r, g, b, a) holds 32 bytes of one channel. On
// exit the eight registers, read in argument order, hold the 128 bytes as
// c0 c1 c2 c3 c0 ...
inline void _mm_interleave_epi8(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0, __m128i & v_g1,
                                __m128i & v_b0, __m128i & v_b1, __m128i & v_a0, __m128i & v_a1)
{
    const __m128i low_byte = _mm_set1_epi16(0x00ff);
    __m128i regs[8] = { v_r0, v_r1, v_g0, v_g1, v_b0, v_b1, v_a0, v_a1 };

    // Five identical pack rounds: adjacent registers 2k and 2k + 1 are split
    // into their even bytes (register k) and odd bytes (register k + 4).
    // Masking / shifting first keeps every 16-bit lane <= 255, so the
    // saturating pack never clamps.
    for (int round = 0; round < 5; ++round)
    {
        __m128i packed[8];
        for (int k = 0; k < 4; ++k)
        {
            const __m128i first = regs[2 * k];
            const __m128i second = regs[2 * k + 1];
            packed[k]     = _mm_packus_epi16(_mm_and_si128(first, low_byte), _mm_and_si128(second, low_byte));
            packed[k + 4] = _mm_packus_epi16(_mm_srli_epi16(first, 8), _mm_srli_epi16(second, 8));
        }
        for (int k = 0; k < 8; ++k)
            regs[k] = packed[k];
    }

    v_r0 = regs[0];
    v_r1 = regs[1];
    v_g0 = regs[2];
    v_g1 = regs[3];
    v_b0 = regs[4];
    v_b1 = regs[5];
    v_a0 = regs[6];
    v_a1 = regs[7];
}
// De-interleaves two 16-bit channels in place.
// On entry the four registers, read in argument order, hold 32 consecutive
// 16-bit values of channel-interleaved data (c0 c1 c0 c1 ...). On exit
// v_r0/v_r1 hold the 16 values of the first channel and v_g0/v_g1 the 16
// values of the second.
inline void _mm_deinterleave_epi16(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0, __m128i & v_g1)
{
    __m128i regs[4] = { v_r0, v_r1, v_g0, v_g1 };

    // Four identical unpack rounds: register k is paired with register k + 2
    // and the low/high unpack results become registers 2k and 2k + 1.
    for (int round = 0; round < 4; ++round)
    {
        __m128i shuffled[4];
        for (int k = 0; k < 2; ++k)
        {
            shuffled[2 * k]     = _mm_unpacklo_epi16(regs[k], regs[k + 2]);
            shuffled[2 * k + 1] = _mm_unpackhi_epi16(regs[k], regs[k + 2]);
        }
        for (int k = 0; k < 4; ++k)
            regs[k] = shuffled[k];
    }

    v_r0 = regs[0];
    v_r1 = regs[1];
    v_g0 = regs[2];
    v_g1 = regs[3];
}
// De-interleaves three 16-bit channels in place.
// On entry the six registers, read in argument order, hold 48 consecutive
// 16-bit values of channel-interleaved data (c0 c1 c2 c0 c1 c2 ...). On exit
// v_r0/v_r1, v_g0/v_g1 and v_b0/v_b1 each hold the 16 values of one channel.
inline void _mm_deinterleave_epi16(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0,
                                   __m128i & v_g1, __m128i & v_b0, __m128i & v_b1)
{
    __m128i regs[6] = { v_r0, v_r1, v_g0, v_g1, v_b0, v_b1 };

    // Four identical unpack rounds: register k is paired with register k + 3
    // and the low/high unpack results become registers 2k and 2k + 1.
    for (int round = 0; round < 4; ++round)
    {
        __m128i shuffled[6];
        for (int k = 0; k < 3; ++k)
        {
            shuffled[2 * k]     = _mm_unpacklo_epi16(regs[k], regs[k + 3]);
            shuffled[2 * k + 1] = _mm_unpackhi_epi16(regs[k], regs[k + 3]);
        }
        for (int k = 0; k < 6; ++k)
            regs[k] = shuffled[k];
    }

    v_r0 = regs[0];
    v_r1 = regs[1];
    v_g0 = regs[2];
    v_g1 = regs[3];
    v_b0 = regs[4];
    v_b1 = regs[5];
}
// De-interleaves four 16-bit channels in place.
// On entry the eight registers, read in argument order, hold 64 consecutive
// 16-bit values of channel-interleaved data (c0 c1 c2 c3 c0 ...). On exit
// each register pair (r, g, b, a) holds the 16 values of one channel.
inline void _mm_deinterleave_epi16(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0, __m128i & v_g1,
                                   __m128i & v_b0, __m128i & v_b1, __m128i & v_a0, __m128i & v_a1)
{
    __m128i regs[8] = { v_r0, v_r1, v_g0, v_g1, v_b0, v_b1, v_a0, v_a1 };

    // Four identical unpack rounds: register k is paired with register k + 4
    // and the low/high unpack results become registers 2k and 2k + 1.
    for (int round = 0; round < 4; ++round)
    {
        __m128i shuffled[8];
        for (int k = 0; k < 4; ++k)
        {
            shuffled[2 * k]     = _mm_unpacklo_epi16(regs[k], regs[k + 4]);
            shuffled[2 * k + 1] = _mm_unpackhi_epi16(regs[k], regs[k + 4]);
        }
        for (int k = 0; k < 8; ++k)
            regs[k] = shuffled[k];
    }

    v_r0 = regs[0];
    v_r1 = regs[1];
    v_g0 = regs[2];
    v_g1 = regs[3];
    v_b0 = regs[4];
    v_b1 = regs[5];
    v_a0 = regs[6];
    v_a1 = regs[7];
}
#if CV_SSE4_1
// Interleaves two planar channels of sixteen 16-bit values each.
//
// On entry v_r0:v_r1 hold values 0..15 of the first channel and v_g0:v_g1 hold
// values 0..15 of the second channel. On return the four registers, read in
// argument order (v_r0, v_r1, v_g0, v_g1), hold the 32 values as
// c0[0], c1[0], c0[1], c1[1], ..., c0[15], c1[15].
//
// For two channels the interleave is exactly a 16-bit unpack of matching
// register pairs, so no mask/shift/saturating-pack network is needed and only
// SSE2 instructions are used.
inline void _mm_interleave_epi16(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0, __m128i & v_g1)
{
    // Snapshot the inputs first: the outputs overwrite the same registers.
    const __m128i v_c0_lo = v_r0, v_c0_hi = v_r1;
    const __m128i v_c1_lo = v_g0, v_c1_hi = v_g1;

    v_r0 = _mm_unpacklo_epi16(v_c0_lo, v_c1_lo); // pairs 0..3
    v_r1 = _mm_unpackhi_epi16(v_c0_lo, v_c1_lo); // pairs 4..7
    v_g0 = _mm_unpacklo_epi16(v_c0_hi, v_c1_hi); // pairs 8..11
    v_g1 = _mm_unpackhi_epi16(v_c0_hi, v_c1_hi); // pairs 12..15
}
// Interleaves three planar channels of sixteen 16-bit values each (SSE4.1).
//
// On entry v_r0:v_r1, v_g0:v_g1 and v_b0:v_b1 hold values 0..15 of channels
// 0, 1 and 2. On return the six registers, read in argument order
// (v_r0, v_r1, v_g0, v_g1, v_b0, v_b1), hold the 48 values as
// c0[0], c1[0], c2[0], c0[1], c1[1], c2[1], ..., c0[15], c1[15], c2[15].
//
// Each step pairs two registers and emits either their even 16-bit elements
// (mask with 0x0000ffff, then pack) or their odd ones (shift right by 16, then
// pack). Both forms leave every 32-bit lane in [0, 0xffff], so the saturating
// _mm_packus_epi32 never clamps. Three such layers complete the permutation.
inline void _mm_interleave_epi16(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0,
__m128i & v_g1, __m128i & v_b0, __m128i & v_b1)
{
// keeps the low 16 bits of every 32-bit lane
__m128i v_mask = _mm_set1_epi32(0x0000ffff);
// layer 3: split each channel into its even-indexed and odd-indexed values
__m128i layer3_chunk0 = _mm_packus_epi32(_mm_and_si128(v_r0, v_mask), _mm_and_si128(v_r1, v_mask));
__m128i layer3_chunk3 = _mm_packus_epi32(_mm_srli_epi32(v_r0, 16), _mm_srli_epi32(v_r1, 16));
__m128i layer3_chunk1 = _mm_packus_epi32(_mm_and_si128(v_g0, v_mask), _mm_and_si128(v_g1, v_mask));
__m128i layer3_chunk4 = _mm_packus_epi32(_mm_srli_epi32(v_g0, 16), _mm_srli_epi32(v_g1, 16));
__m128i layer3_chunk2 = _mm_packus_epi32(_mm_and_si128(v_b0, v_mask), _mm_and_si128(v_b1, v_mask));
__m128i layer3_chunk5 = _mm_packus_epi32(_mm_srli_epi32(v_b0, 16), _mm_srli_epi32(v_b1, 16));
// layer 2: same even/odd split applied to consecutive pairs of layer-3 chunks
__m128i layer2_chunk0 = _mm_packus_epi32(_mm_and_si128(layer3_chunk0, v_mask), _mm_and_si128(layer3_chunk1, v_mask));
__m128i layer2_chunk3 = _mm_packus_epi32(_mm_srli_epi32(layer3_chunk0, 16), _mm_srli_epi32(layer3_chunk1, 16));
__m128i layer2_chunk1 = _mm_packus_epi32(_mm_and_si128(layer3_chunk2, v_mask), _mm_and_si128(layer3_chunk3, v_mask));
__m128i layer2_chunk4 = _mm_packus_epi32(_mm_srli_epi32(layer3_chunk2, 16), _mm_srli_epi32(layer3_chunk3, 16));
__m128i layer2_chunk2 = _mm_packus_epi32(_mm_and_si128(layer3_chunk4, v_mask), _mm_and_si128(layer3_chunk5, v_mask));
__m128i layer2_chunk5 = _mm_packus_epi32(_mm_srli_epi32(layer3_chunk4, 16), _mm_srli_epi32(layer3_chunk5, 16));
// layer 1: same even/odd split applied to consecutive pairs of layer-2 chunks
__m128i layer1_chunk0 = _mm_packus_epi32(_mm_and_si128(layer2_chunk0, v_mask), _mm_and_si128(layer2_chunk1, v_mask));
__m128i layer1_chunk3 = _mm_packus_epi32(_mm_srli_epi32(layer2_chunk0, 16), _mm_srli_epi32(layer2_chunk1, 16));
__m128i layer1_chunk1 = _mm_packus_epi32(_mm_and_si128(layer2_chunk2, v_mask), _mm_and_si128(layer2_chunk3, v_mask));
__m128i layer1_chunk4 = _mm_packus_epi32(_mm_srli_epi32(layer2_chunk2, 16), _mm_srli_epi32(layer2_chunk3, 16));
__m128i layer1_chunk2 = _mm_packus_epi32(_mm_and_si128(layer2_chunk4, v_mask), _mm_and_si128(layer2_chunk5, v_mask));
__m128i layer1_chunk5 = _mm_packus_epi32(_mm_srli_epi32(layer2_chunk4, 16), _mm_srli_epi32(layer2_chunk5, 16));
// final layer: note the outputs are written in the order r0, g1, r1, b0, g0, b1
v_r0 = _mm_packus_epi32(_mm_and_si128(layer1_chunk0, v_mask), _mm_and_si128(layer1_chunk1, v_mask));
v_g1 = _mm_packus_epi32(_mm_srli_epi32(layer1_chunk0, 16), _mm_srli_epi32(layer1_chunk1, 16));
v_r1 = _mm_packus_epi32(_mm_and_si128(layer1_chunk2, v_mask), _mm_and_si128(layer1_chunk3, v_mask));
v_b0 = _mm_packus_epi32(_mm_srli_epi32(layer1_chunk2, 16), _mm_srli_epi32(layer1_chunk3, 16));
v_g0 = _mm_packus_epi32(_mm_and_si128(layer1_chunk4, v_mask), _mm_and_si128(layer1_chunk5, v_mask));
v_b1 = _mm_packus_epi32(_mm_srli_epi32(layer1_chunk4, 16), _mm_srli_epi32(layer1_chunk5, 16));
}
// Interleaves four planar channels of sixteen 16-bit values each (SSE4.1).
//
// On entry v_r0:v_r1, v_g0:v_g1, v_b0:v_b1 and v_a0:v_a1 hold values 0..15 of
// channels 0, 1, 2 and 3. On return the eight registers, read in argument
// order (v_r0, v_r1, v_g0, v_g1, v_b0, v_b1, v_a0, v_a1), hold the 64 values
// as c0[0], c1[0], c2[0], c3[0], c0[1], ..., c3[15]; each register ends up
// with two complete 4-channel groups.
//
// Each step pairs two registers and emits either their even 16-bit elements
// (mask with 0x0000ffff, then pack) or their odd ones (shift right by 16, then
// pack). Both forms leave every 32-bit lane in [0, 0xffff], so the saturating
// _mm_packus_epi32 never clamps. Three such layers complete the permutation.
inline void _mm_interleave_epi16(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0, __m128i & v_g1,
__m128i & v_b0, __m128i & v_b1, __m128i & v_a0, __m128i & v_a1)
{
// keeps the low 16 bits of every 32-bit lane
__m128i v_mask = _mm_set1_epi32(0x0000ffff);
// layer 3: split each channel into its even-indexed and odd-indexed values
__m128i layer3_chunk0 = _mm_packus_epi32(_mm_and_si128(v_r0, v_mask), _mm_and_si128(v_r1, v_mask));
__m128i layer3_chunk4 = _mm_packus_epi32(_mm_srli_epi32(v_r0, 16), _mm_srli_epi32(v_r1, 16));
__m128i layer3_chunk1 = _mm_packus_epi32(_mm_and_si128(v_g0, v_mask), _mm_and_si128(v_g1, v_mask));
__m128i layer3_chunk5 = _mm_packus_epi32(_mm_srli_epi32(v_g0, 16), _mm_srli_epi32(v_g1, 16));
__m128i layer3_chunk2 = _mm_packus_epi32(_mm_and_si128(v_b0, v_mask), _mm_and_si128(v_b1, v_mask));
__m128i layer3_chunk6 = _mm_packus_epi32(_mm_srli_epi32(v_b0, 16), _mm_srli_epi32(v_b1, 16));
__m128i layer3_chunk3 = _mm_packus_epi32(_mm_and_si128(v_a0, v_mask), _mm_and_si128(v_a1, v_mask));
__m128i layer3_chunk7 = _mm_packus_epi32(_mm_srli_epi32(v_a0, 16), _mm_srli_epi32(v_a1, 16));
// layer 2: same even/odd split applied to consecutive pairs of layer-3 chunks
__m128i layer2_chunk0 = _mm_packus_epi32(_mm_and_si128(layer3_chunk0, v_mask), _mm_and_si128(layer3_chunk1, v_mask));
__m128i layer2_chunk4 = _mm_packus_epi32(_mm_srli_epi32(layer3_chunk0, 16), _mm_srli_epi32(layer3_chunk1, 16));
__m128i layer2_chunk1 = _mm_packus_epi32(_mm_and_si128(layer3_chunk2, v_mask), _mm_and_si128(layer3_chunk3, v_mask));
__m128i layer2_chunk5 = _mm_packus_epi32(_mm_srli_epi32(layer3_chunk2, 16), _mm_srli_epi32(layer3_chunk3, 16));
__m128i layer2_chunk2 = _mm_packus_epi32(_mm_and_si128(layer3_chunk4, v_mask), _mm_and_si128(layer3_chunk5, v_mask));
__m128i layer2_chunk6 = _mm_packus_epi32(_mm_srli_epi32(layer3_chunk4, 16), _mm_srli_epi32(layer3_chunk5, 16));
__m128i layer2_chunk3 = _mm_packus_epi32(_mm_and_si128(layer3_chunk6, v_mask), _mm_and_si128(layer3_chunk7, v_mask));
__m128i layer2_chunk7 = _mm_packus_epi32(_mm_srli_epi32(layer3_chunk6, 16), _mm_srli_epi32(layer3_chunk7, 16));
// layer 1: same even/odd split applied to consecutive pairs of layer-2 chunks
__m128i layer1_chunk0 = _mm_packus_epi32(_mm_and_si128(layer2_chunk0, v_mask), _mm_and_si128(layer2_chunk1, v_mask));
__m128i layer1_chunk4 = _mm_packus_epi32(_mm_srli_epi32(layer2_chunk0, 16), _mm_srli_epi32(layer2_chunk1, 16));
__m128i layer1_chunk1 = _mm_packus_epi32(_mm_and_si128(layer2_chunk2, v_mask), _mm_and_si128(layer2_chunk3, v_mask));
__m128i layer1_chunk5 = _mm_packus_epi32(_mm_srli_epi32(layer2_chunk2, 16), _mm_srli_epi32(layer2_chunk3, 16));
__m128i layer1_chunk2 = _mm_packus_epi32(_mm_and_si128(layer2_chunk4, v_mask), _mm_and_si128(layer2_chunk5, v_mask));
__m128i layer1_chunk6 = _mm_packus_epi32(_mm_srli_epi32(layer2_chunk4, 16), _mm_srli_epi32(layer2_chunk5, 16));
__m128i layer1_chunk3 = _mm_packus_epi32(_mm_and_si128(layer2_chunk6, v_mask), _mm_and_si128(layer2_chunk7, v_mask));
__m128i layer1_chunk7 = _mm_packus_epi32(_mm_srli_epi32(layer2_chunk6, 16), _mm_srli_epi32(layer2_chunk7, 16));
// final layer: note the outputs are written in the order r0, b0, r1, b1, g0, a0, g1, a1
v_r0 = _mm_packus_epi32(_mm_and_si128(layer1_chunk0, v_mask), _mm_and_si128(layer1_chunk1, v_mask));
v_b0 = _mm_packus_epi32(_mm_srli_epi32(layer1_chunk0, 16), _mm_srli_epi32(layer1_chunk1, 16));
v_r1 = _mm_packus_epi32(_mm_and_si128(layer1_chunk2, v_mask), _mm_and_si128(layer1_chunk3, v_mask));
v_b1 = _mm_packus_epi32(_mm_srli_epi32(layer1_chunk2, 16), _mm_srli_epi32(layer1_chunk3, 16));
v_g0 = _mm_packus_epi32(_mm_and_si128(layer1_chunk4, v_mask), _mm_and_si128(layer1_chunk5, v_mask));
v_a0 = _mm_packus_epi32(_mm_srli_epi32(layer1_chunk4, 16), _mm_srli_epi32(layer1_chunk5, 16));
v_g1 = _mm_packus_epi32(_mm_and_si128(layer1_chunk6, v_mask), _mm_and_si128(layer1_chunk7, v_mask));
v_a1 = _mm_packus_epi32(_mm_srli_epi32(layer1_chunk6, 16), _mm_srli_epi32(layer1_chunk7, 16));
}
#endif // CV_SSE4_1
// De-interleaves sixteen floats holding two interleaved channels.
//
// On entry the four registers, read in argument order (v_r0, v_r1, v_g0,
// v_g1), hold 16 consecutive floats s[0..15]. On return v_r0:v_r1 hold the
// even-indexed values s[0], s[2], ..., s[14] and v_g0:v_g1 hold the
// odd-indexed values s[1], s[3], ..., s[15], both in order.
//
// For two channels this is a plain even/odd selection from each pair of
// adjacent input registers, so one _mm_shuffle_ps per output is sufficient.
inline void _mm_deinterleave_ps(__m128 & v_r0, __m128 & v_r1, __m128 & v_g0, __m128 & v_g1)
{
    // mask_even picks elements 0 and 2 of each operand, mask_odd picks 1 and 3
    const int mask_even = _MM_SHUFFLE(2, 0, 2, 0), mask_odd = _MM_SHUFFLE(3, 1, 3, 1);

    // Snapshot the inputs first: the outputs overwrite the same registers.
    const __m128 v_src0 = v_r0, v_src1 = v_r1, v_src2 = v_g0, v_src3 = v_g1;

    v_r0 = _mm_shuffle_ps(v_src0, v_src1, mask_even); // s[0], s[2], s[4], s[6]
    v_r1 = _mm_shuffle_ps(v_src2, v_src3, mask_even); // s[8], s[10], s[12], s[14]
    v_g0 = _mm_shuffle_ps(v_src0, v_src1, mask_odd);  // s[1], s[3], s[5], s[7]
    v_g1 = _mm_shuffle_ps(v_src2, v_src3, mask_odd);  // s[9], s[11], s[13], s[15]
}
// De-interleaves twenty-four floats holding three interleaved channels.
//
// On entry the six registers, read in argument order (v_r0, v_r1, v_g0, v_g1,
// v_b0, v_b1), hold 24 consecutive floats s[0..23]. On return
//   v_r0:v_r1 = s[0], s[3], s[6], ..., s[21]
//   v_g0:v_g1 = s[1], s[4], s[7], ..., s[22]
//   v_b0:v_b1 = s[2], s[5], s[8], ..., s[23]
//
// Implemented as three layers of unpacklo/unpackhi. Every layer pairs chunk i
// with chunk i + 3, so the pairings below are (r0,g1), (r1,b0), (g0,b1).
inline void _mm_deinterleave_ps(__m128 & v_r0, __m128 & v_r1, __m128 & v_g0,
__m128 & v_g1, __m128 & v_b0, __m128 & v_b1)
{
// layer 1: pair input register i with input register i + 3
__m128 layer1_chunk0 = _mm_unpacklo_ps(v_r0, v_g1);
__m128 layer1_chunk1 = _mm_unpackhi_ps(v_r0, v_g1);
__m128 layer1_chunk2 = _mm_unpacklo_ps(v_r1, v_b0);
__m128 layer1_chunk3 = _mm_unpackhi_ps(v_r1, v_b0);
__m128 layer1_chunk4 = _mm_unpacklo_ps(v_g0, v_b1);
__m128 layer1_chunk5 = _mm_unpackhi_ps(v_g0, v_b1);
// layer 2: pair layer-1 chunk i with chunk i + 3
__m128 layer2_chunk0 = _mm_unpacklo_ps(layer1_chunk0, layer1_chunk3);
__m128 layer2_chunk1 = _mm_unpackhi_ps(layer1_chunk0, layer1_chunk3);
__m128 layer2_chunk2 = _mm_unpacklo_ps(layer1_chunk1, layer1_chunk4);
__m128 layer2_chunk3 = _mm_unpackhi_ps(layer1_chunk1, layer1_chunk4);
__m128 layer2_chunk4 = _mm_unpacklo_ps(layer1_chunk2, layer1_chunk5);
__m128 layer2_chunk5 = _mm_unpackhi_ps(layer1_chunk2, layer1_chunk5);
// final layer: pair layer-2 chunk i with chunk i + 3; results are the planar channels
v_r0 = _mm_unpacklo_ps(layer2_chunk0, layer2_chunk3);
v_r1 = _mm_unpackhi_ps(layer2_chunk0, layer2_chunk3);
v_g0 = _mm_unpacklo_ps(layer2_chunk1, layer2_chunk4);
v_g1 = _mm_unpackhi_ps(layer2_chunk1, layer2_chunk4);
v_b0 = _mm_unpacklo_ps(layer2_chunk2, layer2_chunk5);
v_b1 = _mm_unpackhi_ps(layer2_chunk2, layer2_chunk5);
}
// De-interleaves thirty-two floats holding four interleaved channels.
//
// On entry the eight registers, read in argument order (v_r0, v_r1, v_g0,
// v_g1, v_b0, v_b1, v_a0, v_a1), hold 32 consecutive floats s[0..31]. On return
//   v_r0:v_r1 = s[0], s[4], s[8],  ..., s[28]
//   v_g0:v_g1 = s[1], s[5], s[9],  ..., s[29]
//   v_b0:v_b1 = s[2], s[6], s[10], ..., s[30]
//   v_a0:v_a1 = s[3], s[7], s[11], ..., s[31]
//
// Implemented as three layers of unpacklo/unpackhi. Every layer pairs chunk i
// with chunk i + 4, so the pairings below are (r0,b0), (r1,b1), (g0,a0), (g1,a1).
inline void _mm_deinterleave_ps(__m128 & v_r0, __m128 & v_r1, __m128 & v_g0, __m128 & v_g1,
__m128 & v_b0, __m128 & v_b1, __m128 & v_a0, __m128 & v_a1)
{
// layer 1: pair input register i with input register i + 4
__m128 layer1_chunk0 = _mm_unpacklo_ps(v_r0, v_b0);
__m128 layer1_chunk1 = _mm_unpackhi_ps(v_r0, v_b0);
__m128 layer1_chunk2 = _mm_unpacklo_ps(v_r1, v_b1);
__m128 layer1_chunk3 = _mm_unpackhi_ps(v_r1, v_b1);
__m128 layer1_chunk4 = _mm_unpacklo_ps(v_g0, v_a0);
__m128 layer1_chunk5 = _mm_unpackhi_ps(v_g0, v_a0);
__m128 layer1_chunk6 = _mm_unpacklo_ps(v_g1, v_a1);
__m128 layer1_chunk7 = _mm_unpackhi_ps(v_g1, v_a1);
// layer 2: pair layer-1 chunk i with chunk i + 4
__m128 layer2_chunk0 = _mm_unpacklo_ps(layer1_chunk0, layer1_chunk4);
__m128 layer2_chunk1 = _mm_unpackhi_ps(layer1_chunk0, layer1_chunk4);
__m128 layer2_chunk2 = _mm_unpacklo_ps(layer1_chunk1, layer1_chunk5);
__m128 layer2_chunk3 = _mm_unpackhi_ps(layer1_chunk1, layer1_chunk5);
__m128 layer2_chunk4 = _mm_unpacklo_ps(layer1_chunk2, layer1_chunk6);
__m128 layer2_chunk5 = _mm_unpackhi_ps(layer1_chunk2, layer1_chunk6);
__m128 layer2_chunk6 = _mm_unpacklo_ps(layer1_chunk3, layer1_chunk7);
__m128 layer2_chunk7 = _mm_unpackhi_ps(layer1_chunk3, layer1_chunk7);
// final layer: pair layer-2 chunk i with chunk i + 4; results are the planar channels
v_r0 = _mm_unpacklo_ps(layer2_chunk0, layer2_chunk4);
v_r1 = _mm_unpackhi_ps(layer2_chunk0, layer2_chunk4);
v_g0 = _mm_unpacklo_ps(layer2_chunk1, layer2_chunk5);
v_g1 = _mm_unpackhi_ps(layer2_chunk1, layer2_chunk5);
v_b0 = _mm_unpacklo_ps(layer2_chunk2, layer2_chunk6);
v_b1 = _mm_unpackhi_ps(layer2_chunk2, layer2_chunk6);
v_a0 = _mm_unpacklo_ps(layer2_chunk3, layer2_chunk7);
v_a1 = _mm_unpackhi_ps(layer2_chunk3, layer2_chunk7);
}
// Interleaves two planar channels of eight floats each.
//
// On entry v_r0:v_r1 hold values 0..7 of the first channel and v_g0:v_g1 hold
// values 0..7 of the second channel. On return the four registers, read in
// argument order (v_r0, v_r1, v_g0, v_g1), hold the 16 values as
// c0[0], c1[0], c0[1], c1[1], ..., c0[7], c1[7].
//
// For two channels the interleave is exactly an unpack of matching register
// pairs, so no multi-layer shuffle network is needed.
inline void _mm_interleave_ps(__m128 & v_r0, __m128 & v_r1, __m128 & v_g0, __m128 & v_g1)
{
    // Snapshot the inputs first: the outputs overwrite the same registers.
    const __m128 v_c0_lo = v_r0, v_c0_hi = v_r1;
    const __m128 v_c1_lo = v_g0, v_c1_hi = v_g1;

    v_r0 = _mm_unpacklo_ps(v_c0_lo, v_c1_lo); // pairs 0..1
    v_r1 = _mm_unpackhi_ps(v_c0_lo, v_c1_lo); // pairs 2..3
    v_g0 = _mm_unpacklo_ps(v_c0_hi, v_c1_hi); // pairs 4..5
    v_g1 = _mm_unpackhi_ps(v_c0_hi, v_c1_hi); // pairs 6..7
}
// Interleaves three planar channels of eight floats each.
//
// On entry v_r0:v_r1, v_g0:v_g1 and v_b0:v_b1 hold values 0..7 of channels
// 0, 1 and 2. On return the six registers, read in argument order
// (v_r0, v_r1, v_g0, v_g1, v_b0, v_b1), hold the 24 values as
// c0[0], c1[0], c2[0], c0[1], c1[1], c2[1], ..., c0[7], c1[7], c2[7].
//
// Each step pairs two registers and emits either their even elements
// (mask_lo) or their odd elements (mask_hi). Three such layers complete the
// permutation.
inline void _mm_interleave_ps(__m128 & v_r0, __m128 & v_r1, __m128 & v_g0,
__m128 & v_g1, __m128 & v_b0, __m128 & v_b1)
{
// mask_lo selects elements 0 and 2 of each operand, mask_hi selects elements 1 and 3
const int mask_lo = _MM_SHUFFLE(2, 0, 2, 0), mask_hi = _MM_SHUFFLE(3, 1, 3, 1);
// layer 2: split each channel into its even-indexed and odd-indexed values
__m128 layer2_chunk0 = _mm_shuffle_ps(v_r0, v_r1, mask_lo);
__m128 layer2_chunk3 = _mm_shuffle_ps(v_r0, v_r1, mask_hi);
__m128 layer2_chunk1 = _mm_shuffle_ps(v_g0, v_g1, mask_lo);
__m128 layer2_chunk4 = _mm_shuffle_ps(v_g0, v_g1, mask_hi);
__m128 layer2_chunk2 = _mm_shuffle_ps(v_b0, v_b1, mask_lo);
__m128 layer2_chunk5 = _mm_shuffle_ps(v_b0, v_b1, mask_hi);
// layer 1: same even/odd split applied to consecutive pairs of layer-2 chunks
__m128 layer1_chunk0 = _mm_shuffle_ps(layer2_chunk0, layer2_chunk1, mask_lo);
__m128 layer1_chunk3 = _mm_shuffle_ps(layer2_chunk0, layer2_chunk1, mask_hi);
__m128 layer1_chunk1 = _mm_shuffle_ps(layer2_chunk2, layer2_chunk3, mask_lo);
__m128 layer1_chunk4 = _mm_shuffle_ps(layer2_chunk2, layer2_chunk3, mask_hi);
__m128 layer1_chunk2 = _mm_shuffle_ps(layer2_chunk4, layer2_chunk5, mask_lo);
__m128 layer1_chunk5 = _mm_shuffle_ps(layer2_chunk4, layer2_chunk5, mask_hi);
// final layer: note the outputs are written in the order r0, g1, r1, b0, g0, b1
v_r0 = _mm_shuffle_ps(layer1_chunk0, layer1_chunk1, mask_lo);
v_g1 = _mm_shuffle_ps(layer1_chunk0, layer1_chunk1, mask_hi);
v_r1 = _mm_shuffle_ps(layer1_chunk2, layer1_chunk3, mask_lo);
v_b0 = _mm_shuffle_ps(layer1_chunk2, layer1_chunk3, mask_hi);
v_g0 = _mm_shuffle_ps(layer1_chunk4, layer1_chunk5, mask_lo);
v_b1 = _mm_shuffle_ps(layer1_chunk4, layer1_chunk5, mask_hi);
}
// Interleaves four planar channels of eight floats each.
//
// On entry v_r0:v_r1, v_g0:v_g1, v_b0:v_b1 and v_a0:v_a1 hold values 0..7 of
// channels 0, 1, 2 and 3. On return the eight registers, read in argument
// order (v_r0, v_r1, v_g0, v_g1, v_b0, v_b1, v_a0, v_a1), each hold one
// complete 4-channel group: register k contains c0[k], c1[k], c2[k], c3[k].
//
// Each step pairs two registers and emits either their even elements
// (mask_lo) or their odd elements (mask_hi). Three such layers complete the
// permutation.
inline void _mm_interleave_ps(__m128 & v_r0, __m128 & v_r1, __m128 & v_g0, __m128 & v_g1,
__m128 & v_b0, __m128 & v_b1, __m128 & v_a0, __m128 & v_a1)
{
// mask_lo selects elements 0 and 2 of each operand, mask_hi selects elements 1 and 3
const int mask_lo = _MM_SHUFFLE(2, 0, 2, 0), mask_hi = _MM_SHUFFLE(3, 1, 3, 1);
// layer 2: split each channel into its even-indexed and odd-indexed values
__m128 layer2_chunk0 = _mm_shuffle_ps(v_r0, v_r1, mask_lo);
__m128 layer2_chunk4 = _mm_shuffle_ps(v_r0, v_r1, mask_hi);
__m128 layer2_chunk1 = _mm_shuffle_ps(v_g0, v_g1, mask_lo);
__m128 layer2_chunk5 = _mm_shuffle_ps(v_g0, v_g1, mask_hi);
__m128 layer2_chunk2 = _mm_shuffle_ps(v_b0, v_b1, mask_lo);
__m128 layer2_chunk6 = _mm_shuffle_ps(v_b0, v_b1, mask_hi);
__m128 layer2_chunk3 = _mm_shuffle_ps(v_a0, v_a1, mask_lo);
__m128 layer2_chunk7 = _mm_shuffle_ps(v_a0, v_a1, mask_hi);
// layer 1: same even/odd split applied to consecutive pairs of layer-2 chunks
__m128 layer1_chunk0 = _mm_shuffle_ps(layer2_chunk0, layer2_chunk1, mask_lo);
__m128 layer1_chunk4 = _mm_shuffle_ps(layer2_chunk0, layer2_chunk1, mask_hi);
__m128 layer1_chunk1 = _mm_shuffle_ps(layer2_chunk2, layer2_chunk3, mask_lo);
__m128 layer1_chunk5 = _mm_shuffle_ps(layer2_chunk2, layer2_chunk3, mask_hi);
__m128 layer1_chunk2 = _mm_shuffle_ps(layer2_chunk4, layer2_chunk5, mask_lo);
__m128 layer1_chunk6 = _mm_shuffle_ps(layer2_chunk4, layer2_chunk5, mask_hi);
__m128 layer1_chunk3 = _mm_shuffle_ps(layer2_chunk6, layer2_chunk7, mask_lo);
__m128 layer1_chunk7 = _mm_shuffle_ps(layer2_chunk6, layer2_chunk7, mask_hi);
// final layer: note the outputs are written in the order r0, b0, r1, b1, g0, a0, g1, a1
v_r0 = _mm_shuffle_ps(layer1_chunk0, layer1_chunk1, mask_lo);
v_b0 = _mm_shuffle_ps(layer1_chunk0, layer1_chunk1, mask_hi);
v_r1 = _mm_shuffle_ps(layer1_chunk2, layer1_chunk3, mask_lo);
v_b1 = _mm_shuffle_ps(layer1_chunk2, layer1_chunk3, mask_hi);
v_g0 = _mm_shuffle_ps(layer1_chunk4, layer1_chunk5, mask_lo);
v_a0 = _mm_shuffle_ps(layer1_chunk4, layer1_chunk5, mask_hi);
v_g1 = _mm_shuffle_ps(layer1_chunk6, layer1_chunk7, mask_lo);
v_a1 = _mm_shuffle_ps(layer1_chunk6, layer1_chunk7, mask_hi);
}
#endif // CV_SSE2
#endif //__OPENCV_CORE_SSE_UTILS_HPP__

View File

@ -13,6 +13,7 @@
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved. // Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Copyright (C) 2013, OpenCV Foundation, all rights reserved. // Copyright (C) 2013, OpenCV Foundation, all rights reserved.
// Copyright (C) 2015, Itseez Inc., all rights reserved.
// Third party copyrights are property of their respective owners. // Third party copyrights are property of their respective owners.
// //
// Redistribution and use in source and binary forms, with or without modification, // Redistribution and use in source and binary forms, with or without modification,
@ -281,16 +282,30 @@ CV_EXPORTS_W int64 getCPUTickCount();
remember to keep this list identical to the one in cvdef.h remember to keep this list identical to the one in cvdef.h
*/ */
enum CpuFeatures { enum CpuFeatures {
CPU_MMX = 1, CPU_MMX = 1,
CPU_SSE = 2, CPU_SSE = 2,
CPU_SSE2 = 3, CPU_SSE2 = 3,
CPU_SSE3 = 4, CPU_SSE3 = 4,
CPU_SSSE3 = 5, CPU_SSSE3 = 5,
CPU_SSE4_1 = 6, CPU_SSE4_1 = 6,
CPU_SSE4_2 = 7, CPU_SSE4_2 = 7,
CPU_POPCNT = 8, CPU_POPCNT = 8,
CPU_AVX = 10,
CPU_NEON = 11 CPU_AVX = 10,
CPU_AVX2 = 11,
CPU_FMA3 = 12,
CPU_AVX_512F = 13,
CPU_AVX_512BW = 14,
CPU_AVX_512CD = 15,
CPU_AVX_512DQ = 16,
CPU_AVX_512ER = 17,
CPU_AVX_512IFMA512 = 18,
CPU_AVX_512PF = 19,
CPU_AVX_512VBMI = 20,
CPU_AVX_512VL = 21,
CPU_NEON = 100
}; };
/** @brief Returns true if the specified feature is supported by the host hardware. /** @brief Returns true if the specified feature is supported by the host hardware.

View File

@ -242,3 +242,31 @@ PERF_TEST_P(Size_MatType, multiplyScale, TYPICAL_MATS_CORE_ARITHM)
SANITY_CHECK(c, 1e-8); SANITY_CHECK(c, 1e-8);
} }
// Performance test: times divide(a, b, c, scale) with a fixed scale of 0.5 for
// every size/type combination supplied by TYPICAL_MATS_CORE_ARITHM.
// Only timing is collected; the result is not compared against a reference.
PERF_TEST_P(Size_MatType, divide, TYPICAL_MATS_CORE_ARITHM)
{
    const Size matSize = get<0>(GetParam());
    const int matType = get<1>(GetParam());
    const double scale = 0.5;

    cv::Mat a(matSize, matType);
    cv::Mat b(matSize, matType);
    cv::Mat c(matSize, matType);

    declare.in(a, b, WARMUP_RNG).out(c);

    TEST_CYCLE() divide(a, b, c, scale);

    SANITY_CHECK_NOTHING();
}
// Performance test: times the scalar-over-matrix form divide(scale, b, c) with
// a fixed scale of 0.5 for every size/type combination supplied by
// TYPICAL_MATS_CORE_ARITHM.
// Only timing is collected; the result is not compared against a reference.
PERF_TEST_P(Size_MatType, reciprocal, TYPICAL_MATS_CORE_ARITHM)
{
    const Size matSize = get<0>(GetParam());
    const int matType = get<1>(GetParam());
    const double scale = 0.5;

    cv::Mat b(matSize, matType);
    cv::Mat c(matSize, matType);

    declare.in(b, WARMUP_RNG).out(c);

    TEST_CYCLE() divide(scale, b, c);

    SANITY_CHECK_NOTHING();
}

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -11,6 +11,7 @@
// //
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009-2011, Willow Garage Inc., all rights reserved. // Copyright (C) 2009-2011, Willow Garage Inc., all rights reserved.
// Copyright (C) 2014, Itseez Inc., all rights reserved.
// Third party copyrights are property of their respective owners. // Third party copyrights are property of their respective owners.
// //
// Redistribution and use in source and binary forms, with or without modification, // Redistribution and use in source and binary forms, with or without modification,

View File

@ -12,6 +12,7 @@
// //
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009-2011, Willow Garage Inc., all rights reserved. // Copyright (C) 2009-2011, Willow Garage Inc., all rights reserved.
// Copyright (C) 2014-2015, Itseez Inc., all rights reserved.
// Third party copyrights are property of their respective owners. // Third party copyrights are property of their respective owners.
// //
// Redistribution and use in source and binary forms, with or without modification, // Redistribution and use in source and binary forms, with or without modification,
@ -593,14 +594,46 @@ void phase( InputArray src1, InputArray src2, OutputArray dst, bool angleInDegre
{ {
const double *x = (const double*)ptrs[0], *y = (const double*)ptrs[1]; const double *x = (const double*)ptrs[0], *y = (const double*)ptrs[1];
double *angle = (double*)ptrs[2]; double *angle = (double*)ptrs[2];
for( k = 0; k < len; k++ ) k = 0;
#if CV_SSE2
if (USE_SSE2)
{
for ( ; k <= len - 4; k += 4)
{
__m128 v_dst0 = _mm_movelh_ps(_mm_cvtpd_ps(_mm_loadu_pd(x + k)),
_mm_cvtpd_ps(_mm_loadu_pd(x + k + 2)));
__m128 v_dst1 = _mm_movelh_ps(_mm_cvtpd_ps(_mm_loadu_pd(y + k)),
_mm_cvtpd_ps(_mm_loadu_pd(y + k + 2)));
_mm_storeu_ps(buf[0] + k, v_dst0);
_mm_storeu_ps(buf[1] + k, v_dst1);
}
}
#endif
for( ; k < len; k++ )
{ {
buf[0][k] = (float)x[k]; buf[0][k] = (float)x[k];
buf[1][k] = (float)y[k]; buf[1][k] = (float)y[k];
} }
FastAtan2_32f( buf[1], buf[0], buf[0], len, angleInDegrees ); FastAtan2_32f( buf[1], buf[0], buf[0], len, angleInDegrees );
for( k = 0; k < len; k++ ) k = 0;
#if CV_SSE2
if (USE_SSE2)
{
for ( ; k <= len - 4; k += 4)
{
__m128 v_src = _mm_loadu_ps(buf[0] + k);
_mm_storeu_pd(angle + k, _mm_cvtps_pd(v_src));
_mm_storeu_pd(angle + k + 2, _mm_cvtps_pd(_mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(v_src), 8))));
}
}
#endif
for( ; k < len; k++ )
angle[k] = buf[0][k]; angle[k] = buf[0][k];
} }
ptrs[0] += len*esz1; ptrs[0] += len*esz1;
@ -698,14 +731,46 @@ void cartToPolar( InputArray src1, InputArray src2,
double *angle = (double*)ptrs[3]; double *angle = (double*)ptrs[3];
Magnitude_64f(x, y, (double*)ptrs[2], len); Magnitude_64f(x, y, (double*)ptrs[2], len);
for( k = 0; k < len; k++ ) k = 0;
#if CV_SSE2
if (USE_SSE2)
{
for ( ; k <= len - 4; k += 4)
{
__m128 v_dst0 = _mm_movelh_ps(_mm_cvtpd_ps(_mm_loadu_pd(x + k)),
_mm_cvtpd_ps(_mm_loadu_pd(x + k + 2)));
__m128 v_dst1 = _mm_movelh_ps(_mm_cvtpd_ps(_mm_loadu_pd(y + k)),
_mm_cvtpd_ps(_mm_loadu_pd(y + k + 2)));
_mm_storeu_ps(buf[0] + k, v_dst0);
_mm_storeu_ps(buf[1] + k, v_dst1);
}
}
#endif
for( ; k < len; k++ )
{ {
buf[0][k] = (float)x[k]; buf[0][k] = (float)x[k];
buf[1][k] = (float)y[k]; buf[1][k] = (float)y[k];
} }
FastAtan2_32f( buf[1], buf[0], buf[0], len, angleInDegrees ); FastAtan2_32f( buf[1], buf[0], buf[0], len, angleInDegrees );
for( k = 0; k < len; k++ ) k = 0;
#if CV_SSE2
if (USE_SSE2)
{
for ( ; k <= len - 4; k += 4)
{
__m128 v_src = _mm_loadu_ps(buf[0] + k);
_mm_storeu_pd(angle + k, _mm_cvtps_pd(v_src));
_mm_storeu_pd(angle + k + 2, _mm_cvtps_pd(_mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(v_src), 8))));
}
}
#endif
for( ; k < len; k++ )
angle[k] = buf[0][k]; angle[k] = buf[0][k];
} }
ptrs[0] += len*esz1; ptrs[0] += len*esz1;
@ -771,14 +836,77 @@ static void SinCos_32f( const float *angle, float *sinval, float* cosval,
/*static const double cos_a2 = 1;*/ /*static const double cos_a2 = 1;*/
double k1; double k1;
int i; int i = 0;
if( !angle_in_degrees ) if( !angle_in_degrees )
k1 = N/(2*CV_PI); k1 = N/(2*CV_PI);
else else
k1 = N/360.; k1 = N/360.;
for( i = 0; i < len; i++ ) #if CV_AVX2
if (USE_AVX2)
{
__m128d v_k1 = _mm_set1_pd(k1);
__m128d v_1 = _mm_set1_pd(1);
__m128i v_N1 = _mm_set1_epi32(N - 1);
__m128i v_N4 = _mm_set1_epi32(N >> 2);
__m128d v_sin_a0 = _mm_set1_pd(sin_a0);
__m128d v_sin_a2 = _mm_set1_pd(sin_a2);
__m128d v_cos_a0 = _mm_set1_pd(cos_a0);
for ( ; i <= len - 4; i += 4)
{
__m128 v_angle = _mm_loadu_ps(angle + i);
// 0-1
__m128d v_t = _mm_mul_pd(_mm_cvtps_pd(v_angle), v_k1);
__m128i v_it = _mm_cvtpd_epi32(v_t);
v_t = _mm_sub_pd(v_t, _mm_cvtepi32_pd(v_it));
__m128i v_sin_idx = _mm_and_si128(v_it, v_N1);
__m128i v_cos_idx = _mm_and_si128(_mm_sub_epi32(v_N4, v_sin_idx), v_N1);
__m128d v_t2 = _mm_mul_pd(v_t, v_t);
__m128d v_sin_b = _mm_mul_pd(_mm_add_pd(_mm_mul_pd(v_sin_a0, v_t2), v_sin_a2), v_t);
__m128d v_cos_b = _mm_add_pd(_mm_mul_pd(v_cos_a0, v_t2), v_1);
__m128d v_sin_a = _mm_i32gather_pd(sin_table, v_sin_idx, 8);
__m128d v_cos_a = _mm_i32gather_pd(sin_table, v_cos_idx, 8);
__m128d v_sin_val_0 = _mm_add_pd(_mm_mul_pd(v_sin_a, v_cos_b),
_mm_mul_pd(v_cos_a, v_sin_b));
__m128d v_cos_val_0 = _mm_sub_pd(_mm_mul_pd(v_cos_a, v_cos_b),
_mm_mul_pd(v_sin_a, v_sin_b));
// 2-3
v_t = _mm_mul_pd(_mm_cvtps_pd(_mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(v_angle), 8))), v_k1);
v_it = _mm_cvtpd_epi32(v_t);
v_t = _mm_sub_pd(v_t, _mm_cvtepi32_pd(v_it));
v_sin_idx = _mm_and_si128(v_it, v_N1);
v_cos_idx = _mm_and_si128(_mm_sub_epi32(v_N4, v_sin_idx), v_N1);
v_t2 = _mm_mul_pd(v_t, v_t);
v_sin_b = _mm_mul_pd(_mm_add_pd(_mm_mul_pd(v_sin_a0, v_t2), v_sin_a2), v_t);
v_cos_b = _mm_add_pd(_mm_mul_pd(v_cos_a0, v_t2), v_1);
v_sin_a = _mm_i32gather_pd(sin_table, v_sin_idx, 8);
v_cos_a = _mm_i32gather_pd(sin_table, v_cos_idx, 8);
__m128d v_sin_val_1 = _mm_add_pd(_mm_mul_pd(v_sin_a, v_cos_b),
_mm_mul_pd(v_cos_a, v_sin_b));
__m128d v_cos_val_1 = _mm_sub_pd(_mm_mul_pd(v_cos_a, v_cos_b),
_mm_mul_pd(v_sin_a, v_sin_b));
_mm_storeu_ps(sinval + i, _mm_movelh_ps(_mm_cvtpd_ps(v_sin_val_0),
_mm_cvtpd_ps(v_sin_val_1)));
_mm_storeu_ps(cosval + i, _mm_movelh_ps(_mm_cvtpd_ps(v_cos_val_0),
_mm_cvtpd_ps(v_cos_val_1)));
}
}
#endif
for( ; i < len; i++ )
{ {
double t = angle[i]*k1; double t = angle[i]*k1;
int it = cvRound(t); int it = cvRound(t);
@ -914,6 +1042,16 @@ void polarToCart( InputArray src1, InputArray src2,
vst1q_f32(x + k, vmulq_f32(vld1q_f32(x + k), v_m)); vst1q_f32(x + k, vmulq_f32(vld1q_f32(x + k), v_m));
vst1q_f32(y + k, vmulq_f32(vld1q_f32(y + k), v_m)); vst1q_f32(y + k, vmulq_f32(vld1q_f32(y + k), v_m));
} }
#elif CV_SSE2
if (USE_SSE2)
{
for( ; k <= len - 4; k += 4 )
{
__m128 v_m = _mm_loadu_ps(mag + k);
_mm_storeu_ps(x + k, _mm_mul_ps(_mm_loadu_ps(x + k), v_m));
_mm_storeu_ps(y + k, _mm_mul_ps(_mm_loadu_ps(y + k), v_m));
}
}
#endif #endif
for( ; k < len; k++ ) for( ; k < len; k++ )
@ -939,10 +1077,10 @@ void polarToCart( InputArray src1, InputArray src2,
x[k] = buf[0][k]*m; y[k] = buf[1][k]*m; x[k] = buf[0][k]*m; y[k] = buf[1][k]*m;
} }
else else
for( k = 0; k < len; k++ ) {
{ std::memcpy(x, buf[0], sizeof(float) * len);
x[k] = buf[0][k]; y[k] = buf[1][k]; std::memcpy(y, buf[1], sizeof(float) * len);
} }
} }
if( ptrs[0] ) if( ptrs[0] )

View File

@ -12,6 +12,7 @@
// //
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009-2011, Willow Garage Inc., all rights reserved. // Copyright (C) 2009-2011, Willow Garage Inc., all rights reserved.
// Copyright (C) 2014-2015, Itseez Inc., all rights reserved.
// Third party copyrights are property of their respective owners. // Third party copyrights are property of their respective owners.
// //
// Redistribution and use in source and binary forms, with or without modification, // Redistribution and use in source and binary forms, with or without modification,

View File

@ -192,6 +192,7 @@ struct NoVec
extern volatile bool USE_SSE2; extern volatile bool USE_SSE2;
extern volatile bool USE_SSE4_2; extern volatile bool USE_SSE4_2;
extern volatile bool USE_AVX; extern volatile bool USE_AVX;
extern volatile bool USE_AVX2;
enum { BLOCK_SIZE = 1024 }; enum { BLOCK_SIZE = 1024 };

View File

@ -12,6 +12,7 @@
// //
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009-2011, Willow Garage Inc., all rights reserved. // Copyright (C) 2009-2011, Willow Garage Inc., all rights reserved.
// Copyright (C) 2014-2015, Itseez Inc., all rights reserved.
// Third party copyrights are property of their respective owners. // Third party copyrights are property of their respective owners.
// //
// Redistribution and use in source and binary forms, with or without modification, // Redistribution and use in source and binary forms, with or without modification,
@ -72,7 +73,114 @@ struct Sum_SIMD
} }
}; };
#if CV_NEON #if CV_SSE2
template <>
struct Sum_SIMD<schar, int>
{
int operator () (const schar * src0, const uchar * mask, int * dst, int len, int cn) const
{
if (mask || (cn != 1 && cn != 2 && cn != 4) || !USE_SSE2)
return 0;
int x = 0;
__m128i v_zero = _mm_setzero_si128(), v_sum = v_zero;
for ( ; x <= len - 16; x += 16)
{
__m128i v_src = _mm_loadu_si128((const __m128i *)(src0 + x));
__m128i v_half = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, v_src), 8);
v_sum = _mm_add_epi32(v_sum, _mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_half), 16));
v_sum = _mm_add_epi32(v_sum, _mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_half), 16));
v_half = _mm_srai_epi16(_mm_unpackhi_epi8(v_zero, v_src), 8);
v_sum = _mm_add_epi32(v_sum, _mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_half), 16));
v_sum = _mm_add_epi32(v_sum, _mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_half), 16));
}
for ( ; x <= len - 8; x += 8)
{
__m128i v_src = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, _mm_loadl_epi64((__m128i const *)(src0 + x))), 8);
v_sum = _mm_add_epi32(v_sum, _mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src), 16));
v_sum = _mm_add_epi32(v_sum, _mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src), 16));
}
int CV_DECL_ALIGNED(16) ar[4];
_mm_store_si128((__m128i*)ar, v_sum);
for (int i = 0; i < 4; i += cn)
for (int j = 0; j < cn; ++j)
dst[j] += ar[j + i];
return x / cn;
}
};
template <>
struct Sum_SIMD<int, double>
{
int operator () (const int * src0, const uchar * mask, double * dst, int len, int cn) const
{
if (mask || (cn != 1 && cn != 2 && cn != 4) || !USE_SSE2)
return 0;
int x = 0;
__m128d v_zero = _mm_setzero_pd(), v_sum0 = v_zero, v_sum1 = v_zero;
for ( ; x <= len - 4; x += 4)
{
__m128i v_src = _mm_loadu_si128((__m128i const *)(src0 + x));
v_sum0 = _mm_add_pd(v_sum0, _mm_cvtepi32_pd(v_src));
v_sum1 = _mm_add_pd(v_sum1, _mm_cvtepi32_pd(_mm_srli_si128(v_src, 8)));
}
double CV_DECL_ALIGNED(16) ar[4];
_mm_store_pd(ar, v_sum0);
_mm_store_pd(ar + 2, v_sum1);
for (int i = 0; i < 4; i += cn)
for (int j = 0; j < cn; ++j)
dst[j] += ar[j + i];
return x / cn;
}
};
template <>
struct Sum_SIMD<float, double>
{
int operator () (const float * src0, const uchar * mask, double * dst, int len, int cn) const
{
if (mask || (cn != 1 && cn != 2 && cn != 4) || !USE_SSE2)
return 0;
int x = 0;
__m128d v_zero = _mm_setzero_pd(), v_sum0 = v_zero, v_sum1 = v_zero;
for ( ; x <= len - 4; x += 4)
{
__m128 v_src = _mm_loadu_ps(src0 + x);
v_sum0 = _mm_add_pd(v_sum0, _mm_cvtps_pd(v_src));
v_src = _mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(v_src), 8));
v_sum1 = _mm_add_pd(v_sum1, _mm_cvtps_pd(v_src));
}
double CV_DECL_ALIGNED(16) ar[4];
_mm_store_pd(ar, v_sum0);
_mm_store_pd(ar + 2, v_sum1);
for (int i = 0; i < 4; i += cn)
for (int j = 0; j < cn; ++j)
dst[j] += ar[j + i];
return x / cn;
}
};
#elif CV_NEON
template <> template <>
struct Sum_SIMD<uchar, int> struct Sum_SIMD<uchar, int>
@ -396,6 +504,38 @@ static int countNonZero_(const T* src, int len )
return nz; return nz;
} }
#if CV_SSE2
// Returns a pointer to a lazily built 256-entry "inverse popcount" table:
// tab[m] is the number of ZERO bits among the low 8 bits of m. Callers index
// it with an 8-bit (element == 0) comparison mask, so the entry is the number
// of non-zero elements in that group of eight.
//
// When CV_POPCNT is compiled in and the CPU reports POPCNT support, the table
// is filled with _mm_popcnt_u32. In every other case, including a POPCNT build
// running on a CPU without the instruction, the portable bit-counting loop
// fills whatever entries are still missing. It must not sit in an #else
// branch, otherwise that last case would leave the table zeroed while still
// marking it initialized.
//
// The initialized flag is not protected by a lock.
static const uchar * initPopcountTable()
{
    static uchar tab[256];
    static volatile bool initialized = false;

    if( !initialized )
    {
        // j is shared by both loops: the portable loop resumes wherever the
        // hardware loop stopped (at 0 if it never ran, at 256 if it finished).
        unsigned int j = 0u;

#if CV_POPCNT
        if (checkHardwareSupport(CV_CPU_POPCNT))
        {
            for( ; j < 256u; j++ )
                tab[j] = (uchar)(8 - _mm_popcnt_u32(j));
        }
#endif

        for( ; j < 256u; j++ )
        {
            int val = 0;
            for( int mask = 1; mask < 256; mask += mask )
                val += (j & mask) == 0;
            tab[j] = (uchar)val;
        }

        initialized = true;
    }

    return tab;
}
#endif
static int countNonZero8u( const uchar* src, int len ) static int countNonZero8u( const uchar* src, int len )
{ {
int i=0, nz = 0; int i=0, nz = 0;
@ -403,21 +543,7 @@ static int countNonZero8u( const uchar* src, int len )
if(USE_SSE2)//5x-6x if(USE_SSE2)//5x-6x
{ {
__m128i pattern = _mm_setzero_si128 (); __m128i pattern = _mm_setzero_si128 ();
static uchar tab[256]; static const uchar * tab = initPopcountTable();
static volatile bool initialized = false;
if( !initialized )
{
// we compute inverse popcount table,
// since we pass (img[x] == 0) mask as index in the table.
for( int j = 0; j < 256; j++ )
{
int val = 0;
for( int mask = 1; mask < 256; mask += mask )
val += (j & mask) == 0;
tab[j] = (uchar)val;
}
initialized = true;
}
for (; i<=len-16; i+=16) for (; i<=len-16; i+=16)
{ {
@ -467,7 +593,22 @@ static int countNonZero8u( const uchar* src, int len )
static int countNonZero16u( const ushort* src, int len ) static int countNonZero16u( const ushort* src, int len )
{ {
int i = 0, nz = 0; int i = 0, nz = 0;
#if CV_NEON #if CV_SSE2
if (USE_SSE2)
{
__m128i v_zero = _mm_setzero_si128 ();
static const uchar * tab = initPopcountTable();
for ( ; i <= len - 8; i += 8)
{
__m128i v_src = _mm_loadu_si128((const __m128i*)(src + i));
int val = _mm_movemask_epi8(_mm_packs_epi16(_mm_cmpeq_epi16(v_src, v_zero), v_zero));
nz += tab[val];
}
src += i;
}
#elif CV_NEON
int len0 = len & -8, blockSize1 = (1 << 15), blockSize0 = blockSize1 << 6; int len0 = len & -8, blockSize1 = (1 << 15), blockSize0 = blockSize1 << 6;
uint32x4_t v_nz = vdupq_n_u32(0u); uint32x4_t v_nz = vdupq_n_u32(0u);
uint16x8_t v_zero = vdupq_n_u16(0), v_1 = vdupq_n_u16(1); uint16x8_t v_zero = vdupq_n_u16(0), v_1 = vdupq_n_u16(1);
@ -503,7 +644,27 @@ static int countNonZero16u( const ushort* src, int len )
static int countNonZero32s( const int* src, int len ) static int countNonZero32s( const int* src, int len )
{ {
int i = 0, nz = 0; int i = 0, nz = 0;
#if CV_NEON #if CV_SSE2
if (USE_SSE2)
{
__m128i v_zero = _mm_setzero_si128 ();
static const uchar * tab = initPopcountTable();
for ( ; i <= len - 8; i += 8)
{
__m128i v_src = _mm_loadu_si128((const __m128i*)(src + i));
__m128i v_dst0 = _mm_cmpeq_epi32(v_src, v_zero);
v_src = _mm_loadu_si128((const __m128i*)(src + i + 4));
__m128i v_dst1 = _mm_cmpeq_epi32(v_src, v_zero);
int val = _mm_movemask_epi8(_mm_packs_epi16(_mm_packs_epi32(v_dst0, v_dst1), v_zero));
nz += tab[val];
}
src += i;
}
#elif CV_NEON
int len0 = len & -8, blockSize1 = (1 << 15), blockSize0 = blockSize1 << 6; int len0 = len & -8, blockSize1 = (1 << 15), blockSize0 = blockSize1 << 6;
uint32x4_t v_nz = vdupq_n_u32(0u); uint32x4_t v_nz = vdupq_n_u32(0u);
int32x4_t v_zero = vdupq_n_s32(0.0f); int32x4_t v_zero = vdupq_n_s32(0.0f);
@ -541,7 +702,25 @@ static int countNonZero32s( const int* src, int len )
static int countNonZero32f( const float* src, int len ) static int countNonZero32f( const float* src, int len )
{ {
int i = 0, nz = 0; int i = 0, nz = 0;
#if CV_NEON #if CV_SSE2
if (USE_SSE2)
{
__m128i v_zero_i = _mm_setzero_si128();
__m128 v_zero_f = _mm_setzero_ps();
static const uchar * tab = initPopcountTable();
for ( ; i <= len - 8; i += 8)
{
__m128i v_dst0 = _mm_castps_si128(_mm_cmpeq_ps(_mm_loadu_ps(src + i), v_zero_f));
__m128i v_dst1 = _mm_castps_si128(_mm_cmpeq_ps(_mm_loadu_ps(src + i + 4), v_zero_f));
int val = _mm_movemask_epi8(_mm_packs_epi16(_mm_packs_epi32(v_dst0, v_dst1), v_zero_i));
nz += tab[val];
}
src += i;
}
#elif CV_NEON
int len0 = len & -8, blockSize1 = (1 << 15), blockSize0 = blockSize1 << 6; int len0 = len & -8, blockSize1 = (1 << 15), blockSize0 = blockSize1 << 6;
uint32x4_t v_nz = vdupq_n_u32(0u); uint32x4_t v_nz = vdupq_n_u32(0u);
float32x4_t v_zero = vdupq_n_f32(0.0f); float32x4_t v_zero = vdupq_n_f32(0.0f);
@ -577,7 +756,34 @@ static int countNonZero32f( const float* src, int len )
} }
static int countNonZero64f( const double* src, int len ) static int countNonZero64f( const double* src, int len )
{ return countNonZero_(src, len); } {
int i = 0, nz = 0;
#if CV_SSE2
if (USE_SSE2)
{
__m128i v_zero_i = _mm_setzero_si128();
__m128d v_zero_d = _mm_setzero_pd();
static const uchar * tab = initPopcountTable();
for ( ; i <= len - 8; i += 8)
{
__m128i v_dst0 = _mm_castpd_si128(_mm_cmpeq_pd(_mm_loadu_pd(src + i), v_zero_d));
__m128i v_dst1 = _mm_castpd_si128(_mm_cmpeq_pd(_mm_loadu_pd(src + i + 2), v_zero_d));
__m128i v_dst2 = _mm_castpd_si128(_mm_cmpeq_pd(_mm_loadu_pd(src + i + 4), v_zero_d));
__m128i v_dst3 = _mm_castpd_si128(_mm_cmpeq_pd(_mm_loadu_pd(src + i + 6), v_zero_d));
v_dst0 = _mm_packs_epi32(v_dst0, v_dst1);
v_dst1 = _mm_packs_epi32(v_dst2, v_dst3);
int val = _mm_movemask_epi8(_mm_packs_epi16(_mm_packs_epi32(v_dst0, v_dst1), v_zero_i));
nz += tab[val];
}
src += i;
}
#endif
return nz + countNonZero_(src, len - i);
}
typedef int (*CountNonZeroFunc)(const uchar*, int); typedef int (*CountNonZeroFunc)(const uchar*, int);
@ -594,6 +800,137 @@ static CountNonZeroFunc getCountNonZeroTab(int depth)
return countNonZeroTab[depth]; return countNonZeroTab[depth];
} }
// Generic (non-vectorized) SumSqr_SIMD: handles nothing and reports that zero
// elements were processed, leaving all the work to the caller.
template <typename T, typename ST, typename SQT>
struct SumSqr_SIMD
{
    int operator () (const T * /*src*/, const uchar * /*mask*/, ST * /*sum*/, SQT * /*sqsum*/,
                     int /*len*/, int /*cn*/) const
    {
        const int processed = 0;
        return processed;
    }
};
#if CV_SSE2
template <>
struct SumSqr_SIMD<uchar, int, int>
{
int operator () (const uchar * src0, const uchar * mask, int * sum, int * sqsum, int len, int cn) const
{
if (mask || (cn != 1 && cn != 2) || !USE_SSE2)
return 0;
int x = 0;
__m128i v_zero = _mm_setzero_si128(), v_sum = v_zero, v_sqsum = v_zero;
for ( ; x <= len - 16; x += 16)
{
__m128i v_src = _mm_loadu_si128((const __m128i *)(src0 + x));
__m128i v_half = _mm_unpacklo_epi8(v_src, v_zero);
__m128i v_mullo = _mm_mullo_epi16(v_half, v_half);
__m128i v_mulhi = _mm_mulhi_epi16(v_half, v_half);
v_sum = _mm_add_epi32(v_sum, _mm_unpacklo_epi16(v_half, v_zero));
v_sum = _mm_add_epi32(v_sum, _mm_unpackhi_epi16(v_half, v_zero));
v_sqsum = _mm_add_epi32(v_sqsum, _mm_unpacklo_epi16(v_mullo, v_mulhi));
v_sqsum = _mm_add_epi32(v_sqsum, _mm_unpackhi_epi16(v_mullo, v_mulhi));
v_half = _mm_unpackhi_epi8(v_src, v_zero);
v_mullo = _mm_mullo_epi16(v_half, v_half);
v_mulhi = _mm_mulhi_epi16(v_half, v_half);
v_sum = _mm_add_epi32(v_sum, _mm_unpacklo_epi16(v_half, v_zero));
v_sum = _mm_add_epi32(v_sum, _mm_unpackhi_epi16(v_half, v_zero));
v_sqsum = _mm_add_epi32(v_sqsum, _mm_unpacklo_epi16(v_mullo, v_mulhi));
v_sqsum = _mm_add_epi32(v_sqsum, _mm_unpackhi_epi16(v_mullo, v_mulhi));
}
for ( ; x <= len - 8; x += 8)
{
__m128i v_src = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i const *)(src0 + x)), v_zero);
__m128i v_mullo = _mm_mullo_epi16(v_src, v_src);
__m128i v_mulhi = _mm_mulhi_epi16(v_src, v_src);
v_sum = _mm_add_epi32(v_sum, _mm_unpacklo_epi16(v_src, v_zero));
v_sum = _mm_add_epi32(v_sum, _mm_unpackhi_epi16(v_src, v_zero));
v_sqsum = _mm_add_epi32(v_sqsum, _mm_unpacklo_epi16(v_mullo, v_mulhi));
v_sqsum = _mm_add_epi32(v_sqsum, _mm_unpackhi_epi16(v_mullo, v_mulhi));
}
int CV_DECL_ALIGNED(16) ar[8];
_mm_store_si128((__m128i*)ar, v_sum);
_mm_store_si128((__m128i*)(ar + 4), v_sqsum);
for (int i = 0; i < 4; i += cn)
for (int j = 0; j < cn; ++j)
{
sum[j] += ar[j + i];
sqsum[j] += ar[4 + j + i];
}
return x / cn;
}
};
template <>
struct SumSqr_SIMD<schar, int, int>
{
int operator () (const schar * src0, const uchar * mask, int * sum, int * sqsum, int len, int cn) const
{
if (mask || (cn != 1 && cn != 2) || !USE_SSE2)
return 0;
int x = 0;
__m128i v_zero = _mm_setzero_si128(), v_sum = v_zero, v_sqsum = v_zero;
for ( ; x <= len - 16; x += 16)
{
__m128i v_src = _mm_loadu_si128((const __m128i *)(src0 + x));
__m128i v_half = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, v_src), 8);
__m128i v_mullo = _mm_mullo_epi16(v_half, v_half);
__m128i v_mulhi = _mm_mulhi_epi16(v_half, v_half);
v_sum = _mm_add_epi32(v_sum, _mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_half), 16));
v_sum = _mm_add_epi32(v_sum, _mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_half), 16));
v_sqsum = _mm_add_epi32(v_sqsum, _mm_unpacklo_epi16(v_mullo, v_mulhi));
v_sqsum = _mm_add_epi32(v_sqsum, _mm_unpackhi_epi16(v_mullo, v_mulhi));
v_half = _mm_srai_epi16(_mm_unpackhi_epi8(v_zero, v_src), 8);
v_mullo = _mm_mullo_epi16(v_half, v_half);
v_mulhi = _mm_mulhi_epi16(v_half, v_half);
v_sum = _mm_add_epi32(v_sum, _mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_half), 16));
v_sum = _mm_add_epi32(v_sum, _mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_half), 16));
v_sqsum = _mm_add_epi32(v_sqsum, _mm_unpacklo_epi16(v_mullo, v_mulhi));
v_sqsum = _mm_add_epi32(v_sqsum, _mm_unpackhi_epi16(v_mullo, v_mulhi));
}
for ( ; x <= len - 8; x += 8)
{
__m128i v_src = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, _mm_loadl_epi64((__m128i const *)(src0 + x))), 8);
__m128i v_mullo = _mm_mullo_epi16(v_src, v_src);
__m128i v_mulhi = _mm_mulhi_epi16(v_src, v_src);
v_sum = _mm_add_epi32(v_sum, _mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src), 16));
v_sum = _mm_add_epi32(v_sum, _mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src), 16));
v_sqsum = _mm_add_epi32(v_sqsum, _mm_unpacklo_epi16(v_mullo, v_mulhi));
v_sqsum = _mm_add_epi32(v_sqsum, _mm_unpackhi_epi16(v_mullo, v_mulhi));
}
int CV_DECL_ALIGNED(16) ar[8];
_mm_store_si128((__m128i*)ar, v_sum);
_mm_store_si128((__m128i*)(ar + 4), v_sqsum);
for (int i = 0; i < 4; i += cn)
for (int j = 0; j < cn; ++j)
{
sum[j] += ar[j + i];
sqsum[j] += ar[4 + j + i];
}
return x / cn;
}
};
#endif
template<typename T, typename ST, typename SQT> template<typename T, typename ST, typename SQT>
static int sumsqr_(const T* src0, const uchar* mask, ST* sum, SQT* sqsum, int len, int cn ) static int sumsqr_(const T* src0, const uchar* mask, ST* sum, SQT* sqsum, int len, int cn )
{ {
@ -601,14 +938,15 @@ static int sumsqr_(const T* src0, const uchar* mask, ST* sum, SQT* sqsum, int le
if( !mask ) if( !mask )
{ {
int i; SumSqr_SIMD<T, ST, SQT> vop;
int k = cn % 4; int i = vop(src0, mask, sum, sqsum, len, cn), k = cn % 4;
src += i * cn;
if( k == 1 ) if( k == 1 )
{ {
ST s0 = sum[0]; ST s0 = sum[0];
SQT sq0 = sqsum[0]; SQT sq0 = sqsum[0];
for( i = 0; i < len; i++, src += cn ) for( ; i < len; i++, src += cn )
{ {
T v = src[0]; T v = src[0];
s0 += v; sq0 += (SQT)v*v; s0 += v; sq0 += (SQT)v*v;
@ -620,7 +958,7 @@ static int sumsqr_(const T* src0, const uchar* mask, ST* sum, SQT* sqsum, int le
{ {
ST s0 = sum[0], s1 = sum[1]; ST s0 = sum[0], s1 = sum[1];
SQT sq0 = sqsum[0], sq1 = sqsum[1]; SQT sq0 = sqsum[0], sq1 = sqsum[1];
for( i = 0; i < len; i++, src += cn ) for( ; i < len; i++, src += cn )
{ {
T v0 = src[0], v1 = src[1]; T v0 = src[0], v1 = src[1];
s0 += v0; sq0 += (SQT)v0*v0; s0 += v0; sq0 += (SQT)v0*v0;
@ -633,7 +971,7 @@ static int sumsqr_(const T* src0, const uchar* mask, ST* sum, SQT* sqsum, int le
{ {
ST s0 = sum[0], s1 = sum[1], s2 = sum[2]; ST s0 = sum[0], s1 = sum[1], s2 = sum[2];
SQT sq0 = sqsum[0], sq1 = sqsum[1], sq2 = sqsum[2]; SQT sq0 = sqsum[0], sq1 = sqsum[1], sq2 = sqsum[2];
for( i = 0; i < len; i++, src += cn ) for( ; i < len; i++, src += cn )
{ {
T v0 = src[0], v1 = src[1], v2 = src[2]; T v0 = src[0], v1 = src[1], v2 = src[2];
s0 += v0; sq0 += (SQT)v0*v0; s0 += v0; sq0 += (SQT)v0*v0;
@ -649,7 +987,7 @@ static int sumsqr_(const T* src0, const uchar* mask, ST* sum, SQT* sqsum, int le
src = src0 + k; src = src0 + k;
ST s0 = sum[k], s1 = sum[k+1], s2 = sum[k+2], s3 = sum[k+3]; ST s0 = sum[k], s1 = sum[k+1], s2 = sum[k+2], s3 = sum[k+3];
SQT sq0 = sqsum[k], sq1 = sqsum[k+1], sq2 = sqsum[k+2], sq3 = sqsum[k+3]; SQT sq0 = sqsum[k], sq1 = sqsum[k+1], sq2 = sqsum[k+2], sq3 = sqsum[k+3];
for( i = 0; i < len; i++, src += cn ) for( ; i < len; i++, src += cn )
{ {
T v0, v1; T v0, v1;
v0 = src[0], v1 = src[1]; v0 = src[0], v1 = src[1];
@ -924,7 +1262,6 @@ cv::Scalar cv::sum( InputArray _src )
} }
} }
#endif #endif
SumFunc func = getSumFunc(depth); SumFunc func = getSumFunc(depth);
CV_Assert( cn <= 4 && func != 0 ); CV_Assert( cn <= 4 && func != 0 );

View File

@ -12,6 +12,7 @@
// //
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved. // Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Copyright (C) 2015, Itseez Inc., all rights reserved.
// Third party copyrights are property of their respective owners. // Third party copyrights are property of their respective owners.
// //
// Redistribution and use in source and binary forms, with or without modification, // Redistribution and use in source and binary forms, with or without modification,
@ -89,6 +90,22 @@
pop ebx pop ebx
} }
} }
// Inline-assembly stand-in for __cpuidex.
// The two int parameters are unnamed and ignored: the body always executes
// `cpuid` with EAX = 7 and ECX = 0, regardless of what the caller passes.
// On return cpuid_data[0..3] hold EAX, EBX, ECX and EDX, in that order.
// NOTE(review): the enclosing #if is outside this view - presumably this is only
// compiled where the compiler does not supply __cpuidex itself; confirm.
static void __cpuidex(int* cpuid_data, int, int)
{
    // edi carries the output pointer across cpuid (which overwrites eax/ebx/ecx/edx)
    // and is saved/restored around the sequence.
    __asm
    {
        push edi
        mov edi, cpuid_data
        mov eax, 7
        mov ecx, 0
        cpuid
        mov [edi], eax
        mov [edi + 4], ebx
        mov [edi + 8], ecx
        mov [edi + 12], edx
        pop edi
    }
}
#endif #endif
#endif #endif
@ -208,7 +225,7 @@ struct HWFeatures
enum { MAX_FEATURE = CV_HARDWARE_MAX_FEATURE }; enum { MAX_FEATURE = CV_HARDWARE_MAX_FEATURE };
HWFeatures(void) HWFeatures(void)
{ {
memset( have, 0, sizeof(have) ); memset( have, 0, sizeof(have) );
x86_family = 0; x86_family = 0;
} }
@ -252,10 +269,54 @@ struct HWFeatures
f.have[CV_CPU_SSE2] = (cpuid_data[3] & (1<<26)) != 0; f.have[CV_CPU_SSE2] = (cpuid_data[3] & (1<<26)) != 0;
f.have[CV_CPU_SSE3] = (cpuid_data[2] & (1<<0)) != 0; f.have[CV_CPU_SSE3] = (cpuid_data[2] & (1<<0)) != 0;
f.have[CV_CPU_SSSE3] = (cpuid_data[2] & (1<<9)) != 0; f.have[CV_CPU_SSSE3] = (cpuid_data[2] & (1<<9)) != 0;
f.have[CV_CPU_FMA3] = (cpuid_data[2] & (1<<12)) != 0;
f.have[CV_CPU_SSE4_1] = (cpuid_data[2] & (1<<19)) != 0; f.have[CV_CPU_SSE4_1] = (cpuid_data[2] & (1<<19)) != 0;
f.have[CV_CPU_SSE4_2] = (cpuid_data[2] & (1<<20)) != 0; f.have[CV_CPU_SSE4_2] = (cpuid_data[2] & (1<<20)) != 0;
f.have[CV_CPU_POPCNT] = (cpuid_data[2] & (1<<23)) != 0; f.have[CV_CPU_POPCNT] = (cpuid_data[2] & (1<<23)) != 0;
f.have[CV_CPU_AVX] = (((cpuid_data[2] & (1<<28)) != 0)&&((cpuid_data[2] & (1<<27)) != 0));//OS uses XSAVE_XRSTORE and CPU support AVX f.have[CV_CPU_AVX] = (((cpuid_data[2] & (1<<28)) != 0)&&((cpuid_data[2] & (1<<27)) != 0));//OS uses XSAVE_XRSTORE and CPU support AVX
// make the second call to the cpuid command in order to get
// information about extended features like AVX2
#if defined _MSC_VER && (defined _M_IX86 || defined _M_X64)
__cpuidex(cpuid_data, 7, 0);
#elif defined __GNUC__ && (defined __i386__ || defined __x86_64__)
#ifdef __x86_64__
asm __volatile__
(
"movl $7, %%eax\n\t"
"movl $0, %%ecx\n\t"
"cpuid\n\t"
:[eax]"=a"(cpuid_data[0]),[ebx]"=b"(cpuid_data[1]),[ecx]"=c"(cpuid_data[2]),[edx]"=d"(cpuid_data[3])
:
: "cc"
);
#else
asm volatile
(
"pushl %%eax\n\t"
"pushl %%edx\n\t"
"movl $7,%%eax\n\t"
"movl $0,%%ecx\n\t"
"cpuid\n\t"
"popl %%edx\n\t"
"popl %%eax\n\t"
: "=b"(cpuid_data[1]), "=c"(cpuid_data[2])
:
: "cc"
);
#endif
#endif
f.have[CV_CPU_AVX2] = (cpuid_data[1] & (1<<5)) != 0;
f.have[CV_CPU_AVX_512F] = (cpuid_data[1] & (1<<16)) != 0;
f.have[CV_CPU_AVX_512DQ] = (cpuid_data[1] & (1<<17)) != 0;
f.have[CV_CPU_AVX_512IFMA512] = (cpuid_data[1] & (1<<21)) != 0;
f.have[CV_CPU_AVX_512PF] = (cpuid_data[1] & (1<<26)) != 0;
f.have[CV_CPU_AVX_512ER] = (cpuid_data[1] & (1<<27)) != 0;
f.have[CV_CPU_AVX_512CD] = (cpuid_data[1] & (1<<28)) != 0;
f.have[CV_CPU_AVX_512BW] = (cpuid_data[1] & (1<<30)) != 0;
f.have[CV_CPU_AVX_512VL] = (cpuid_data[1] & (1<<31)) != 0;
f.have[CV_CPU_AVX_512VBMI] = (cpuid_data[2] & (1<<1)) != 0;
} }
#if defined ANDROID || defined __linux__ #if defined ANDROID || defined __linux__
@ -318,6 +379,7 @@ IPPInitializer ippInitializer;
volatile bool USE_SSE2 = featuresEnabled.have[CV_CPU_SSE2]; volatile bool USE_SSE2 = featuresEnabled.have[CV_CPU_SSE2];
volatile bool USE_SSE4_2 = featuresEnabled.have[CV_CPU_SSE4_2]; volatile bool USE_SSE4_2 = featuresEnabled.have[CV_CPU_SSE4_2];
volatile bool USE_AVX = featuresEnabled.have[CV_CPU_AVX]; volatile bool USE_AVX = featuresEnabled.have[CV_CPU_AVX];
volatile bool USE_AVX2 = featuresEnabled.have[CV_CPU_AVX2];
void setUseOptimized( bool flag ) void setUseOptimized( bool flag )
{ {

View File

@ -10,8 +10,7 @@
// License Agreement // License Agreement
// For Open Source Computer Vision Library // For Open Source Computer Vision Library
// //
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. // Copyright (C) 2014, Itseez Inc., all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners. // Third party copyrights are property of their respective owners.
// //
// Redistribution and use in source and binary forms, with or without modification, // Redistribution and use in source and binary forms, with or without modification,

View File

@ -1577,7 +1577,7 @@ PARAM_TEST_CASE(ConvertScaleAbs, MatDepth, Channels, bool)
Size roiSize = randomSize(1, MAX_VALUE); Size roiSize = randomSize(1, MAX_VALUE);
Border srcBorder = randomBorder(0, use_roi ? MAX_VALUE : 0); Border srcBorder = randomBorder(0, use_roi ? MAX_VALUE : 0);
randomSubMat(src, src_roi, roiSize, srcBorder, stype, 2, 11); // FIXIT: Test with minV, maxV randomSubMat(src, src_roi, roiSize, srcBorder, stype, -11, 11); // FIXIT: Test with minV, maxV
Border dstBorder = randomBorder(0, use_roi ? MAX_VALUE : 0); Border dstBorder = randomBorder(0, use_roi ? MAX_VALUE : 0);
randomSubMat(dst, dst_roi, roiSize, dstBorder, dtype, 5, 16); randomSubMat(dst, dst_roi, roiSize, dstBorder, dtype, 5, 16);

View File

@ -12,6 +12,7 @@
// //
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved. // Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Copyright (C) 2014, Itseez Inc., all rights reserved.
// Third party copyrights are property of their respective owners. // Third party copyrights are property of their respective owners.
/ /
// Redistribution and use in source and binary forms, with or without modification, // Redistribution and use in source and binary forms, with or without modification,

View File

@ -11,6 +11,7 @@
// For Open Source Computer Vision Library // For Open Source Computer Vision Library
// //
// Copyright (C) 2000, Intel Corporation, all rights reserved. // Copyright (C) 2000, Intel Corporation, all rights reserved.
// Copyright (C) 2014, Itseez Inc., all rights reserved.
// Third party copyrights are property of their respective owners. // Third party copyrights are property of their respective owners.
// //
// Redistribution and use in source and binary forms, with or without modification, // Redistribution and use in source and binary forms, with or without modification,

View File

@ -11,6 +11,7 @@
// For Open Source Computer Vision Library // For Open Source Computer Vision Library
// //
// Copyright (C) 2013, NVIDIA Corporation, all rights reserved. // Copyright (C) 2013, NVIDIA Corporation, all rights reserved.
// Copyright (C) 2014, Itseez Inc., all rights reserved.
// Third party copyrights are property of their respective owners. // Third party copyrights are property of their respective owners.
// //
// Redistribution and use in source and binary forms, with or without modification, // Redistribution and use in source and binary forms, with or without modification,

File diff suppressed because it is too large Load Diff

View File

@ -12,6 +12,7 @@
// //
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved. // Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Copyright (C) 2014-2015, Itseez Inc., all rights reserved.
// Third party copyrights are property of their respective owners. // Third party copyrights are property of their respective owners.
// //
// Redistribution and use in source and binary forms, with or without modification, // Redistribution and use in source and binary forms, with or without modification,
@ -270,6 +271,8 @@ cornerEigenValsVecs( const Mat& src, Mat& eigenv, int block_size,
#ifdef HAVE_TEGRA_OPTIMIZATION #ifdef HAVE_TEGRA_OPTIMIZATION
if (tegra::cornerEigenValsVecs(src, eigenv, block_size, aperture_size, op_type, k, borderType)) if (tegra::cornerEigenValsVecs(src, eigenv, block_size, aperture_size, op_type, k, borderType))
return; return;
#elif CV_SSE2
bool haveSSE2 = checkHardwareSupport(CV_CPU_SSE2);
#endif #endif
int depth = src.depth(); int depth = src.depth();
@ -318,6 +321,33 @@ cornerEigenValsVecs( const Mat& src, Mat& eigenv, int block_size,
vst3q_f32(cov_data + j * 3, v_dst); vst3q_f32(cov_data + j * 3, v_dst);
} }
#elif CV_SSE2
if (haveSSE2)
{
for( ; j <= size.width - 8; j += 8 )
{
__m128 v_dx_0 = _mm_loadu_ps(dxdata + j);
__m128 v_dx_1 = _mm_loadu_ps(dxdata + j + 4);
__m128 v_dy_0 = _mm_loadu_ps(dydata + j);
__m128 v_dy_1 = _mm_loadu_ps(dydata + j + 4);
__m128 v_dx2_0 = _mm_mul_ps(v_dx_0, v_dx_0);
__m128 v_dxy_0 = _mm_mul_ps(v_dx_0, v_dy_0);
__m128 v_dy2_0 = _mm_mul_ps(v_dy_0, v_dy_0);
__m128 v_dx2_1 = _mm_mul_ps(v_dx_1, v_dx_1);
__m128 v_dxy_1 = _mm_mul_ps(v_dx_1, v_dy_1);
__m128 v_dy2_1 = _mm_mul_ps(v_dy_1, v_dy_1);
_mm_interleave_ps(v_dx2_0, v_dx2_1, v_dxy_0, v_dxy_1, v_dy2_0, v_dy2_1);
_mm_storeu_ps(cov_data + j * 3, v_dx2_0);
_mm_storeu_ps(cov_data + j * 3 + 4, v_dx2_1);
_mm_storeu_ps(cov_data + j * 3 + 8, v_dxy_0);
_mm_storeu_ps(cov_data + j * 3 + 12, v_dxy_1);
_mm_storeu_ps(cov_data + j * 3 + 16, v_dy2_0);
_mm_storeu_ps(cov_data + j * 3 + 20, v_dy2_1);
}
}
#endif #endif
for( ; j < size.width; j++ ) for( ; j < size.width; j++ )

View File

@ -12,6 +12,7 @@
// //
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009-2010, Willow Garage Inc., all rights reserved. // Copyright (C) 2009-2010, Willow Garage Inc., all rights reserved.
// Copyright (C) 2014, Itseez Inc., all rights reserved.
// Third party copyrights are property of their respective owners. // Third party copyrights are property of their respective owners.
// //
// Redistribution and use in source and binary forms, with or without modification, // Redistribution and use in source and binary forms, with or without modification,

View File

@ -2284,15 +2284,20 @@ double cv::compareHist( InputArray _H1, InputArray _H2, int method )
CV_Assert( it.planes[0].isContinuous() && it.planes[1].isContinuous() ); CV_Assert( it.planes[0].isContinuous() && it.planes[1].isContinuous() );
#if CV_SSE2
bool haveSIMD = checkHardwareSupport(CV_CPU_SSE2);
#endif
for( size_t i = 0; i < it.nplanes; i++, ++it ) for( size_t i = 0; i < it.nplanes; i++, ++it )
{ {
const float* h1 = it.planes[0].ptr<float>(); const float* h1 = it.planes[0].ptr<float>();
const float* h2 = it.planes[1].ptr<float>(); const float* h2 = it.planes[1].ptr<float>();
len = it.planes[0].rows*it.planes[0].cols*H1.channels(); len = it.planes[0].rows*it.planes[0].cols*H1.channels();
j = 0;
if( (method == CV_COMP_CHISQR) || (method == CV_COMP_CHISQR_ALT)) if( (method == CV_COMP_CHISQR) || (method == CV_COMP_CHISQR_ALT))
{ {
for( j = 0; j < len; j++ ) for( ; j < len; j++ )
{ {
double a = h1[j] - h2[j]; double a = h1[j] - h2[j];
double b = (method == CV_COMP_CHISQR) ? h1[j] : h1[j] + h2[j]; double b = (method == CV_COMP_CHISQR) ? h1[j] : h1[j] + h2[j];
@ -2302,7 +2307,51 @@ double cv::compareHist( InputArray _H1, InputArray _H2, int method )
} }
else if( method == CV_COMP_CORREL ) else if( method == CV_COMP_CORREL )
{ {
for( j = 0; j < len; j++ ) #if CV_SSE2
if (haveSIMD)
{
__m128d v_s1 = _mm_setzero_pd(), v_s2 = v_s1;
__m128d v_s11 = v_s1, v_s22 = v_s1, v_s12 = v_s1;
for ( ; j <= len - 4; j += 4)
{
__m128 v_a = _mm_loadu_ps(h1 + j);
__m128 v_b = _mm_loadu_ps(h2 + j);
// 0-1
__m128d v_ad = _mm_cvtps_pd(v_a);
__m128d v_bd = _mm_cvtps_pd(v_b);
v_s12 = _mm_add_pd(v_s12, _mm_mul_pd(v_ad, v_bd));
v_s11 = _mm_add_pd(v_s11, _mm_mul_pd(v_ad, v_ad));
v_s22 = _mm_add_pd(v_s22, _mm_mul_pd(v_bd, v_bd));
v_s1 = _mm_add_pd(v_s1, v_ad);
v_s2 = _mm_add_pd(v_s2, v_bd);
// 2-3
v_ad = _mm_cvtps_pd(_mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(v_a), 8)));
v_bd = _mm_cvtps_pd(_mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(v_b), 8)));
v_s12 = _mm_add_pd(v_s12, _mm_mul_pd(v_ad, v_bd));
v_s11 = _mm_add_pd(v_s11, _mm_mul_pd(v_ad, v_ad));
v_s22 = _mm_add_pd(v_s22, _mm_mul_pd(v_bd, v_bd));
v_s1 = _mm_add_pd(v_s1, v_ad);
v_s2 = _mm_add_pd(v_s2, v_bd);
}
double CV_DECL_ALIGNED(16) ar[10];
_mm_store_pd(ar, v_s12);
_mm_store_pd(ar + 2, v_s11);
_mm_store_pd(ar + 4, v_s22);
_mm_store_pd(ar + 6, v_s1);
_mm_store_pd(ar + 8, v_s2);
s12 += ar[0] + ar[1];
s11 += ar[2] + ar[3];
s22 += ar[4] + ar[5];
s1 += ar[6] + ar[7];
s2 += ar[8] + ar[9];
}
#endif
for( ; j < len; j++ )
{ {
double a = h1[j]; double a = h1[j];
double b = h2[j]; double b = h2[j];
@ -2316,7 +2365,6 @@ double cv::compareHist( InputArray _H1, InputArray _H2, int method )
} }
else if( method == CV_COMP_INTERSECT ) else if( method == CV_COMP_INTERSECT )
{ {
j = 0;
#if CV_NEON #if CV_NEON
float32x4_t v_result = vdupq_n_f32(0.0f); float32x4_t v_result = vdupq_n_f32(0.0f);
for( ; j <= len - 4; j += 4 ) for( ; j <= len - 4; j += 4 )
@ -2324,13 +2372,61 @@ double cv::compareHist( InputArray _H1, InputArray _H2, int method )
float CV_DECL_ALIGNED(16) ar[4]; float CV_DECL_ALIGNED(16) ar[4];
vst1q_f32(ar, v_result); vst1q_f32(ar, v_result);
result += ar[0] + ar[1] + ar[2] + ar[3]; result += ar[0] + ar[1] + ar[2] + ar[3];
#elif CV_SSE2
if (haveSIMD)
{
__m128d v_result = _mm_setzero_pd();
for ( ; j <= len - 4; j += 4)
{
__m128 v_src = _mm_min_ps(_mm_loadu_ps(h1 + j),
_mm_loadu_ps(h2 + j));
v_result = _mm_add_pd(v_result, _mm_cvtps_pd(v_src));
v_src = _mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(v_src), 8));
v_result = _mm_add_pd(v_result, _mm_cvtps_pd(v_src));
}
double CV_DECL_ALIGNED(16) ar[2];
_mm_store_pd(ar, v_result);
result += ar[0] + ar[1];
}
#endif #endif
for( ; j < len; j++ ) for( ; j < len; j++ )
result += std::min(h1[j], h2[j]); result += std::min(h1[j], h2[j]);
} }
else if( method == CV_COMP_BHATTACHARYYA ) else if( method == CV_COMP_BHATTACHARYYA )
{ {
for( j = 0; j < len; j++ ) #if CV_SSE2
if (haveSIMD)
{
__m128d v_s1 = _mm_setzero_pd(), v_s2 = v_s1, v_result = v_s1;
for ( ; j <= len - 4; j += 4)
{
__m128 v_a = _mm_loadu_ps(h1 + j);
__m128 v_b = _mm_loadu_ps(h2 + j);
__m128d v_ad = _mm_cvtps_pd(v_a);
__m128d v_bd = _mm_cvtps_pd(v_b);
v_s1 = _mm_add_pd(v_s1, v_ad);
v_s2 = _mm_add_pd(v_s2, v_bd);
v_result = _mm_add_pd(v_result, _mm_sqrt_pd(_mm_mul_pd(v_ad, v_bd)));
v_ad = _mm_cvtps_pd(_mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(v_a), 8)));
v_bd = _mm_cvtps_pd(_mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(v_b), 8)));
v_s1 = _mm_add_pd(v_s1, v_ad);
v_s2 = _mm_add_pd(v_s2, v_bd);
v_result = _mm_add_pd(v_result, _mm_sqrt_pd(_mm_mul_pd(v_ad, v_bd)));
}
double CV_DECL_ALIGNED(16) ar[6];
_mm_store_pd(ar, v_s1);
_mm_store_pd(ar + 2, v_s2);
_mm_store_pd(ar + 4, v_result);
s1 += ar[0] + ar[1];
s2 += ar[2] + ar[3];
result += ar[4] + ar[5];
}
#endif
for( ; j < len; j++ )
{ {
double a = h1[j]; double a = h1[j];
double b = h2[j]; double b = h2[j];
@ -2341,7 +2437,7 @@ double cv::compareHist( InputArray _H1, InputArray _H2, int method )
} }
else if( method == CV_COMP_KL_DIV ) else if( method == CV_COMP_KL_DIV )
{ {
for( j = 0; j < len; j++ ) for( ; j < len; j++ )
{ {
double p = h1[j]; double p = h1[j];
double q = h2[j]; double q = h2[j];

View File

@ -12,6 +12,7 @@
// //
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved. // Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Copyright (C) 2014-2015, Itseez Inc., all rights reserved.
// Third party copyrights are property of their respective owners. // Third party copyrights are property of their respective owners.
// //
// Redistribution and use in source and binary forms, with or without modification, // Redistribution and use in source and binary forms, with or without modification,
@ -1962,9 +1963,9 @@ private:
struct ResizeAreaFastVec_SIMD_32f struct ResizeAreaFastVec_SIMD_32f
{ {
ResizeAreaFastVec_SIMD_32f(int _scale_x, int _scale_y, int _cn, int _step) : ResizeAreaFastVec_SIMD_32f(int _scale_x, int _scale_y, int _cn, int _step) :
scale_x(_scale_x), scale_y(_scale_y), cn(_cn), step(_step) cn(_cn), step(_step)
{ {
fast_mode = scale_x == 2 && scale_y == 2 && (cn == 1 || cn == 3 || cn == 4); fast_mode = _scale_x == 2 && _scale_y == 2 && (cn == 1 || cn == 4);
} }
int operator() (const float * S, float * D, int w) const int operator() (const float * S, float * D, int w) const
@ -2004,7 +2005,6 @@ struct ResizeAreaFastVec_SIMD_32f
} }
private: private:
int scale_x, scale_y;
int cn; int cn;
bool fast_mode; bool fast_mode;
int step; int step;
@ -2199,8 +2199,146 @@ private:
bool use_simd; bool use_simd;
}; };
typedef ResizeAreaFastNoVec<short, short> ResizeAreaFastVec_SIMD_16s; class ResizeAreaFastVec_SIMD_16s
typedef ResizeAreaFastNoVec<float, float> ResizeAreaFastVec_SIMD_32f; {
public:
// Stores the channel count and the row step (applied as a byte offset in
// operator()) and caches whether SSE2 is usable on this CPU.
ResizeAreaFastVec_SIMD_16s(int _cn, int _step) :
    cn(_cn), step(_step), use_simd(checkHardwareSupport(CV_CPU_SSE2))
{
}
// Averages 2x2 neighbourhoods of signed 16-bit data taken from row S and the row
// `step` bytes after it, writing rounded results ((a + b + c + d + 2) >> 2) to D.
// Returns the number of output values written (0 when SSE2 is unavailable) so the
// caller can finish the rest.
// NOTE(review): the loop stepping suggests w counts output shorts (width * cn) -
// confirm against the caller.
int operator() (const short* S, short* D, int w) const
{
    if (!use_simd)
        return 0;
    int dx = 0;
    const short* S0 = (const short*)S;
    // second source row: `step` is applied as a byte offset
    const short* S1 = (const short*)((const uchar*)(S) + step);
    __m128i masklow = _mm_set1_epi32(0x0000ffff);
    __m128i zero = _mm_setzero_si128();
    // rounding term added before the >> 2
    __m128i delta2 = _mm_set1_epi32(2);
    if (cn == 1)
    {
        // 8 source shorts per row -> 4 outputs
        for ( ; dx <= w - 4; dx += 4, S0 += 8, S1 += 8, D += 4)
        {
            __m128i r0 = _mm_loadu_si128((const __m128i*)S0);
            __m128i r1 = _mm_loadu_si128((const __m128i*)S1);
            // each 32-bit lane holds two neighbouring shorts: the arithmetic shift
            // yields the upper one sign-extended, the shift-left/shift-right pair
            // yields the lower one sign-extended; their sum is the horizontal pair sum
            __m128i s0 = _mm_add_epi32(_mm_srai_epi32(r0, 16),
                _mm_srai_epi32(_mm_slli_epi32(_mm_and_si128(r0, masklow), 16), 16));
            __m128i s1 = _mm_add_epi32(_mm_srai_epi32(r1, 16),
                _mm_srai_epi32(_mm_slli_epi32(_mm_and_si128(r1, masklow), 16), 16));
            s0 = _mm_add_epi32(_mm_add_epi32(s0, s1), delta2);
            s0 = _mm_srai_epi32(s0, 2);
            // saturating 32 -> 16 bit pack; only the low 4 shorts are stored
            s0 = _mm_packs_epi32(s0, zero);
            _mm_storel_epi64((__m128i*)D, s0);
        }
    }
    else if (cn == 3)
        // One output pixel (3 shorts) per iteration. The 8-byte store writes 4
        // shorts; the fourth is not a valid result and is overwritten by the next
        // iteration's store. The `dx <= w - 4` guard keeps every store within the
        // first w output shorts.
        for ( ; dx <= w - 4; dx += 3, S0 += 6, S1 += 6, D += 3)
        {
            __m128i r0 = _mm_loadu_si128((const __m128i*)S0);
            __m128i r1 = _mm_loadu_si128((const __m128i*)S1);
            // *_16l: shorts 0..3 sign-extended; *_16h: shorts 3..6 sign-extended
            // (the register is shifted right by 6 bytes = 3 shorts first)
            __m128i r0_16l = _mm_srai_epi32(_mm_unpacklo_epi16(zero, r0), 16);
            __m128i r0_16h = _mm_srai_epi32(_mm_unpacklo_epi16(zero, _mm_srli_si128(r0, 6)), 16);
            __m128i r1_16l = _mm_srai_epi32(_mm_unpacklo_epi16(zero, r1), 16);
            __m128i r1_16h = _mm_srai_epi32(_mm_unpacklo_epi16(zero, _mm_srli_si128(r1, 6)), 16);
            __m128i s0 = _mm_add_epi32(r0_16l, r0_16h);
            __m128i s1 = _mm_add_epi32(r1_16l, r1_16h);
            s0 = _mm_add_epi32(delta2, _mm_add_epi32(s0, s1));
            s0 = _mm_packs_epi32(_mm_srai_epi32(s0, 2), zero);
            _mm_storel_epi64((__m128i*)D, s0);
        }
    else
    {
        CV_Assert(cn == 4);
        // 8 source shorts per row (two 4-short groups) -> 4 outputs
        for ( ; dx <= w - 4; dx += 4, S0 += 8, S1 += 8, D += 4)
        {
            __m128i r0 = _mm_loadu_si128((const __m128i*)S0);
            __m128i r1 = _mm_loadu_si128((const __m128i*)S1);
            // *_32l: shorts 0..3 sign-extended; *_32h: shorts 4..7 sign-extended
            __m128i r0_32l = _mm_srai_epi32(_mm_unpacklo_epi16(zero, r0), 16);
            __m128i r0_32h = _mm_srai_epi32(_mm_unpackhi_epi16(zero, r0), 16);
            __m128i r1_32l = _mm_srai_epi32(_mm_unpacklo_epi16(zero, r1), 16);
            __m128i r1_32h = _mm_srai_epi32(_mm_unpackhi_epi16(zero, r1), 16);
            __m128i s0 = _mm_add_epi32(r0_32l, r0_32h);
            __m128i s1 = _mm_add_epi32(r1_32l, r1_32h);
            s0 = _mm_add_epi32(s1, _mm_add_epi32(s0, delta2));
            s0 = _mm_packs_epi32(_mm_srai_epi32(s0, 2), zero);
            _mm_storel_epi64((__m128i*)D, s0);
        }
    }
    return dx;
}
private:
int cn;
int step;
bool use_simd;
};
// SSE2 helper that averages 2x2 neighbourhoods of float data.
// It is active only for a 2x2 scale with 1 or 4 channels on a CPU with SSE2;
// otherwise operator() returns 0 and touches nothing.
struct ResizeAreaFastVec_SIMD_32f
{
    ResizeAreaFastVec_SIMD_32f(int _scale_x, int _scale_y, int _cn, int _step) :
        cn(_cn), step(_step)
    {
        const bool is2x2 = (_scale_x == 2) && (_scale_y == 2);
        const bool cnSupported = (cn == 1) || (cn == 4);
        fast_mode = is2x2 && cnSupported && checkHardwareSupport(CV_CPU_SSE2);
    }

    // S: first source row (the second row lies `step` bytes further on).
    // D: destination row. Returns the number of output floats written.
    int operator() (const float * S, float * D, int w) const
    {
        if (!fast_mode)
            return 0;

        const float * row0 = S;
        const float * row1 = (const float *)((const uchar *)(row0) + step);
        const __m128 v_quarter = _mm_set1_ps(0.25f);
        int dx = 0;

        if (cn == 1)
        {
            // 8 source floats per row -> 4 outputs
            for ( ; dx <= w - 4; dx += 4, row0 += 8, row1 += 8, D += 4)
            {
                const __m128 v_a0 = _mm_loadu_ps(row0);
                const __m128 v_a1 = _mm_loadu_ps(row0 + 4);
                const __m128 v_b0 = _mm_loadu_ps(row1);
                const __m128 v_b1 = _mm_loadu_ps(row1 + 4);

                // even-indexed elements + odd-indexed elements = horizontal pair sums
                const __m128 v_top = _mm_add_ps(_mm_shuffle_ps(v_a0, v_a1, _MM_SHUFFLE(2, 0, 2, 0)),
                                                _mm_shuffle_ps(v_a0, v_a1, _MM_SHUFFLE(3, 1, 3, 1)));
                const __m128 v_bottom = _mm_add_ps(_mm_shuffle_ps(v_b0, v_b1, _MM_SHUFFLE(2, 0, 2, 0)),
                                                   _mm_shuffle_ps(v_b0, v_b1, _MM_SHUFFLE(3, 1, 3, 1)));

                _mm_storeu_ps(D, _mm_mul_ps(_mm_add_ps(v_top, v_bottom), v_quarter));
            }
        }
        else if (cn == 4)
        {
            // two 4-float groups per row -> one 4-float output
            for ( ; dx <= w - 4; dx += 4, row0 += 8, row1 += 8, D += 4)
            {
                const __m128 v_top = _mm_add_ps(_mm_loadu_ps(row0), _mm_loadu_ps(row0 + 4));
                const __m128 v_bottom = _mm_add_ps(_mm_loadu_ps(row1), _mm_loadu_ps(row1 + 4));

                _mm_storeu_ps(D, _mm_mul_ps(_mm_add_ps(v_top, v_bottom), v_quarter));
            }
        }

        return dx;
    }

private:
    int cn;
    bool fast_mode;
    int step;
};
#else #else
@ -4678,6 +4816,13 @@ void cv::convertMaps( InputArray _map1, InputArray _map2,
size.height = 1; size.height = 1;
} }
#if CV_SSE2
bool useSSE2 = checkHardwareSupport(CV_CPU_SSE2);
#endif
#if CV_SSE4_1
bool useSSE4_1 = checkHardwareSupport(CV_CPU_SSE4_1);
#endif
const float scale = 1.f/INTER_TAB_SIZE; const float scale = 1.f/INTER_TAB_SIZE;
int x, y; int x, y;
for( y = 0; y < size.height; y++ ) for( y = 0; y < size.height; y++ )
@ -4708,6 +4853,29 @@ void cv::convertMaps( InputArray _map1, InputArray _map2,
vst2q_s16(dst1 + (x << 1), v_dst); vst2q_s16(dst1 + (x << 1), v_dst);
} }
#elif CV_SSE4_1
if (useSSE4_1)
{
for( ; x <= size.width - 16; x += 16 )
{
__m128i v_dst0 = _mm_packs_epi32(_mm_cvtps_epi32(_mm_loadu_ps(src1f + x)),
_mm_cvtps_epi32(_mm_loadu_ps(src1f + x + 4)));
__m128i v_dst1 = _mm_packs_epi32(_mm_cvtps_epi32(_mm_loadu_ps(src1f + x + 8)),
_mm_cvtps_epi32(_mm_loadu_ps(src1f + x + 12)));
__m128i v_dst2 = _mm_packs_epi32(_mm_cvtps_epi32(_mm_loadu_ps(src2f + x)),
_mm_cvtps_epi32(_mm_loadu_ps(src2f + x + 4)));
__m128i v_dst3 = _mm_packs_epi32(_mm_cvtps_epi32(_mm_loadu_ps(src2f + x + 8)),
_mm_cvtps_epi32(_mm_loadu_ps(src2f + x + 12)));
_mm_interleave_epi16(v_dst0, v_dst1, v_dst2, v_dst3);
_mm_storeu_si128((__m128i *)(dst1 + x * 2), v_dst0);
_mm_storeu_si128((__m128i *)(dst1 + x * 2 + 8), v_dst1);
_mm_storeu_si128((__m128i *)(dst1 + x * 2 + 16), v_dst2);
_mm_storeu_si128((__m128i *)(dst1 + x * 2 + 24), v_dst3);
}
}
#endif #endif
for( ; x < size.width; x++ ) for( ; x < size.width; x++ )
{ {
@ -4742,6 +4910,52 @@ void cv::convertMaps( InputArray _map1, InputArray _map2,
vandq_s32(v_ix1, v_mask))); vandq_s32(v_ix1, v_mask)));
vst1q_u16(dst2 + x, vcombine_u16(v_dst0, v_dst1)); vst1q_u16(dst2 + x, vcombine_u16(v_dst0, v_dst1));
} }
#elif CV_SSE4_1
if (useSSE4_1)
{
__m128 v_its = _mm_set1_ps(INTER_TAB_SIZE);
__m128i v_its1 = _mm_set1_epi32(INTER_TAB_SIZE-1);
for( ; x <= size.width - 16; x += 16 )
{
__m128i v_ix0 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(src1f + x), v_its));
__m128i v_ix1 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(src1f + x + 4), v_its));
__m128i v_iy0 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(src2f + x), v_its));
__m128i v_iy1 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(src2f + x + 4), v_its));
__m128i v_dst10 = _mm_packs_epi32(_mm_srai_epi32(v_ix0, INTER_BITS),
_mm_srai_epi32(v_ix1, INTER_BITS));
__m128i v_dst12 = _mm_packs_epi32(_mm_srai_epi32(v_iy0, INTER_BITS),
_mm_srai_epi32(v_iy1, INTER_BITS));
__m128i v_dst20 = _mm_add_epi32(_mm_slli_epi32(_mm_and_si128(v_iy0, v_its1), INTER_BITS),
_mm_and_si128(v_ix0, v_its1));
__m128i v_dst21 = _mm_add_epi32(_mm_slli_epi32(_mm_and_si128(v_iy1, v_its1), INTER_BITS),
_mm_and_si128(v_ix1, v_its1));
_mm_storeu_si128((__m128i *)(dst2 + x), _mm_packus_epi32(v_dst20, v_dst21));
v_ix0 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(src1f + x + 8), v_its));
v_ix1 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(src1f + x + 12), v_its));
v_iy0 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(src2f + x + 8), v_its));
v_iy1 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(src2f + x + 12), v_its));
__m128i v_dst11 = _mm_packs_epi32(_mm_srai_epi32(v_ix0, INTER_BITS),
_mm_srai_epi32(v_ix1, INTER_BITS));
__m128i v_dst13 = _mm_packs_epi32(_mm_srai_epi32(v_iy0, INTER_BITS),
_mm_srai_epi32(v_iy1, INTER_BITS));
v_dst20 = _mm_add_epi32(_mm_slli_epi32(_mm_and_si128(v_iy0, v_its1), INTER_BITS),
_mm_and_si128(v_ix0, v_its1));
v_dst21 = _mm_add_epi32(_mm_slli_epi32(_mm_and_si128(v_iy1, v_its1), INTER_BITS),
_mm_and_si128(v_ix1, v_its1));
_mm_storeu_si128((__m128i *)(dst2 + x + 8), _mm_packus_epi32(v_dst20, v_dst21));
_mm_interleave_epi16(v_dst10, v_dst11, v_dst12, v_dst13);
_mm_storeu_si128((__m128i *)(dst1 + x * 2), v_dst10);
_mm_storeu_si128((__m128i *)(dst1 + x * 2 + 8), v_dst11);
_mm_storeu_si128((__m128i *)(dst1 + x * 2 + 16), v_dst12);
_mm_storeu_si128((__m128i *)(dst1 + x * 2 + 24), v_dst13);
}
}
#endif #endif
for( ; x < size.width; x++ ) for( ; x < size.width; x++ )
{ {
@ -4761,6 +4975,12 @@ void cv::convertMaps( InputArray _map1, InputArray _map2,
for( ; x <= (size.width << 1) - 8; x += 8 ) for( ; x <= (size.width << 1) - 8; x += 8 )
vst1q_s16(dst1 + x, vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(vld1q_f32(src1f + x))), vst1q_s16(dst1 + x, vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(vld1q_f32(src1f + x))),
vqmovn_s32(cv_vrndq_s32_f32(vld1q_f32(src1f + x + 4))))); vqmovn_s32(cv_vrndq_s32_f32(vld1q_f32(src1f + x + 4)))));
#elif CV_SSE2
// Round four (x, y) float pairs per iteration to saturated 16-bit integers.
// x is advanced in points (not in floats) so that the scalar tail loop, which
// is bounded by size.width, resumes exactly where this loop stopped. Counting
// floats up to 2 * size.width would push x past size.width and the remaining
// elements would never be converted.
for( ; x <= size.width - 4; x += 4 )
{
    _mm_storeu_si128((__m128i *)(dst1 + x * 2), _mm_packs_epi32(_mm_cvtps_epi32(_mm_loadu_ps(src1f + x * 2)),
                                                               _mm_cvtps_epi32(_mm_loadu_ps(src1f + x * 2 + 4))));
}
#endif #endif
for( ; x < size.width; x++ ) for( ; x < size.width; x++ )
{ {
@ -4796,6 +5016,30 @@ void cv::convertMaps( InputArray _map1, InputArray _map2,
vandq_s32(v_ix1, v_mask))); vandq_s32(v_ix1, v_mask)));
vst1q_u16(dst2 + x, vcombine_u16(v_dst0, v_dst1)); vst1q_u16(dst2 + x, vcombine_u16(v_dst0, v_dst1));
} }
#elif CV_SSE4_1
if (useSSE4_1)
{
__m128 v_its = _mm_set1_ps(INTER_TAB_SIZE);
__m128i v_its1 = _mm_set1_epi32(INTER_TAB_SIZE-1);
__m128i v_y_mask = _mm_set1_epi32((INTER_TAB_SIZE-1) << 16);
for( ; x <= size.width - 4; x += 4 )
{
__m128i v_src0 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(src1f + x * 2), v_its));
__m128i v_src1 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(src1f + x * 2 + 4), v_its));
__m128i v_dst1 = _mm_packs_epi32(_mm_srai_epi32(v_src0, INTER_BITS),
_mm_srai_epi32(v_src1, INTER_BITS));
_mm_storeu_si128((__m128i *)(dst1 + x * 2), v_dst1);
// x0 y0 x1 y1 . . .
v_src0 = _mm_packs_epi32(_mm_and_si128(v_src0, v_its1),
_mm_and_si128(v_src1, v_its1));
__m128i v_dst2 = _mm_or_si128(_mm_srli_epi32(_mm_and_si128(v_src0, v_y_mask), 16 - INTER_BITS), // y0 0 y1 0 . . .
_mm_and_si128(v_src0, v_its1)); // 0 x0 0 x1 . . .
_mm_storel_epi64((__m128i *)(dst2 + x), _mm_packus_epi32(v_dst2, v_dst2));
}
}
#endif #endif
for( ; x < size.width; x++ ) for( ; x < size.width; x++ )
{ {
@ -4841,6 +5085,44 @@ void cv::convertMaps( InputArray _map1, InputArray _map2,
vst1q_f32(dst1f + x + 4, v_dst1); vst1q_f32(dst1f + x + 4, v_dst1);
vst1q_f32(dst2f + x + 4, v_dst2); vst1q_f32(dst2f + x + 4, v_dst2);
} }
#elif CV_SSE2
// Converts 16 fixed-point map entries per iteration into two float planes:
//   dst1f[i] = src1[2*i]     + scale * (fxy & (INTER_TAB_SIZE-1))
//   dst2f[i] = src1[2*i + 1] + scale * (fxy >> INTER_BITS)
// where fxy = src2[i] & (INTER_TAB_SIZE2-1), or 0 when src2 is null.
__m128i v_mask2 = _mm_set1_epi16(INTER_TAB_SIZE2-1);
__m128i v_zero = _mm_setzero_si128(), v_mask = _mm_set1_epi32(INTER_TAB_SIZE-1);
__m128 v_scale = _mm_set1_ps(scale);

for( ; x <= size.width - 16; x += 16)
{
    // 16 interleaved 16-bit pairs; after the deinterleave v_src10/v_src11 hold the
    // first components and v_src20/v_src21 the second components
    __m128i v_src10 = _mm_loadu_si128((__m128i const *)(src1 + x * 2));
    __m128i v_src11 = _mm_loadu_si128((__m128i const *)(src1 + x * 2 + 8));
    __m128i v_src20 = _mm_loadu_si128((__m128i const *)(src1 + x * 2 + 16));
    __m128i v_src21 = _mm_loadu_si128((__m128i const *)(src1 + x * 2 + 24));

    _mm_deinterleave_epi16(v_src10, v_src11, v_src20, v_src21);

    // points x .. x+3 (low half of the first eight fractional entries);
    // unpack(v_zero, v) followed by an arithmetic shift sign-extends the 16-bit lanes
    __m128i v_fxy = src2 ? _mm_and_si128(_mm_loadu_si128((__m128i const *)(src2 + x)), v_mask2) : v_zero;
    __m128i v_fxy_p = _mm_unpacklo_epi16(v_fxy, v_zero);
    _mm_storeu_ps(dst1f + x, _mm_add_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src10), 16)),
                                        _mm_mul_ps(v_scale, _mm_cvtepi32_ps(_mm_and_si128(v_fxy_p, v_mask)))));
    _mm_storeu_ps(dst2f + x, _mm_add_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src20), 16)),
                                        _mm_mul_ps(v_scale, _mm_cvtepi32_ps(_mm_srli_epi32(v_fxy_p, INTER_BITS)))));

    // points x+4 .. x+7 (high half)
    v_fxy_p = _mm_unpackhi_epi16(v_fxy, v_zero);
    _mm_storeu_ps(dst1f + x + 4, _mm_add_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src10), 16)),
                                            _mm_mul_ps(v_scale, _mm_cvtepi32_ps(_mm_and_si128(v_fxy_p, v_mask)))));
    _mm_storeu_ps(dst2f + x + 4, _mm_add_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src20), 16)),
                                            _mm_mul_ps(v_scale, _mm_cvtepi32_ps(_mm_srli_epi32(v_fxy_p, INTER_BITS)))));

    // points x+8 .. x+11: these pair with the LOW half of the second group of
    // fractional entries (previously the high half was used here by mistake,
    // giving these points the fractions of points x+12 .. x+15)
    v_fxy = src2 ? _mm_and_si128(_mm_loadu_si128((__m128i const *)(src2 + x + 8)), v_mask2) : v_zero;
    v_fxy_p = _mm_unpacklo_epi16(v_fxy, v_zero);
    _mm_storeu_ps(dst1f + x + 8, _mm_add_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src11), 16)),
                                            _mm_mul_ps(v_scale, _mm_cvtepi32_ps(_mm_and_si128(v_fxy_p, v_mask)))));
    _mm_storeu_ps(dst2f + x + 8, _mm_add_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src21), 16)),
                                            _mm_mul_ps(v_scale, _mm_cvtepi32_ps(_mm_srli_epi32(v_fxy_p, INTER_BITS)))));

    // points x+12 .. x+15 (high half)
    v_fxy_p = _mm_unpackhi_epi16(v_fxy, v_zero);
    _mm_storeu_ps(dst1f + x + 12, _mm_add_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src11), 16)),
                                             _mm_mul_ps(v_scale, _mm_cvtepi32_ps(_mm_and_si128(v_fxy_p, v_mask)))));
    _mm_storeu_ps(dst2f + x + 12, _mm_add_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src21), 16)),
                                             _mm_mul_ps(v_scale, _mm_cvtepi32_ps(_mm_srli_epi32(v_fxy_p, INTER_BITS)))));
}
#endif #endif
for( ; x < size.width; x++ ) for( ; x < size.width; x++ )
{ {
@ -4882,6 +5164,27 @@ void cv::convertMaps( InputArray _map1, InputArray _map2,
v_scale, vcvtq_f32_s32(vshrq_n_s32(v_fxy2, INTER_BITS))); v_scale, vcvtq_f32_s32(vshrq_n_s32(v_fxy2, INTER_BITS)));
vst2q_f32(dst1f + (x << 1) + 8, v_dst); vst2q_f32(dst1f + (x << 1) + 8, v_dst);
} }
#elif CV_SSE2
if (useSSE2)
{
    // Converts 8 fixed-point map entries per iteration into interleaved floats:
    //   dst1f[2*i]     = src1[2*i]     + scale * (fxy & (INTER_TAB_SIZE-1))
    //   dst1f[2*i + 1] = src1[2*i + 1] + scale * (fxy >> INTER_BITS)
    // where fxy = src2[i] & (INTER_TAB_SIZE2-1), or 0 when src2 is null.
    __m128i v_mask2 = _mm_set1_epi16(INTER_TAB_SIZE2-1);
    // the low-bits mask must cover every 16-bit lane, not only the even ones
    __m128i v_zero = _mm_setzero_si128(), v_mask = _mm_set1_epi16(INTER_TAB_SIZE-1);
    __m128 v_scale = _mm_set1_ps(scale);

    for ( ; x <= size.width - 8; x += 8)
    {
        // 8 points = 16 shorts of integer coordinates, in two registers
        __m128i v_src0 = _mm_loadu_si128((__m128i const *)(src1 + x * 2));
        __m128i v_src1 = _mm_loadu_si128((__m128i const *)(src1 + x * 2 + 8));

        __m128i v_fxy = src2 ? _mm_and_si128(_mm_loadu_si128((__m128i const *)(src2 + x)), v_mask2) : v_zero;
        __m128i v_fx = _mm_and_si128(v_fxy, v_mask);
        __m128i v_fy = _mm_srli_epi16(v_fxy, INTER_BITS);

        // interleave the two fractions so they line up with the coordinate pairs
        __m128i v_f0 = _mm_unpacklo_epi16(v_fx, v_fy); // points x   .. x+3
        __m128i v_f1 = _mm_unpackhi_epi16(v_fx, v_fy); // points x+4 .. x+7

        // Fractions are non-negative, so they are widened with zeros; coordinates
        // are signed, so they are widened by unpack(v_zero, v) + arithmetic shift.
        __m128 v_add = _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v_f0, v_zero)), v_scale);
        _mm_storeu_ps(dst1f + x * 2,
                      _mm_add_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src0), 16)), v_add));

        v_add = _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v_f0, v_zero)), v_scale);
        _mm_storeu_ps(dst1f + x * 2 + 4,
                      _mm_add_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src0), 16)), v_add));

        v_add = _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v_f1, v_zero)), v_scale);
        _mm_storeu_ps(dst1f + x * 2 + 8,
                      _mm_add_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src1), 16)), v_add));

        v_add = _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v_f1, v_zero)), v_scale);
        _mm_storeu_ps(dst1f + x * 2 + 12,
                      _mm_add_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src1), 16)), v_add));
    }
}
#endif #endif
for( ; x < size.width; x++ ) for( ; x < size.width; x++ )
{ {
@ -4919,7 +5222,10 @@ public:
const int AB_SCALE = 1 << AB_BITS; const int AB_SCALE = 1 << AB_BITS;
int round_delta = interpolation == INTER_NEAREST ? AB_SCALE/2 : AB_SCALE/INTER_TAB_SIZE/2, x, y, x1, y1; int round_delta = interpolation == INTER_NEAREST ? AB_SCALE/2 : AB_SCALE/INTER_TAB_SIZE/2, x, y, x1, y1;
#if CV_SSE2 #if CV_SSE2
bool useSIMD = checkHardwareSupport(CV_CPU_SSE2); bool useSSE2 = checkHardwareSupport(CV_CPU_SSE2);
#endif
#if CV_SSE4_1
bool useSSE4_1 = checkHardwareSupport(CV_CPU_SSE4_1);
#endif #endif
int bh0 = std::min(BLOCK_SZ/2, dst.rows); int bh0 = std::min(BLOCK_SZ/2, dst.rows);
@ -4957,6 +5263,31 @@ public:
vst2q_s16(xy + (x1 << 1), v_dst); vst2q_s16(xy + (x1 << 1), v_dst);
} }
#elif CV_SSE4_1
if (useSSE4_1)
{
__m128i v_X0 = _mm_set1_epi32(X0);
__m128i v_Y0 = _mm_set1_epi32(Y0);
for ( ; x1 <= bw - 16; x1 += 16)
{
__m128i v_x0 = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(v_X0, _mm_loadu_si128((__m128i const *)(adelta + x + x1))), AB_BITS),
_mm_srai_epi32(_mm_add_epi32(v_X0, _mm_loadu_si128((__m128i const *)(adelta + x + x1 + 4))), AB_BITS));
__m128i v_x1 = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(v_X0, _mm_loadu_si128((__m128i const *)(adelta + x + x1 + 8))), AB_BITS),
_mm_srai_epi32(_mm_add_epi32(v_X0, _mm_loadu_si128((__m128i const *)(adelta + x + x1 + 12))), AB_BITS));
__m128i v_y0 = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(v_Y0, _mm_loadu_si128((__m128i const *)(bdelta + x + x1))), AB_BITS),
_mm_srai_epi32(_mm_add_epi32(v_Y0, _mm_loadu_si128((__m128i const *)(bdelta + x + x1 + 4))), AB_BITS));
__m128i v_y1 = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(v_Y0, _mm_loadu_si128((__m128i const *)(bdelta + x + x1 + 8))), AB_BITS),
_mm_srai_epi32(_mm_add_epi32(v_Y0, _mm_loadu_si128((__m128i const *)(bdelta + x + x1 + 12))), AB_BITS));
_mm_interleave_epi16(v_x0, v_x1, v_y0, v_y1);
_mm_storeu_si128((__m128i *)(xy + x1 * 2), v_x0);
_mm_storeu_si128((__m128i *)(xy + x1 * 2 + 8), v_x1);
_mm_storeu_si128((__m128i *)(xy + x1 * 2 + 16), v_y0);
_mm_storeu_si128((__m128i *)(xy + x1 * 2 + 24), v_y1);
}
}
#endif #endif
for( ; x1 < bw; x1++ ) for( ; x1 < bw; x1++ )
{ {
@ -4971,7 +5302,7 @@ public:
short* alpha = A + y1*bw; short* alpha = A + y1*bw;
x1 = 0; x1 = 0;
#if CV_SSE2 #if CV_SSE2
if( useSIMD ) if( useSSE2 )
{ {
__m128i fxy_mask = _mm_set1_epi32(INTER_TAB_SIZE - 1); __m128i fxy_mask = _mm_set1_epi32(INTER_TAB_SIZE - 1);
__m128i XX = _mm_set1_epi32(X0), YY = _mm_set1_epi32(Y0); __m128i XX = _mm_set1_epi32(X0), YY = _mm_set1_epi32(Y0);
@ -5364,6 +5695,20 @@ public:
int bw0 = std::min(BLOCK_SZ*BLOCK_SZ/bh0, width); int bw0 = std::min(BLOCK_SZ*BLOCK_SZ/bh0, width);
bh0 = std::min(BLOCK_SZ*BLOCK_SZ/bw0, height); bh0 = std::min(BLOCK_SZ*BLOCK_SZ/bw0, height);
#if CV_SSE4_1
bool haveSSE4_1 = checkHardwareSupport(CV_CPU_SSE4_1);
__m128d v_M0 = _mm_set1_pd(M[0]);
__m128d v_M3 = _mm_set1_pd(M[3]);
__m128d v_M6 = _mm_set1_pd(M[6]);
__m128d v_intmax = _mm_set1_pd((double)INT_MAX);
__m128d v_intmin = _mm_set1_pd((double)INT_MIN);
__m128d v_2 = _mm_set1_pd(2),
v_zero = _mm_setzero_pd(),
v_1 = _mm_set1_pd(1),
v_its = _mm_set1_pd(INTER_TAB_SIZE);
__m128i v_itsi1 = _mm_set1_epi32(INTER_TAB_SIZE - 1);
#endif
for( y = range.start; y < range.end; y += bh0 ) for( y = range.start; y < range.end; y += bh0 )
{ {
for( x = 0; x < width; x += bw0 ) for( x = 0; x < width; x += bw0 )
@ -5382,7 +5727,120 @@ public:
double W0 = M[6]*x + M[7]*(y + y1) + M[8]; double W0 = M[6]*x + M[7]*(y + y1) + M[8];
if( interpolation == INTER_NEAREST ) if( interpolation == INTER_NEAREST )
for( x1 = 0; x1 < bw; x1++ ) {
x1 = 0;
#if CV_SSE4_1
if (haveSSE4_1)
{
__m128d v_X0d = _mm_set1_pd(X0);
__m128d v_Y0d = _mm_set1_pd(Y0);
__m128d v_W0 = _mm_set1_pd(W0);
__m128d v_x1 = _mm_set_pd(1, 0);
for( ; x1 <= bw - 16; x1 += 16 )
{
// 0-3
__m128i v_X0, v_Y0;
{
__m128d v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0);
v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_1, v_W));
__m128d v_fX0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W)));
__m128d v_fY0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W)));
v_x1 = _mm_add_pd(v_x1, v_2);
v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0);
v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_1, v_W));
__m128d v_fX1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W)));
__m128d v_fY1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W)));
v_x1 = _mm_add_pd(v_x1, v_2);
v_X0 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fX0)),
_mm_castsi128_ps(_mm_cvtpd_epi32(v_fX1))));
v_Y0 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fY0)),
_mm_castsi128_ps(_mm_cvtpd_epi32(v_fY1))));
}
// 4-7
__m128i v_X1, v_Y1;
{
__m128d v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0);
v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_1, v_W));
__m128d v_fX0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W)));
__m128d v_fY0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W)));
v_x1 = _mm_add_pd(v_x1, v_2);
v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0);
v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_1, v_W));
__m128d v_fX1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W)));
__m128d v_fY1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W)));
v_x1 = _mm_add_pd(v_x1, v_2);
v_X1 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fX0)),
_mm_castsi128_ps(_mm_cvtpd_epi32(v_fX1))));
v_Y1 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fY0)),
_mm_castsi128_ps(_mm_cvtpd_epi32(v_fY1))));
}
// 8-11
__m128i v_X2, v_Y2;
{
__m128d v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0);
v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_1, v_W));
__m128d v_fX0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W)));
__m128d v_fY0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W)));
v_x1 = _mm_add_pd(v_x1, v_2);
v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0);
v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_1, v_W));
__m128d v_fX1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W)));
__m128d v_fY1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W)));
v_x1 = _mm_add_pd(v_x1, v_2);
v_X2 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fX0)),
_mm_castsi128_ps(_mm_cvtpd_epi32(v_fX1))));
v_Y2 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fY0)),
_mm_castsi128_ps(_mm_cvtpd_epi32(v_fY1))));
}
// 12-15
__m128i v_X3, v_Y3;
{
__m128d v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0);
v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_1, v_W));
__m128d v_fX0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W)));
__m128d v_fY0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W)));
v_x1 = _mm_add_pd(v_x1, v_2);
v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0);
v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_1, v_W));
__m128d v_fX1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W)));
__m128d v_fY1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W)));
v_x1 = _mm_add_pd(v_x1, v_2);
v_X3 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fX0)),
_mm_castsi128_ps(_mm_cvtpd_epi32(v_fX1))));
v_Y3 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fY0)),
_mm_castsi128_ps(_mm_cvtpd_epi32(v_fY1))));
}
// convert to 16s
v_X0 = _mm_packs_epi32(v_X0, v_X1);
v_X1 = _mm_packs_epi32(v_X2, v_X3);
v_Y0 = _mm_packs_epi32(v_Y0, v_Y1);
v_Y1 = _mm_packs_epi32(v_Y2, v_Y3);
_mm_interleave_epi16(v_X0, v_X1, v_Y0, v_Y1);
_mm_storeu_si128((__m128i *)(xy + x1 * 2), v_X0);
_mm_storeu_si128((__m128i *)(xy + x1 * 2 + 8), v_X1);
_mm_storeu_si128((__m128i *)(xy + x1 * 2 + 16), v_Y0);
_mm_storeu_si128((__m128i *)(xy + x1 * 2 + 24), v_Y1);
}
}
#endif
for( ; x1 < bw; x1++ )
{ {
double W = W0 + M[6]*x1; double W = W0 + M[6]*x1;
W = W ? 1./W : 0; W = W ? 1./W : 0;
@ -5394,10 +5852,136 @@ public:
xy[x1*2] = saturate_cast<short>(X); xy[x1*2] = saturate_cast<short>(X);
xy[x1*2+1] = saturate_cast<short>(Y); xy[x1*2+1] = saturate_cast<short>(Y);
} }
}
else else
{ {
short* alpha = A + y1*bw; short* alpha = A + y1*bw;
for( x1 = 0; x1 < bw; x1++ ) x1 = 0;
#if CV_SSE4_1
if (haveSSE4_1)
{
__m128d v_X0d = _mm_set1_pd(X0);
__m128d v_Y0d = _mm_set1_pd(Y0);
__m128d v_W0 = _mm_set1_pd(W0);
__m128d v_x1 = _mm_set_pd(1, 0);
for( ; x1 <= bw - 16; x1 += 16 )
{
// 0-3
__m128i v_X0, v_Y0;
{
__m128d v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0);
v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_its, v_W));
__m128d v_fX0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W)));
__m128d v_fY0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W)));
v_x1 = _mm_add_pd(v_x1, v_2);
v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0);
v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_its, v_W));
__m128d v_fX1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W)));
__m128d v_fY1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W)));
v_x1 = _mm_add_pd(v_x1, v_2);
v_X0 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fX0)),
_mm_castsi128_ps(_mm_cvtpd_epi32(v_fX1))));
v_Y0 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fY0)),
_mm_castsi128_ps(_mm_cvtpd_epi32(v_fY1))));
}
// 4-7
__m128i v_X1, v_Y1;
{
__m128d v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0);
v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_its, v_W));
__m128d v_fX0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W)));
__m128d v_fY0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W)));
v_x1 = _mm_add_pd(v_x1, v_2);
v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0);
v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_its, v_W));
__m128d v_fX1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W)));
__m128d v_fY1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W)));
v_x1 = _mm_add_pd(v_x1, v_2);
v_X1 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fX0)),
_mm_castsi128_ps(_mm_cvtpd_epi32(v_fX1))));
v_Y1 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fY0)),
_mm_castsi128_ps(_mm_cvtpd_epi32(v_fY1))));
}
// 8-11
__m128i v_X2, v_Y2;
{
__m128d v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0);
v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_its, v_W));
__m128d v_fX0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W)));
__m128d v_fY0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W)));
v_x1 = _mm_add_pd(v_x1, v_2);
v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0);
v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_its, v_W));
__m128d v_fX1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W)));
__m128d v_fY1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W)));
v_x1 = _mm_add_pd(v_x1, v_2);
v_X2 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fX0)),
_mm_castsi128_ps(_mm_cvtpd_epi32(v_fX1))));
v_Y2 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fY0)),
_mm_castsi128_ps(_mm_cvtpd_epi32(v_fY1))));
}
// 12-15
__m128i v_X3, v_Y3;
{
__m128d v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0);
v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_its, v_W));
__m128d v_fX0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W)));
__m128d v_fY0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W)));
v_x1 = _mm_add_pd(v_x1, v_2);
v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0);
v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_its, v_W));
__m128d v_fX1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W)));
__m128d v_fY1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W)));
v_x1 = _mm_add_pd(v_x1, v_2);
v_X3 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fX0)),
_mm_castsi128_ps(_mm_cvtpd_epi32(v_fX1))));
v_Y3 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fY0)),
_mm_castsi128_ps(_mm_cvtpd_epi32(v_fY1))));
}
// store alpha
__m128i v_alpha0 = _mm_add_epi32(_mm_slli_epi32(_mm_and_si128(v_Y0, v_itsi1), INTER_BITS),
_mm_and_si128(v_X0, v_itsi1));
__m128i v_alpha1 = _mm_add_epi32(_mm_slli_epi32(_mm_and_si128(v_Y1, v_itsi1), INTER_BITS),
_mm_and_si128(v_X1, v_itsi1));
_mm_storeu_si128((__m128i *)(alpha + x1), _mm_packs_epi32(v_alpha0, v_alpha1));
v_alpha0 = _mm_add_epi32(_mm_slli_epi32(_mm_and_si128(v_Y2, v_itsi1), INTER_BITS),
_mm_and_si128(v_X2, v_itsi1));
v_alpha1 = _mm_add_epi32(_mm_slli_epi32(_mm_and_si128(v_Y3, v_itsi1), INTER_BITS),
_mm_and_si128(v_X3, v_itsi1));
_mm_storeu_si128((__m128i *)(alpha + x1 + 8), _mm_packs_epi32(v_alpha0, v_alpha1));
// convert to 16s
v_X0 = _mm_packs_epi32(_mm_srai_epi32(v_X0, INTER_BITS), _mm_srai_epi32(v_X1, INTER_BITS));
v_X1 = _mm_packs_epi32(_mm_srai_epi32(v_X2, INTER_BITS), _mm_srai_epi32(v_X3, INTER_BITS));
v_Y0 = _mm_packs_epi32(_mm_srai_epi32(v_Y0, INTER_BITS), _mm_srai_epi32(v_Y1, INTER_BITS));
v_Y1 = _mm_packs_epi32(_mm_srai_epi32(v_Y2, INTER_BITS), _mm_srai_epi32(v_Y3, INTER_BITS));
_mm_interleave_epi16(v_X0, v_X1, v_Y0, v_Y1);
_mm_storeu_si128((__m128i *)(xy + x1 * 2), v_X0);
_mm_storeu_si128((__m128i *)(xy + x1 * 2 + 8), v_X1);
_mm_storeu_si128((__m128i *)(xy + x1 * 2 + 16), v_Y0);
_mm_storeu_si128((__m128i *)(xy + x1 * 2 + 24), v_Y1);
}
}
#endif
for( ; x1 < bw; x1++ )
{ {
double W = W0 + M[6]*x1; double W = W0 + M[6]*x1;
W = W ? INTER_TAB_SIZE/W : 0; W = W ? INTER_TAB_SIZE/W : 0;

View File

@ -12,6 +12,7 @@
// //
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved. // Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Copyright (C) 2014-2015, Itseez Inc., all rights reserved.
// Third party copyrights are property of their respective owners. // Third party copyrights are property of their respective owners.
// //
// Redistribution and use in source and binary forms, with or without modification, // Redistribution and use in source and binary forms, with or without modification,
@ -183,13 +184,336 @@ struct PyrDownVec_32f
} }
}; };
typedef PyrDownNoVec<int, ushort> PyrDownVec_32s16u; #if CV_SSE4_1
typedef PyrDownNoVec<int, short> PyrDownVec_32s16s;
// SSE4.1 vertical pass of the 5-tap (1 4 6 4 1) pyramid-down filter:
// 32-bit row sums in, rounded and >> 8, packed to unsigned 16-bit out.
struct PyrDownVec_32s16u
{
    PyrDownVec_32s16u() : haveSSE(checkHardwareSupport(CV_CPU_SSE4_1))
    {
    }

    // src holds five row pointers, dst is the output row; the third argument is unused.
    // Returns the number of leading elements written (a multiple of 8, or 0 without SSE4.1).
    int operator()(int** src, ushort* dst, int, int width) const
    {
        if (!haveSSE)
            return 0;

        const int * const r0 = src[0];
        const int * const r1 = src[1];
        const int * const r2 = src[2];
        const int * const r3 = src[3];
        const int * const r4 = src[4];

        int x = 0;
        while (x <= width - 8)
        {
            // logical shift: the result is packed as unsigned
            const __m128i v_lo = _mm_srli_epi32(biasedSum(r0 + x, r1 + x, r2 + x, r3 + x, r4 + x), 8);
            const __m128i v_hi = _mm_srli_epi32(biasedSum(r0 + x + 4, r1 + x + 4, r2 + x + 4, r3 + x + 4, r4 + x + 4), 8);

            _mm_storeu_si128((__m128i *)(dst + x), _mm_packus_epi32(v_lo, v_hi));
            x += 8;
        }

        return x;
    }

    bool haveSSE;

private:
    // p0 + 4*p1 + 6*p2 + 4*p3 + p4 + 128 for four consecutive 32-bit lanes
    static __m128i biasedSum(const int* p0, const int* p1, const int* p2, const int* p3, const int* p4)
    {
        const __m128i v_mid = _mm_loadu_si128((__m128i const *)p2);
        const __m128i v_outer = _mm_add_epi32(_mm_add_epi32(_mm_loadu_si128((__m128i const *)p0),
                                                            _mm_loadu_si128((__m128i const *)p4)),
                                              _mm_add_epi32(v_mid, v_mid));
        const __m128i v_inner = _mm_slli_epi32(_mm_add_epi32(_mm_add_epi32(_mm_loadu_si128((__m128i const *)p1), v_mid),
                                                             _mm_loadu_si128((__m128i const *)p3)), 2);
        return _mm_add_epi32(_mm_add_epi32(v_outer, v_inner), _mm_set1_epi32(128));
    }
};
#else
typedef PyrDownNoVec<int, ushort> PyrDownVec_32s16u;
#endif // CV_SSE4_1
// SSE2 vertical pass of the 5-tap (1 4 6 4 1) pyramid-down filter:
// 32-bit row sums in, rounded and >> 8, packed to signed 16-bit out.
struct PyrDownVec_32s16s
{
    PyrDownVec_32s16s() : haveSSE(checkHardwareSupport(CV_CPU_SSE2))
    {
    }

    // src holds five row pointers, dst is the output row; the third argument is unused.
    // Returns the number of leading elements written (a multiple of 8, or 0 without SSE2).
    int operator()(int** src, short* dst, int, int width) const
    {
        if (!haveSSE)
            return 0;

        const int * const r0 = src[0];
        const int * const r1 = src[1];
        const int * const r2 = src[2];
        const int * const r3 = src[3];
        const int * const r4 = src[4];

        int x = 0;
        while (x <= width - 8)
        {
            // arithmetic shift: the values are signed
            const __m128i v_lo = _mm_srai_epi32(biasedSum(r0 + x, r1 + x, r2 + x, r3 + x, r4 + x), 8);
            const __m128i v_hi = _mm_srai_epi32(biasedSum(r0 + x + 4, r1 + x + 4, r2 + x + 4, r3 + x + 4, r4 + x + 4), 8);

            _mm_storeu_si128((__m128i *)(dst + x), _mm_packs_epi32(v_lo, v_hi));
            x += 8;
        }

        return x;
    }

    bool haveSSE;

private:
    // p0 + 4*p1 + 6*p2 + 4*p3 + p4 + 128 for four consecutive 32-bit lanes
    static __m128i biasedSum(const int* p0, const int* p1, const int* p2, const int* p3, const int* p4)
    {
        const __m128i v_mid = _mm_loadu_si128((__m128i const *)p2);
        const __m128i v_outer = _mm_add_epi32(_mm_add_epi32(_mm_loadu_si128((__m128i const *)p0),
                                                            _mm_loadu_si128((__m128i const *)p4)),
                                              _mm_add_epi32(v_mid, v_mid));
        const __m128i v_inner = _mm_slli_epi32(_mm_add_epi32(_mm_add_epi32(_mm_loadu_si128((__m128i const *)p1), v_mid),
                                                             _mm_loadu_si128((__m128i const *)p3)), 2);
        return _mm_add_epi32(_mm_add_epi32(v_outer, v_inner), _mm_set1_epi32(128));
    }
};
struct PyrUpVec_32s8u
{
int operator()(int** src, uchar** dst, int, int width) const
{
int x = 0;
if (!checkHardwareSupport(CV_CPU_SSE2))
return x;
uchar *dst0 = dst[0], *dst1 = dst[1];
const uint *row0 = (uint *)src[0], *row1 = (uint *)src[1], *row2 = (uint *)src[2];
__m128i v_delta = _mm_set1_epi16(32), v_zero = _mm_setzero_si128();
for( ; x <= width - 16; x += 16 )
{
__m128i v_r0 = _mm_packs_epi32(_mm_loadu_si128((__m128i const *)(row0 + x)),
_mm_loadu_si128((__m128i const *)(row0 + x + 4)));
__m128i v_r1 = _mm_packs_epi32(_mm_loadu_si128((__m128i const *)(row1 + x)),
_mm_loadu_si128((__m128i const *)(row1 + x + 4)));
__m128i v_r2 = _mm_packs_epi32(_mm_loadu_si128((__m128i const *)(row2 + x)),
_mm_loadu_si128((__m128i const *)(row2 + x + 4)));
__m128i v_2r1 = _mm_adds_epu16(v_r1, v_r1), v_4r1 = _mm_adds_epu16(v_2r1, v_2r1);
__m128i v_dst00 = _mm_adds_epu16(_mm_adds_epu16(v_r0, v_r2), _mm_adds_epu16(v_2r1, v_4r1));
__m128i v_dst10 = _mm_slli_epi16(_mm_adds_epu16(v_r1, v_r2), 2);
v_r0 = _mm_packs_epi32(_mm_loadu_si128((__m128i const *)(row0 + x + 8)),
_mm_loadu_si128((__m128i const *)(row0 + x + 12)));
v_r1 = _mm_packs_epi32(_mm_loadu_si128((__m128i const *)(row1 + x + 8)),
_mm_loadu_si128((__m128i const *)(row1 + x + 12)));
v_r2 = _mm_packs_epi32(_mm_loadu_si128((__m128i const *)(row2 + x + 8)),
_mm_loadu_si128((__m128i const *)(row2 + x + 12)));
v_2r1 = _mm_adds_epu16(v_r1, v_r1), v_4r1 = _mm_adds_epu16(v_2r1, v_2r1);
__m128i v_dst01 = _mm_adds_epu16(_mm_adds_epu16(v_r0, v_r2), _mm_adds_epu16(v_2r1, v_4r1));
__m128i v_dst11 = _mm_slli_epi16(_mm_adds_epu16(v_r1, v_r2), 2);
_mm_storeu_si128((__m128i *)(dst0 + x), _mm_packus_epi16(_mm_srli_epi16(_mm_adds_epu16(v_dst00, v_delta), 6),
_mm_srli_epi16(_mm_adds_epu16(v_dst01, v_delta), 6)));
_mm_storeu_si128((__m128i *)(dst1 + x), _mm_packus_epi16(_mm_srli_epi16(_mm_adds_epu16(v_dst10, v_delta), 6),
_mm_srli_epi16(_mm_adds_epu16(v_dst11, v_delta), 6)));
}
for( ; x <= width - 8; x += 8 )
{
__m128i v_r0 = _mm_packs_epi32(_mm_loadu_si128((__m128i const *)(row0 + x)),
_mm_loadu_si128((__m128i const *)(row0 + x + 4)));
__m128i v_r1 = _mm_packs_epi32(_mm_loadu_si128((__m128i const *)(row1 + x)),
_mm_loadu_si128((__m128i const *)(row1 + x + 4)));
__m128i v_r2 = _mm_packs_epi32(_mm_loadu_si128((__m128i const *)(row2 + x)),
_mm_loadu_si128((__m128i const *)(row2 + x + 4)));
__m128i v_2r1 = _mm_adds_epu16(v_r1, v_r1), v_4r1 = _mm_adds_epu16(v_2r1, v_2r1);
__m128i v_dst0 = _mm_adds_epu16(_mm_adds_epu16(v_r0, v_r2), _mm_adds_epu16(v_2r1, v_4r1));
__m128i v_dst1 = _mm_slli_epi16(_mm_adds_epu16(v_r1, v_r2), 2);
_mm_storel_epi64((__m128i *)(dst0 + x), _mm_packus_epi16(_mm_srli_epi16(_mm_adds_epu16(v_dst0, v_delta), 6), v_zero));
_mm_storel_epi64((__m128i *)(dst1 + x), _mm_packus_epi16(_mm_srli_epi16(_mm_adds_epu16(v_dst1, v_delta), 6), v_zero));
}
return x;
}
};
// SSE2 vertical pass of pyramid-up for signed 16-bit output. From three 32-bit
// input rows it produces two output rows:
//   dst[0] = (row0 + 6*row1 + row2 + 32) >> 6
//   dst[1] = (4*(row1 + row2) + 32) >> 6
// computed in 32-bit lanes and packed with signed saturation.
struct PyrUpVec_32s16s
{
    // The third argument is unused. Returns the number of leading elements
    // written to each output row (a multiple of 4, or 0 without SSE2).
    int operator()(int** src, short** dst, int, int width) const
    {
        int x = 0;
        if (!checkHardwareSupport(CV_CPU_SSE2))
            return x;

        short * const out0 = dst[0];
        short * const out1 = dst[1];
        const uint * const r0 = (uint *)src[0];
        const uint * const r1 = (uint *)src[1];
        const uint * const r2 = (uint *)src[2];

        __m128i v_a0, v_a1, v_b0, v_b1;

        // 8 outputs per row and iteration
        while (x <= width - 8)
        {
            filter4(r0 + x, r1 + x, r2 + x, v_a0, v_b0);
            filter4(r0 + x + 4, r1 + x + 4, r2 + x + 4, v_a1, v_b1);

            _mm_storeu_si128((__m128i *)(out0 + x), _mm_packs_epi32(v_a0, v_a1));
            _mm_storeu_si128((__m128i *)(out1 + x), _mm_packs_epi32(v_b0, v_b1));
            x += 8;
        }

        // remaining groups of 4: only the low 64 bits of the packed result are stored
        const __m128i v_none = _mm_setzero_si128();
        while (x <= width - 4)
        {
            filter4(r0 + x, r1 + x, r2 + x, v_a0, v_b0);

            _mm_storel_epi64((__m128i *)(out0 + x), _mm_packs_epi32(v_a0, v_none));
            _mm_storel_epi64((__m128i *)(out1 + x), _mm_packs_epi32(v_b0, v_none));
            x += 4;
        }

        return x;
    }

private:
    // filters four columns; v_out0 / v_out1 receive the rounded, shifted
    // 32-bit results for the first and second output row
    static void filter4(const uint* p0, const uint* p1, const uint* p2, __m128i& v_out0, __m128i& v_out1)
    {
        const __m128i v_bias = _mm_set1_epi32(32);
        const __m128i v_top = _mm_loadu_si128((__m128i const *)p0);
        const __m128i v_mid = _mm_loadu_si128((__m128i const *)p1);
        const __m128i v_bot = _mm_loadu_si128((__m128i const *)p2);

        // 6 * mid is formed as 2 * mid + 4 * mid
        const __m128i v_sum0 = _mm_add_epi32(_mm_add_epi32(v_top, v_bot),
                                             _mm_add_epi32(_mm_slli_epi32(v_mid, 1), _mm_slli_epi32(v_mid, 2)));
        const __m128i v_sum1 = _mm_slli_epi32(_mm_add_epi32(v_mid, v_bot), 2);

        v_out0 = _mm_srai_epi32(_mm_add_epi32(v_sum0, v_bias), 6);
        v_out1 = _mm_srai_epi32(_mm_add_epi32(v_sum1, v_bias), 6);
    }
};
#if CV_SSE4_1
// SSE4.1 kernel that turns three rows of 32-bit integers into two rows of
// unsigned 16-bit output (presumably the vertical pass of pyrUp -- the caller
// is not visible here, confirm against the call site):
//
//   dst[0][i] = saturate_u16((src[0][i] + 6*src[1][i] + src[2][i] + 32) >> 6)
//   dst[1][i] = saturate_u16((4*(src[1][i] + src[2][i])           + 32) >> 6)
//
// The shift is a logical (zero-filling) shift; the narrowing is
// _mm_packus_epi32, which is the SSE4.1 instruction this struct depends on.
// The third (unnamed) argument is ignored.
//
// Returns the number of leading elements that were written: a multiple of 4,
// or 0 when SSE4.1 is not available at run time.
struct PyrUpVec_32s16u
{
    int operator()(int** src, ushort** dst, int, int width) const
    {
        if (!checkHardwareSupport(CV_CPU_SSE4_1))
            return 0;

        const uint *top = (uint *)src[0], *mid = (uint *)src[1], *bot = (uint *)src[2];
        ushort *even = dst[0], *odd = dst[1];

        const __m128i v_round = _mm_set1_epi32(32);   // rounding bias for the >> 6
        const __m128i v_zero = _mm_setzero_si128();

        int x = 0;

        // Main loop: 8 outputs per row per iteration, built from two 4-lane halves.
        for ( ; x <= width - 8; x += 8)
        {
            __m128i v_even[2], v_odd[2];

            for (int half = 0; half < 2; ++half)
            {
                const int ofs = x + 4 * half;
                const __m128i v_top = _mm_loadu_si128((__m128i const *)(top + ofs));
                const __m128i v_mid = _mm_loadu_si128((__m128i const *)(mid + ofs));
                const __m128i v_bot = _mm_loadu_si128((__m128i const *)(bot + ofs));

                // 6*mid is formed as 2*mid + 4*mid
                const __m128i v_6mid = _mm_add_epi32(_mm_slli_epi32(v_mid, 1),
                                                     _mm_slli_epi32(v_mid, 2));
                const __m128i v_sum_even = _mm_add_epi32(_mm_add_epi32(v_top, v_bot), v_6mid);
                const __m128i v_sum_odd = _mm_slli_epi32(_mm_add_epi32(v_mid, v_bot), 2);

                v_even[half] = _mm_srli_epi32(_mm_add_epi32(v_sum_even, v_round), 6);
                v_odd[half] = _mm_srli_epi32(_mm_add_epi32(v_sum_odd, v_round), 6);
            }

            _mm_storeu_si128((__m128i *)(even + x), _mm_packus_epi32(v_even[0], v_even[1]));
            _mm_storeu_si128((__m128i *)(odd + x), _mm_packus_epi32(v_odd[0], v_odd[1]));
        }

        // Remainder loop: 4 outputs per row; only the low 64 bits of the pack are stored.
        for ( ; x <= width - 4; x += 4)
        {
            const __m128i v_top = _mm_loadu_si128((__m128i const *)(top + x));
            const __m128i v_mid = _mm_loadu_si128((__m128i const *)(mid + x));
            const __m128i v_bot = _mm_loadu_si128((__m128i const *)(bot + x));

            const __m128i v_6mid = _mm_add_epi32(_mm_slli_epi32(v_mid, 1),
                                                 _mm_slli_epi32(v_mid, 2));
            const __m128i v_sum_even = _mm_add_epi32(_mm_add_epi32(v_top, v_bot), v_6mid);
            const __m128i v_sum_odd = _mm_slli_epi32(_mm_add_epi32(v_mid, v_bot), 2);

            const __m128i v_even = _mm_srli_epi32(_mm_add_epi32(v_sum_even, v_round), 6);
            const __m128i v_odd = _mm_srli_epi32(_mm_add_epi32(v_sum_odd, v_round), 6);

            _mm_storel_epi64((__m128i *)(even + x), _mm_packus_epi32(v_even, v_zero));
            _mm_storel_epi64((__m128i *)(odd + x), _mm_packus_epi32(v_odd, v_zero));
        }

        return x;
    }
};
#else
typedef PyrUpNoVec<int, uchar> PyrUpVec_32s8u;
typedef PyrUpNoVec<int, short> PyrUpVec_32s16s;
typedef PyrUpNoVec<int, ushort> PyrUpVec_32s16u; typedef PyrUpNoVec<int, ushort> PyrUpVec_32s16u;
typedef PyrUpNoVec<float, float> PyrUpVec_32f;
#endif // CV_SSE4_1
// SSE kernel that turns three rows of floats into two rows of float output
// (presumably the vertical pass of pyrUp -- the caller is not visible here,
// confirm against the call site):
//
//   dst[0][i] = ((src[0][i] + 6*src[1][i]) + src[2][i]) * (1/64)
//   dst[1][i] = (src[1][i] + src[2][i]) * (4 * 1/64)
//
// Only whole groups of 8 elements are handled. The third (unnamed) argument
// is ignored.
//
// Returns the number of leading elements that were written: a multiple of 8,
// or 0 when SSE2 is not available at run time.
struct PyrUpVec_32f
{
    int operator()(float** src, float** dst, int, int width) const
    {
        if (!checkHardwareSupport(CV_CPU_SSE2))
            return 0;

        const float *top = src[0], *mid = src[1], *bot = src[2];
        float *even = dst[0], *odd = dst[1];

        const __m128 v_six = _mm_set1_ps(6.0f);
        const __m128 v_inv64 = _mm_set1_ps(1.f/64.0f);
        const __m128 v_inv16 = _mm_mul_ps(v_inv64, _mm_set1_ps(4.0f));

        int x = 0;
        while (x <= width - 8)
        {
            // Two 4-lane steps per group of 8; each step loads, then writes
            // the odd row followed by the even row.
            const int stop = x + 8;
            for ( ; x < stop; x += 4)
            {
                const __m128 v_top = _mm_loadu_ps(top + x);
                const __m128 v_mid = _mm_loadu_ps(mid + x);
                const __m128 v_bot = _mm_loadu_ps(bot + x);

                const __m128 v_odd = _mm_mul_ps(v_inv16, _mm_add_ps(v_mid, v_bot));
                _mm_storeu_ps(odd + x, v_odd);

                const __m128 v_weighted = _mm_add_ps(v_top, _mm_mul_ps(v_six, v_mid));
                const __m128 v_even = _mm_mul_ps(v_inv64, _mm_add_ps(v_weighted, v_bot));
                _mm_storeu_ps(even + x, v_even);
            }
        }

        return x;
    }
};
#elif CV_NEON #elif CV_NEON

View File

@ -12,6 +12,7 @@
// //
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved. // Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Copyright (C) 2014-2015, Itseez Inc., all rights reserved.
// Third party copyrights are property of their respective owners. // Third party copyrights are property of their respective owners.
// //
// Redistribution and use in source and binary forms, with or without modification, // Redistribution and use in source and binary forms, with or without modification,
@ -713,6 +714,156 @@ struct ColumnSum<int, ushort> :
std::vector<int> sum; std::vector<int> sum;
}; };
// ColumnSum specialization for 32-bit integer input rows and 32-bit integer
// output rows.
//
// The filter keeps a per-column running sum (`sum`) of the last ksize-1 input
// rows. For every output row it adds the newest row, writes the (optionally
// scaled) total to the destination, and subtracts the oldest row of the
// window so the running sum is ready for the next call/row.
// `ksize` and `anchor` are not declared in this struct; presumably they are
// members inherited from BaseColumnFilter -- confirm against its declaration.
template<>
struct ColumnSum<int, int> :
public BaseColumnFilter
{
// _ksize : number of rows in the summation window.
// _anchor: stored in `anchor`; not otherwise used inside this struct.
// _scale : multiplier applied to every window sum (1 means "no scaling").
ColumnSum( int _ksize, int _anchor, double _scale ) :
BaseColumnFilter()
{
ksize = _ksize;
anchor = _anchor;
scale = _scale;
sumCount = 0;
}
// Forget the accumulated rows; the next operator() call re-primes the sum.
virtual void reset() { sumCount = 0; }
// Produce `count` output rows of `width` ints each.
//   src     : array of row pointers, each reinterpreted as const int*; the
//             code reads src[0] and src[1-ksize] relative to the current row.
//   dst     : first output row, reinterpreted as int*.
//   dststep : added to the uchar* dst after each row (i.e. a byte stride).
virtual void operator()(const uchar** src, uchar* dst, int dststep, int count, int width)
{
int i;
int* SUM;
bool haveScale = scale != 1;
double _scale = scale;
#if CV_SSE2
bool haveSSE2 = checkHardwareSupport(CV_CPU_SSE2);
#endif
// A change of row width invalidates the running sum: resize and restart.
if( width != (int)sum.size() )
{
sum.resize(width);
sumCount = 0;
}
SUM = &sum[0];
if( sumCount == 0 )
{
// Prime the running sum with the first ksize-1 rows.
memset((void*)SUM, 0, width*sizeof(int));
for( ; sumCount < ksize - 1; sumCount++, src++ )
{
const int* Sp = (const int*)src[0];
i = 0;
#if CV_SSE2
if(haveSSE2)
{
for( ; i <= width-4; i+=4 )
{
__m128i _sum = _mm_loadu_si128((const __m128i*)(SUM+i));
__m128i _sp = _mm_loadu_si128((const __m128i*)(Sp+i));
_mm_storeu_si128((__m128i*)(SUM+i),_mm_add_epi32(_sum, _sp));
}
}
#elif CV_NEON
for( ; i <= width - 4; i+=4 )
vst1q_s32(SUM + i, vaddq_s32(vld1q_s32(SUM + i), vld1q_s32(Sp + i)));
#endif
// Scalar tail (and the whole row when no SIMD path ran).
for( ; i < width; i++ )
SUM[i] += Sp[i];
}
}
else
{
// Already primed by an earlier call: skip to the newest row.
CV_Assert( sumCount == ksize-1 );
src += ksize-1;
}
for( ; count--; src++ )
{
const int* Sp = (const int*)src[0];        // row entering the window
const int* Sm = (const int*)src[1-ksize];  // row leaving the window
int* D = (int*)dst;
if( haveScale )
{
i = 0;
#if CV_SSE2
if(haveSSE2)
{
// NOTE(review): this path scales in single precision and converts with
// _mm_cvtps_epi32, while the scalar tail below scales in double and uses
// saturate_cast<int>; results may differ for large sums -- confirm this
// is acceptable.
const __m128 scale4 = _mm_set1_ps((float)_scale);
for( ; i <= width-4; i+=4 )
{
__m128i _sm = _mm_loadu_si128((const __m128i*)(Sm+i));
__m128i _s0 = _mm_add_epi32(_mm_loadu_si128((const __m128i*)(SUM+i)),
_mm_loadu_si128((const __m128i*)(Sp+i)));
__m128i _s0T = _mm_cvtps_epi32(_mm_mul_ps(scale4, _mm_cvtepi32_ps(_s0)));
_mm_storeu_si128((__m128i*)(D+i), _s0T);
_mm_storeu_si128((__m128i*)(SUM+i),_mm_sub_epi32(_s0,_sm));
}
}
#elif CV_NEON
float32x4_t v_scale = vdupq_n_f32((float)_scale);
for( ; i <= width-4; i+=4 )
{
int32x4_t v_s0 = vaddq_s32(vld1q_s32(SUM + i), vld1q_s32(Sp + i));
int32x4_t v_s0d = cv_vrndq_s32_f32(vmulq_f32(vcvtq_f32_s32(v_s0), v_scale));
vst1q_s32(D + i, v_s0d);
vst1q_s32(SUM + i, vsubq_s32(v_s0, vld1q_s32(Sm + i)));
}
#endif
// Scalar tail: full window sum, scaled output, then drop the oldest row.
for( ; i < width; i++ )
{
int s0 = SUM[i] + Sp[i];
D[i] = saturate_cast<int>(s0*_scale);
SUM[i] = s0 - Sm[i];
}
}
else
{
// Unscaled variant: the window sum is written out as-is.
i = 0;
#if CV_SSE2
if(haveSSE2)
{
for( ; i <= width-4; i+=4 )
{
__m128i _sm = _mm_loadu_si128((const __m128i*)(Sm+i));
__m128i _s0 = _mm_add_epi32(_mm_loadu_si128((const __m128i*)(SUM+i)),
_mm_loadu_si128((const __m128i*)(Sp+i)));
_mm_storeu_si128((__m128i*)(D+i), _s0);
_mm_storeu_si128((__m128i*)(SUM+i), _mm_sub_epi32(_s0,_sm));
}
}
#elif CV_NEON
for( ; i <= width-4; i+=4 )
{
int32x4_t v_s0 = vaddq_s32(vld1q_s32(SUM + i), vld1q_s32(Sp + i));
vst1q_s32(D + i, v_s0);
vst1q_s32(SUM + i, vsubq_s32(v_s0, vld1q_s32(Sm + i)));
}
#endif
for( ; i < width; i++ )
{
int s0 = SUM[i] + Sp[i];
D[i] = s0;
SUM[i] = s0 - Sm[i];
}
}
dst += dststep;
}
}
double scale;          // multiplier applied to each window sum
int sumCount;          // number of rows currently accumulated in `sum`
std::vector<int> sum;  // per-column running sum, one entry per column
};
template<> template<>
struct ColumnSum<int, float> : struct ColumnSum<int, float> :
public BaseColumnFilter public BaseColumnFilter

View File

@ -12,6 +12,7 @@
// //
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved. // Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Copyright (C) 2014, Itseez Inc., all rights reserved.
// Third party copyrights are property of their respective owners. // Third party copyrights are property of their respective owners.
// //
// Redistribution and use in source and binary forms, with or without modification, // Redistribution and use in source and binary forms, with or without modification,

View File

@ -1595,7 +1595,10 @@ void resizeArea(const cv::Mat & src, cv::Mat & dst)
TEST(Resize, Area_half) TEST(Resize, Area_half)
{ {
const int size = 1000; const int size = 1000;
int types[] = { CV_8UC1, CV_8UC4, CV_16UC1, CV_16UC4, CV_16SC1, CV_16SC4, CV_32FC1, CV_32FC4 }; int types[] = { CV_8UC1, CV_8UC4,
CV_16UC1, CV_16UC4,
CV_16SC1, CV_16SC3, CV_16SC4,
CV_32FC1, CV_32FC4 };
cv::RNG rng(17); cv::RNG rng(17);

View File

@ -64,6 +64,7 @@ TEST(Photo_SeamlessClone_normal, regression)
string original_path1 = folder + "source1.png"; string original_path1 = folder + "source1.png";
string original_path2 = folder + "destination1.png"; string original_path2 = folder + "destination1.png";
string original_path3 = folder + "mask.png"; string original_path3 = folder + "mask.png";
string reference_path = folder + "reference.png";
Mat source = imread(original_path1, IMREAD_COLOR); Mat source = imread(original_path1, IMREAD_COLOR);
Mat destination = imread(original_path2, IMREAD_COLOR); Mat destination = imread(original_path2, IMREAD_COLOR);
@ -79,8 +80,8 @@ TEST(Photo_SeamlessClone_normal, regression)
p.y = destination.size().height/2; p.y = destination.size().height/2;
seamlessClone(source, destination, mask, p, result, 1); seamlessClone(source, destination, mask, p, result, 1);
Mat reference = imread(reference_path);
Mat reference = imread(folder + "reference.png"); ASSERT_FALSE(reference.empty()) << "Could not load reference image " << reference_path;
SAVE(result); SAVE(result);
@ -94,6 +95,7 @@ TEST(Photo_SeamlessClone_mixed, regression)
string original_path1 = folder + "source1.png"; string original_path1 = folder + "source1.png";
string original_path2 = folder + "destination1.png"; string original_path2 = folder + "destination1.png";
string original_path3 = folder + "mask.png"; string original_path3 = folder + "mask.png";
string reference_path = folder + "reference.png";
Mat source = imread(original_path1, IMREAD_COLOR); Mat source = imread(original_path1, IMREAD_COLOR);
Mat destination = imread(original_path2, IMREAD_COLOR); Mat destination = imread(original_path2, IMREAD_COLOR);
@ -111,7 +113,9 @@ TEST(Photo_SeamlessClone_mixed, regression)
SAVE(result); SAVE(result);
Mat reference = imread(folder + "reference.png"); Mat reference = imread(reference_path);
ASSERT_FALSE(reference.empty()) << "Could not load reference image " << reference_path;
double error = cvtest::norm(reference, result, NORM_L1); double error = cvtest::norm(reference, result, NORM_L1);
EXPECT_LE(error, numerical_precision); EXPECT_LE(error, numerical_precision);
@ -123,6 +127,7 @@ TEST(Photo_SeamlessClone_featureExchange, regression)
string original_path1 = folder + "source1.png"; string original_path1 = folder + "source1.png";
string original_path2 = folder + "destination1.png"; string original_path2 = folder + "destination1.png";
string original_path3 = folder + "mask.png"; string original_path3 = folder + "mask.png";
string reference_path = folder + "reference.png";
Mat source = imread(original_path1, IMREAD_COLOR); Mat source = imread(original_path1, IMREAD_COLOR);
Mat destination = imread(original_path2, IMREAD_COLOR); Mat destination = imread(original_path2, IMREAD_COLOR);
@ -140,7 +145,9 @@ TEST(Photo_SeamlessClone_featureExchange, regression)
SAVE(result); SAVE(result);
Mat reference = imread(folder + "reference.png"); Mat reference = imread(reference_path);
ASSERT_FALSE(reference.empty()) << "Could not load reference image " << reference_path;
double error = cvtest::norm(reference, result, NORM_L1); double error = cvtest::norm(reference, result, NORM_L1);
EXPECT_LE(error, numerical_precision); EXPECT_LE(error, numerical_precision);
@ -151,6 +158,7 @@ TEST(Photo_SeamlessClone_colorChange, regression)
string folder = string(cvtest::TS::ptr()->get_data_path()) + "cloning/color_change/"; string folder = string(cvtest::TS::ptr()->get_data_path()) + "cloning/color_change/";
string original_path1 = folder + "source1.png"; string original_path1 = folder + "source1.png";
string original_path2 = folder + "mask.png"; string original_path2 = folder + "mask.png";
string reference_path = folder + "reference.png";
Mat source = imread(original_path1, IMREAD_COLOR); Mat source = imread(original_path1, IMREAD_COLOR);
Mat mask = imread(original_path2, IMREAD_COLOR); Mat mask = imread(original_path2, IMREAD_COLOR);
@ -163,7 +171,9 @@ TEST(Photo_SeamlessClone_colorChange, regression)
SAVE(result); SAVE(result);
Mat reference = imread(folder + "reference.png"); Mat reference = imread(reference_path);
ASSERT_FALSE(reference.empty()) << "Could not load reference image " << reference_path;
double error = cvtest::norm(reference, result, NORM_L1); double error = cvtest::norm(reference, result, NORM_L1);
EXPECT_LE(error, numerical_precision); EXPECT_LE(error, numerical_precision);
@ -174,6 +184,7 @@ TEST(Photo_SeamlessClone_illuminationChange, regression)
string folder = string(cvtest::TS::ptr()->get_data_path()) + "cloning/Illumination_Change/"; string folder = string(cvtest::TS::ptr()->get_data_path()) + "cloning/Illumination_Change/";
string original_path1 = folder + "source1.png"; string original_path1 = folder + "source1.png";
string original_path2 = folder + "mask.png"; string original_path2 = folder + "mask.png";
string reference_path = folder + "reference.png";
Mat source = imread(original_path1, IMREAD_COLOR); Mat source = imread(original_path1, IMREAD_COLOR);
Mat mask = imread(original_path2, IMREAD_COLOR); Mat mask = imread(original_path2, IMREAD_COLOR);
@ -186,7 +197,7 @@ TEST(Photo_SeamlessClone_illuminationChange, regression)
SAVE(result); SAVE(result);
Mat reference = imread(folder + "reference.png"); Mat reference = imread(reference_path);
double error = cvtest::norm(reference, result, NORM_L1); double error = cvtest::norm(reference, result, NORM_L1);
EXPECT_LE(error, numerical_precision); EXPECT_LE(error, numerical_precision);
@ -197,6 +208,7 @@ TEST(Photo_SeamlessClone_textureFlattening, regression)
string folder = string(cvtest::TS::ptr()->get_data_path()) + "cloning/Texture_Flattening/"; string folder = string(cvtest::TS::ptr()->get_data_path()) + "cloning/Texture_Flattening/";
string original_path1 = folder + "source1.png"; string original_path1 = folder + "source1.png";
string original_path2 = folder + "mask.png"; string original_path2 = folder + "mask.png";
string reference_path = folder + "reference.png";
Mat source = imread(original_path1, IMREAD_COLOR); Mat source = imread(original_path1, IMREAD_COLOR);
Mat mask = imread(original_path2, IMREAD_COLOR); Mat mask = imread(original_path2, IMREAD_COLOR);
@ -209,7 +221,9 @@ TEST(Photo_SeamlessClone_textureFlattening, regression)
SAVE(result); SAVE(result);
Mat reference = imread(folder + "reference.png"); Mat reference = imread(reference_path);
ASSERT_FALSE(reference.empty()) << "Could not load reference image " << reference_path;
double error = cvtest::norm(reference, result, NORM_L1); double error = cvtest::norm(reference, result, NORM_L1);
EXPECT_LE(error, numerical_precision); EXPECT_LE(error, numerical_precision);

View File

@ -2998,6 +2998,12 @@ void printVersionInfo(bool useStdOut)
std::string cpu_features; std::string cpu_features;
#if CV_POPCNT
if (checkHardwareSupport(CV_CPU_POPCNT)) cpu_features += " popcnt";
#endif
#if CV_MMX
if (checkHardwareSupport(CV_CPU_MMX)) cpu_features += " mmx";
#endif
#if CV_SSE #if CV_SSE
if (checkHardwareSupport(CV_CPU_SSE)) cpu_features += " sse"; if (checkHardwareSupport(CV_CPU_SSE)) cpu_features += " sse";
#endif #endif
@ -3019,6 +3025,39 @@ void printVersionInfo(bool useStdOut)
#if CV_AVX #if CV_AVX
if (checkHardwareSupport(CV_CPU_AVX)) cpu_features += " avx"; if (checkHardwareSupport(CV_CPU_AVX)) cpu_features += " avx";
#endif #endif
#if CV_AVX2
if (checkHardwareSupport(CV_CPU_AVX2)) cpu_features += " avx2";
#endif
#if CV_FMA3
if (checkHardwareSupport(CV_CPU_FMA3)) cpu_features += " fma3";
#endif
// AVX-512 feature subsets. Each check is compiled only when the matching
// CV_AVX_512* macro is non-zero, and appends its tag to cpu_features when
// checkHardwareSupport() reports the feature at run time.
// Fix: every `if` condition below was missing its closing parenthesis, which
// is a syntax error as soon as one of these macros is enabled.
#if CV_AVX_512F
if (checkHardwareSupport(CV_CPU_AVX_512F)) cpu_features += " avx-512f";
#endif
#if CV_AVX_512BW
if (checkHardwareSupport(CV_CPU_AVX_512BW)) cpu_features += " avx-512bw";
#endif
#if CV_AVX_512CD
if (checkHardwareSupport(CV_CPU_AVX_512CD)) cpu_features += " avx-512cd";
#endif
#if CV_AVX_512DQ
if (checkHardwareSupport(CV_CPU_AVX_512DQ)) cpu_features += " avx-512dq";
#endif
#if CV_AVX_512ER
if (checkHardwareSupport(CV_CPU_AVX_512ER)) cpu_features += " avx-512er";
#endif
#if CV_AVX_512IFMA512
if (checkHardwareSupport(CV_CPU_AVX_512IFMA512)) cpu_features += " avx-512ifma512";
#endif
#if CV_AVX_512PF
if (checkHardwareSupport(CV_CPU_AVX_512PF)) cpu_features += " avx-512pf";
#endif
#if CV_AVX_512VBMI
if (checkHardwareSupport(CV_CPU_AVX_512VBMI)) cpu_features += " avx-512vbmi";
#endif
#if CV_AVX_512VL
if (checkHardwareSupport(CV_CPU_AVX_512VL)) cpu_features += " avx-512vl";
#endif
#if CV_NEON #if CV_NEON
if (checkHardwareSupport(CV_CPU_NEON)) cpu_features += " neon"; if (checkHardwareSupport(CV_CPU_NEON)) cpu_features += " neon";
#endif #endif