Merge pull request #3591 from ilya-lavrenov:sse_avx
This commit is contained in:
commit
03fc3d1ceb
@ -216,11 +216,14 @@ OCV_OPTION(ENABLE_POWERPC "Enable PowerPC for GCC"
|
||||
OCV_OPTION(ENABLE_FAST_MATH "Enable -ffast-math (not recommended for GCC 4.6.x)" OFF IF (CMAKE_COMPILER_IS_GNUCXX AND (X86 OR X86_64)) )
|
||||
OCV_OPTION(ENABLE_SSE "Enable SSE instructions" ON IF ((MSVC OR CMAKE_COMPILER_IS_GNUCXX) AND (X86 OR X86_64)) )
|
||||
OCV_OPTION(ENABLE_SSE2 "Enable SSE2 instructions" ON IF ((MSVC OR CMAKE_COMPILER_IS_GNUCXX) AND (X86 OR X86_64)) )
|
||||
OCV_OPTION(ENABLE_SSE3 "Enable SSE3 instructions" ON IF ((CV_ICC OR CMAKE_COMPILER_IS_GNUCXX) AND (X86 OR X86_64)) )
|
||||
OCV_OPTION(ENABLE_SSSE3 "Enable SSSE3 instructions" OFF IF (CMAKE_COMPILER_IS_GNUCXX AND (X86 OR X86_64)) )
|
||||
OCV_OPTION(ENABLE_SSE41 "Enable SSE4.1 instructions" OFF IF ((CV_ICC OR CMAKE_COMPILER_IS_GNUCXX) AND (X86 OR X86_64)) )
|
||||
OCV_OPTION(ENABLE_SSE42 "Enable SSE4.2 instructions" OFF IF (CMAKE_COMPILER_IS_GNUCXX AND (X86 OR X86_64)) )
|
||||
OCV_OPTION(ENABLE_SSE3 "Enable SSE3 instructions" ON IF ((MSVC OR CMAKE_COMPILER_IS_GNUCXX OR CV_ICC) AND (X86 OR X86_64)) )
|
||||
OCV_OPTION(ENABLE_SSSE3 "Enable SSSE3 instructions" OFF IF ((MSVC OR CMAKE_COMPILER_IS_GNUCXX) AND (X86 OR X86_64)) )
|
||||
OCV_OPTION(ENABLE_SSE41 "Enable SSE4.1 instructions" OFF IF ((MSVC OR CMAKE_COMPILER_IS_GNUCXX OR CV_ICC) AND (X86 OR X86_64)) )
|
||||
OCV_OPTION(ENABLE_SSE42 "Enable SSE4.2 instructions" OFF IF ((MSVC OR CMAKE_COMPILER_IS_GNUCXX) AND (X86 OR X86_64)) )
|
||||
OCV_OPTION(ENABLE_POPCNT "Enable POPCNT instructions" OFF IF ((MSVC OR CMAKE_COMPILER_IS_GNUCXX) AND (X86 OR X86_64)) )
|
||||
OCV_OPTION(ENABLE_AVX "Enable AVX instructions" OFF IF ((MSVC OR CMAKE_COMPILER_IS_GNUCXX) AND (X86 OR X86_64)) )
|
||||
OCV_OPTION(ENABLE_AVX2 "Enable AVX2 instructions" OFF IF ((MSVC OR CMAKE_COMPILER_IS_GNUCXX) AND (X86 OR X86_64)) )
|
||||
OCV_OPTION(ENABLE_FMA3 "Enable FMA3 instructions" OFF IF ((MSVC OR CMAKE_COMPILER_IS_GNUCXX) AND (X86 OR X86_64)) )
|
||||
OCV_OPTION(ENABLE_NEON "Enable NEON instructions" OFF IF CMAKE_COMPILER_IS_GNUCXX AND (ARM OR IOS) )
|
||||
OCV_OPTION(ENABLE_VFPV3 "Enable VFPv3-D32 instructions" OFF IF CMAKE_COMPILER_IS_GNUCXX AND (ARM OR IOS) )
|
||||
OCV_OPTION(ENABLE_NOISY_WARNINGS "Show all warnings even if they are too noisy" OFF )
|
||||
|
@ -128,10 +128,10 @@ if(CMAKE_COMPILER_IS_GNUCXX)
|
||||
if(ENABLE_SSE2)
|
||||
add_extra_compiler_option(-msse2)
|
||||
endif()
|
||||
if (ENABLE_NEON)
|
||||
if(ENABLE_NEON)
|
||||
add_extra_compiler_option("-mfpu=neon")
|
||||
endif()
|
||||
if (ENABLE_VFPV3 AND NOT ENABLE_NEON)
|
||||
if(ENABLE_VFPV3 AND NOT ENABLE_NEON)
|
||||
add_extra_compiler_option("-mfpu=vfpv3")
|
||||
endif()
|
||||
|
||||
@ -140,6 +140,13 @@ if(CMAKE_COMPILER_IS_GNUCXX)
|
||||
if(ENABLE_AVX)
|
||||
add_extra_compiler_option(-mavx)
|
||||
endif()
|
||||
if(ENABLE_AVX2)
|
||||
add_extra_compiler_option(-mavx2)
|
||||
|
||||
if(ENABLE_FMA3)
|
||||
add_extra_compiler_option(-mfma)
|
||||
endif()
|
||||
endif()
|
||||
|
||||
# GCC depresses SSEx instructions when -mavx is used. Instead, it generates new AVX instructions or AVX equivalence for all SSEx instructions when needed.
|
||||
if(NOT OPENCV_EXTRA_CXX_FLAGS MATCHES "-mavx")
|
||||
@ -158,6 +165,10 @@ if(CMAKE_COMPILER_IS_GNUCXX)
|
||||
if(ENABLE_SSE42)
|
||||
add_extra_compiler_option(-msse4.2)
|
||||
endif()
|
||||
|
||||
if(ENABLE_POPCNT)
|
||||
add_extra_compiler_option(-mpopcnt)
|
||||
endif()
|
||||
endif()
|
||||
endif(NOT MINGW)
|
||||
|
||||
@ -214,7 +225,10 @@ if(MSVC)
|
||||
set(OPENCV_EXTRA_FLAGS_RELEASE "${OPENCV_EXTRA_FLAGS_RELEASE} /Zi")
|
||||
endif()
|
||||
|
||||
if(ENABLE_AVX AND NOT MSVC_VERSION LESS 1600)
|
||||
if(ENABLE_AVX2 AND NOT MSVC_VERSION LESS 1800)
|
||||
set(OPENCV_EXTRA_FLAGS "${OPENCV_EXTRA_FLAGS} /arch:AVX2")
|
||||
endif()
|
||||
if(ENABLE_AVX AND NOT MSVC_VERSION LESS 1600 AND NOT OPENCV_EXTRA_FLAGS MATCHES "/arch:")
|
||||
set(OPENCV_EXTRA_FLAGS "${OPENCV_EXTRA_FLAGS} /arch:AVX")
|
||||
endif()
|
||||
|
||||
@ -236,7 +250,7 @@ if(MSVC)
|
||||
endif()
|
||||
endif()
|
||||
|
||||
if(ENABLE_SSE OR ENABLE_SSE2 OR ENABLE_SSE3 OR ENABLE_SSE4_1 OR ENABLE_AVX)
|
||||
if(ENABLE_SSE OR ENABLE_SSE2 OR ENABLE_SSE3 OR ENABLE_SSE4_1 OR ENABLE_AVX OR ENABLE_AVX2)
|
||||
set(OPENCV_EXTRA_FLAGS "${OPENCV_EXTRA_FLAGS} /Oi")
|
||||
endif()
|
||||
|
||||
@ -308,6 +322,7 @@ if(MSVC)
|
||||
endforeach()
|
||||
|
||||
if(NOT ENABLE_NOISY_WARNINGS)
|
||||
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /wd4251") #class 'std::XXX' needs to have dll-interface to be used by clients of YYY
|
||||
ocv_warnings_disable(CMAKE_CXX_FLAGS /wd4251) # class 'std::XXX' needs to have dll-interface to be used by clients of YYY
|
||||
ocv_warnings_disable(CMAKE_CXX_FLAGS /wd4324) # 'struct_name' : structure was padded due to __declspec(align())
|
||||
endif()
|
||||
endif()
|
||||
|
@ -13,6 +13,7 @@
|
||||
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
|
||||
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
|
||||
// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
|
||||
// Copyright (C) 2014, Itseez Inc., all rights reserved.
|
||||
// Third party copyrights are property of their respective owners.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without modification,
|
||||
@ -813,4 +814,6 @@ inline float32x2_t cv_vsqrt_f32(float32x2_t val)
|
||||
|
||||
} // cv
|
||||
|
||||
#include "sse_utils.hpp"
|
||||
|
||||
#endif //__OPENCV_CORE_BASE_HPP__
|
||||
|
@ -13,6 +13,7 @@
|
||||
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
|
||||
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
|
||||
// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
|
||||
// Copyright (C) 2015, Itseez Inc., all rights reserved.
|
||||
// Third party copyrights are property of their respective owners.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without modification,
|
||||
@ -104,17 +105,32 @@
|
||||
#endif
|
||||
|
||||
/* CPU features and intrinsics support */
|
||||
#define CV_CPU_NONE 0
|
||||
#define CV_CPU_MMX 1
|
||||
#define CV_CPU_SSE 2
|
||||
#define CV_CPU_SSE2 3
|
||||
#define CV_CPU_SSE3 4
|
||||
#define CV_CPU_SSSE3 5
|
||||
#define CV_CPU_SSE4_1 6
|
||||
#define CV_CPU_SSE4_2 7
|
||||
#define CV_CPU_POPCNT 8
|
||||
#define CV_CPU_AVX 10
|
||||
#define CV_CPU_NEON 11
|
||||
#define CV_CPU_NONE 0
|
||||
#define CV_CPU_MMX 1
|
||||
#define CV_CPU_SSE 2
|
||||
#define CV_CPU_SSE2 3
|
||||
#define CV_CPU_SSE3 4
|
||||
#define CV_CPU_SSSE3 5
|
||||
#define CV_CPU_SSE4_1 6
|
||||
#define CV_CPU_SSE4_2 7
|
||||
#define CV_CPU_POPCNT 8
|
||||
|
||||
#define CV_CPU_AVX 10
|
||||
#define CV_CPU_AVX2 11
|
||||
#define CV_CPU_FMA3 12
|
||||
|
||||
#define CV_CPU_AVX_512F 13
|
||||
#define CV_CPU_AVX_512BW 14
|
||||
#define CV_CPU_AVX_512CD 15
|
||||
#define CV_CPU_AVX_512DQ 16
|
||||
#define CV_CPU_AVX_512ER 17
|
||||
#define CV_CPU_AVX_512IFMA512 18
|
||||
#define CV_CPU_AVX_512PF 19
|
||||
#define CV_CPU_AVX_512VBMI 20
|
||||
#define CV_CPU_AVX_512VL 21
|
||||
|
||||
#define CV_CPU_NEON 100
|
||||
|
||||
// when adding to this list remember to update the enum in core/utility.cpp
|
||||
#define CV_HARDWARE_MAX_FEATURE 255
|
||||
|
||||
@ -123,6 +139,7 @@
|
||||
|
||||
#if defined __SSE2__ || defined _M_X64 || (defined _M_IX86_FP && _M_IX86_FP >= 2)
|
||||
# include <emmintrin.h>
|
||||
# define CV_MMX 1
|
||||
# define CV_SSE 1
|
||||
# define CV_SSE2 1
|
||||
# if defined __SSE3__ || (defined _MSC_VER && _MSC_VER >= 1500)
|
||||
@ -141,7 +158,15 @@
|
||||
# include <nmmintrin.h>
|
||||
# define CV_SSE4_2 1
|
||||
# endif
|
||||
# if defined __AVX__ || (defined _MSC_FULL_VER && _MSC_FULL_VER >= 160040219)
|
||||
# if defined __POPCNT__ || (defined _MSC_VER && _MSC_VER >= 1500)
|
||||
# ifdef _MSC_VER
|
||||
# include <nmmintrin.h>
|
||||
# else
|
||||
# include <popcntintrin.h>
|
||||
# endif
|
||||
# define CV_POPCNT 1
|
||||
# endif
|
||||
# if defined __AVX__ || (defined _MSC_VER && _MSC_VER >= 1600 && 0)
|
||||
// MS Visual Studio 2010 (2012?) has no macro pre-defined to identify the use of /arch:AVX
|
||||
// See: http://connect.microsoft.com/VisualStudio/feedback/details/605858/arch-avx-should-define-a-predefined-macro-in-x64-and-set-a-unique-value-for-m-ix86-fp-in-win32
|
||||
# include <immintrin.h>
|
||||
@ -152,6 +177,13 @@
|
||||
# define __xgetbv() 0
|
||||
# endif
|
||||
# endif
|
||||
# if defined __AVX2__ || (defined _MSC_VER && _MSC_VER >= 1800 && 0)
|
||||
# include <immintrin.h>
|
||||
# define CV_AVX2 1
|
||||
# if defined __FMA__
|
||||
# define CV_FMA3 1
|
||||
# endif
|
||||
# endif
|
||||
#endif
|
||||
|
||||
#if (defined WIN32 || defined _WIN32) && defined(_M_ARM)
|
||||
@ -166,6 +198,12 @@
|
||||
|
||||
#endif // __CUDACC__
|
||||
|
||||
#ifndef CV_POPCNT
|
||||
#define CV_POPCNT 0
|
||||
#endif
|
||||
#ifndef CV_MMX
|
||||
# define CV_MMX 0
|
||||
#endif
|
||||
#ifndef CV_SSE
|
||||
# define CV_SSE 0
|
||||
#endif
|
||||
@ -187,6 +225,40 @@
|
||||
#ifndef CV_AVX
|
||||
# define CV_AVX 0
|
||||
#endif
|
||||
#ifndef CV_AVX2
|
||||
# define CV_AVX2 0
|
||||
#endif
|
||||
#ifndef CV_FMA3
|
||||
# define CV_FMA3 0
|
||||
#endif
|
||||
#ifndef CV_AVX_512F
|
||||
# define CV_AVX_512F 0
|
||||
#endif
|
||||
#ifndef CV_AVX_512BW
|
||||
# define CV_AVX_512BW 0
|
||||
#endif
|
||||
#ifndef CV_AVX_512CD
|
||||
# define CV_AVX_512CD 0
|
||||
#endif
|
||||
#ifndef CV_AVX_512DQ
|
||||
# define CV_AVX_512DQ 0
|
||||
#endif
|
||||
#ifndef CV_AVX_512ER
|
||||
# define CV_AVX_512ER 0
|
||||
#endif
|
||||
#ifndef CV_AVX_512IFMA512
|
||||
# define CV_AVX_512IFMA512 0
|
||||
#endif
|
||||
#ifndef CV_AVX_512PF
|
||||
# define CV_AVX_512PF 0
|
||||
#endif
|
||||
#ifndef CV_AVX_512VBMI
|
||||
# define CV_AVX_512VBMI 0
|
||||
#endif
|
||||
#ifndef CV_AVX_512VL
|
||||
# define CV_AVX_512VL 0
|
||||
#endif
|
||||
|
||||
#ifndef CV_NEON
|
||||
# define CV_NEON 0
|
||||
#endif
|
||||
|
645
modules/core/include/opencv2/core/sse_utils.hpp
Normal file
645
modules/core/include/opencv2/core/sse_utils.hpp
Normal file
@ -0,0 +1,645 @@
|
||||
/*M///////////////////////////////////////////////////////////////////////////////////////
|
||||
//
|
||||
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
|
||||
//
|
||||
// By downloading, copying, installing or using the software you agree to this license.
|
||||
// If you do not agree to this license, do not download, install,
|
||||
// copy or use the software.
|
||||
//
|
||||
//
|
||||
// License Agreement
|
||||
// For Open Source Computer Vision Library
|
||||
//
|
||||
// Copyright (C) 2015, Itseez Inc., all rights reserved.
|
||||
// Third party copyrights are property of their respective owners.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without modification,
|
||||
// are permitted provided that the following conditions are met:
|
||||
//
|
||||
// * Redistribution's of source code must retain the above copyright notice,
|
||||
// this list of conditions and the following disclaimer.
|
||||
//
|
||||
// * Redistribution's in binary form must reproduce the above copyright notice,
|
||||
// this list of conditions and the following disclaimer in the documentation
|
||||
// and/or other materials provided with the distribution.
|
||||
//
|
||||
// * The name of the copyright holders may not be used to endorse or promote products
|
||||
// derived from this software without specific prior written permission.
|
||||
//
|
||||
// This software is provided by the copyright holders and contributors "as is" and
|
||||
// any express or implied warranties, including, but not limited to, the implied
|
||||
// warranties of merchantability and fitness for a particular purpose are disclaimed.
|
||||
// In no event shall the Intel Corporation or contributors be liable for any direct,
|
||||
// indirect, incidental, special, exemplary, or consequential damages
|
||||
// (including, but not limited to, procurement of substitute goods or services;
|
||||
// loss of use, data, or profits; or business interruption) however caused
|
||||
// and on any theory of liability, whether in contract, strict liability,
|
||||
// or tort (including negligence or otherwise) arising in any way out of
|
||||
// the use of this software, even if advised of the possibility of such damage.
|
||||
//
|
||||
//M*/
|
||||
|
||||
#ifndef __OPENCV_CORE_SSE_UTILS_HPP__
|
||||
#define __OPENCV_CORE_SSE_UTILS_HPP__
|
||||
|
||||
#ifndef __cplusplus
|
||||
# error sse_utils.hpp header must be compiled as C++
|
||||
#endif
|
||||
|
||||
#if CV_SSE2
|
||||
|
||||
inline void _mm_deinterleave_epi8(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0, __m128i & v_g1)
|
||||
{
|
||||
__m128i layer1_chunk0 = _mm_unpacklo_epi8(v_r0, v_g0);
|
||||
__m128i layer1_chunk1 = _mm_unpackhi_epi8(v_r0, v_g0);
|
||||
__m128i layer1_chunk2 = _mm_unpacklo_epi8(v_r1, v_g1);
|
||||
__m128i layer1_chunk3 = _mm_unpackhi_epi8(v_r1, v_g1);
|
||||
|
||||
__m128i layer2_chunk0 = _mm_unpacklo_epi8(layer1_chunk0, layer1_chunk2);
|
||||
__m128i layer2_chunk1 = _mm_unpackhi_epi8(layer1_chunk0, layer1_chunk2);
|
||||
__m128i layer2_chunk2 = _mm_unpacklo_epi8(layer1_chunk1, layer1_chunk3);
|
||||
__m128i layer2_chunk3 = _mm_unpackhi_epi8(layer1_chunk1, layer1_chunk3);
|
||||
|
||||
__m128i layer3_chunk0 = _mm_unpacklo_epi8(layer2_chunk0, layer2_chunk2);
|
||||
__m128i layer3_chunk1 = _mm_unpackhi_epi8(layer2_chunk0, layer2_chunk2);
|
||||
__m128i layer3_chunk2 = _mm_unpacklo_epi8(layer2_chunk1, layer2_chunk3);
|
||||
__m128i layer3_chunk3 = _mm_unpackhi_epi8(layer2_chunk1, layer2_chunk3);
|
||||
|
||||
__m128i layer4_chunk0 = _mm_unpacklo_epi8(layer3_chunk0, layer3_chunk2);
|
||||
__m128i layer4_chunk1 = _mm_unpackhi_epi8(layer3_chunk0, layer3_chunk2);
|
||||
__m128i layer4_chunk2 = _mm_unpacklo_epi8(layer3_chunk1, layer3_chunk3);
|
||||
__m128i layer4_chunk3 = _mm_unpackhi_epi8(layer3_chunk1, layer3_chunk3);
|
||||
|
||||
v_r0 = _mm_unpacklo_epi8(layer4_chunk0, layer4_chunk2);
|
||||
v_r1 = _mm_unpackhi_epi8(layer4_chunk0, layer4_chunk2);
|
||||
v_g0 = _mm_unpacklo_epi8(layer4_chunk1, layer4_chunk3);
|
||||
v_g1 = _mm_unpackhi_epi8(layer4_chunk1, layer4_chunk3);
|
||||
}
|
||||
|
||||
inline void _mm_deinterleave_epi8(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0,
|
||||
__m128i & v_g1, __m128i & v_b0, __m128i & v_b1)
|
||||
{
|
||||
__m128i layer1_chunk0 = _mm_unpacklo_epi8(v_r0, v_g1);
|
||||
__m128i layer1_chunk1 = _mm_unpackhi_epi8(v_r0, v_g1);
|
||||
__m128i layer1_chunk2 = _mm_unpacklo_epi8(v_r1, v_b0);
|
||||
__m128i layer1_chunk3 = _mm_unpackhi_epi8(v_r1, v_b0);
|
||||
__m128i layer1_chunk4 = _mm_unpacklo_epi8(v_g0, v_b1);
|
||||
__m128i layer1_chunk5 = _mm_unpackhi_epi8(v_g0, v_b1);
|
||||
|
||||
__m128i layer2_chunk0 = _mm_unpacklo_epi8(layer1_chunk0, layer1_chunk3);
|
||||
__m128i layer2_chunk1 = _mm_unpackhi_epi8(layer1_chunk0, layer1_chunk3);
|
||||
__m128i layer2_chunk2 = _mm_unpacklo_epi8(layer1_chunk1, layer1_chunk4);
|
||||
__m128i layer2_chunk3 = _mm_unpackhi_epi8(layer1_chunk1, layer1_chunk4);
|
||||
__m128i layer2_chunk4 = _mm_unpacklo_epi8(layer1_chunk2, layer1_chunk5);
|
||||
__m128i layer2_chunk5 = _mm_unpackhi_epi8(layer1_chunk2, layer1_chunk5);
|
||||
|
||||
__m128i layer3_chunk0 = _mm_unpacklo_epi8(layer2_chunk0, layer2_chunk3);
|
||||
__m128i layer3_chunk1 = _mm_unpackhi_epi8(layer2_chunk0, layer2_chunk3);
|
||||
__m128i layer3_chunk2 = _mm_unpacklo_epi8(layer2_chunk1, layer2_chunk4);
|
||||
__m128i layer3_chunk3 = _mm_unpackhi_epi8(layer2_chunk1, layer2_chunk4);
|
||||
__m128i layer3_chunk4 = _mm_unpacklo_epi8(layer2_chunk2, layer2_chunk5);
|
||||
__m128i layer3_chunk5 = _mm_unpackhi_epi8(layer2_chunk2, layer2_chunk5);
|
||||
|
||||
__m128i layer4_chunk0 = _mm_unpacklo_epi8(layer3_chunk0, layer3_chunk3);
|
||||
__m128i layer4_chunk1 = _mm_unpackhi_epi8(layer3_chunk0, layer3_chunk3);
|
||||
__m128i layer4_chunk2 = _mm_unpacklo_epi8(layer3_chunk1, layer3_chunk4);
|
||||
__m128i layer4_chunk3 = _mm_unpackhi_epi8(layer3_chunk1, layer3_chunk4);
|
||||
__m128i layer4_chunk4 = _mm_unpacklo_epi8(layer3_chunk2, layer3_chunk5);
|
||||
__m128i layer4_chunk5 = _mm_unpackhi_epi8(layer3_chunk2, layer3_chunk5);
|
||||
|
||||
v_r0 = _mm_unpacklo_epi8(layer4_chunk0, layer4_chunk3);
|
||||
v_r1 = _mm_unpackhi_epi8(layer4_chunk0, layer4_chunk3);
|
||||
v_g0 = _mm_unpacklo_epi8(layer4_chunk1, layer4_chunk4);
|
||||
v_g1 = _mm_unpackhi_epi8(layer4_chunk1, layer4_chunk4);
|
||||
v_b0 = _mm_unpacklo_epi8(layer4_chunk2, layer4_chunk5);
|
||||
v_b1 = _mm_unpackhi_epi8(layer4_chunk2, layer4_chunk5);
|
||||
}
|
||||
|
||||
inline void _mm_deinterleave_epi8(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0, __m128i & v_g1,
|
||||
__m128i & v_b0, __m128i & v_b1, __m128i & v_a0, __m128i & v_a1)
|
||||
{
|
||||
__m128i layer1_chunk0 = _mm_unpacklo_epi8(v_r0, v_b0);
|
||||
__m128i layer1_chunk1 = _mm_unpackhi_epi8(v_r0, v_b0);
|
||||
__m128i layer1_chunk2 = _mm_unpacklo_epi8(v_r1, v_b1);
|
||||
__m128i layer1_chunk3 = _mm_unpackhi_epi8(v_r1, v_b1);
|
||||
__m128i layer1_chunk4 = _mm_unpacklo_epi8(v_g0, v_a0);
|
||||
__m128i layer1_chunk5 = _mm_unpackhi_epi8(v_g0, v_a0);
|
||||
__m128i layer1_chunk6 = _mm_unpacklo_epi8(v_g1, v_a1);
|
||||
__m128i layer1_chunk7 = _mm_unpackhi_epi8(v_g1, v_a1);
|
||||
|
||||
__m128i layer2_chunk0 = _mm_unpacklo_epi8(layer1_chunk0, layer1_chunk4);
|
||||
__m128i layer2_chunk1 = _mm_unpackhi_epi8(layer1_chunk0, layer1_chunk4);
|
||||
__m128i layer2_chunk2 = _mm_unpacklo_epi8(layer1_chunk1, layer1_chunk5);
|
||||
__m128i layer2_chunk3 = _mm_unpackhi_epi8(layer1_chunk1, layer1_chunk5);
|
||||
__m128i layer2_chunk4 = _mm_unpacklo_epi8(layer1_chunk2, layer1_chunk6);
|
||||
__m128i layer2_chunk5 = _mm_unpackhi_epi8(layer1_chunk2, layer1_chunk6);
|
||||
__m128i layer2_chunk6 = _mm_unpacklo_epi8(layer1_chunk3, layer1_chunk7);
|
||||
__m128i layer2_chunk7 = _mm_unpackhi_epi8(layer1_chunk3, layer1_chunk7);
|
||||
|
||||
__m128i layer3_chunk0 = _mm_unpacklo_epi8(layer2_chunk0, layer2_chunk4);
|
||||
__m128i layer3_chunk1 = _mm_unpackhi_epi8(layer2_chunk0, layer2_chunk4);
|
||||
__m128i layer3_chunk2 = _mm_unpacklo_epi8(layer2_chunk1, layer2_chunk5);
|
||||
__m128i layer3_chunk3 = _mm_unpackhi_epi8(layer2_chunk1, layer2_chunk5);
|
||||
__m128i layer3_chunk4 = _mm_unpacklo_epi8(layer2_chunk2, layer2_chunk6);
|
||||
__m128i layer3_chunk5 = _mm_unpackhi_epi8(layer2_chunk2, layer2_chunk6);
|
||||
__m128i layer3_chunk6 = _mm_unpacklo_epi8(layer2_chunk3, layer2_chunk7);
|
||||
__m128i layer3_chunk7 = _mm_unpackhi_epi8(layer2_chunk3, layer2_chunk7);
|
||||
|
||||
__m128i layer4_chunk0 = _mm_unpacklo_epi8(layer3_chunk0, layer3_chunk4);
|
||||
__m128i layer4_chunk1 = _mm_unpackhi_epi8(layer3_chunk0, layer3_chunk4);
|
||||
__m128i layer4_chunk2 = _mm_unpacklo_epi8(layer3_chunk1, layer3_chunk5);
|
||||
__m128i layer4_chunk3 = _mm_unpackhi_epi8(layer3_chunk1, layer3_chunk5);
|
||||
__m128i layer4_chunk4 = _mm_unpacklo_epi8(layer3_chunk2, layer3_chunk6);
|
||||
__m128i layer4_chunk5 = _mm_unpackhi_epi8(layer3_chunk2, layer3_chunk6);
|
||||
__m128i layer4_chunk6 = _mm_unpacklo_epi8(layer3_chunk3, layer3_chunk7);
|
||||
__m128i layer4_chunk7 = _mm_unpackhi_epi8(layer3_chunk3, layer3_chunk7);
|
||||
|
||||
v_r0 = _mm_unpacklo_epi8(layer4_chunk0, layer4_chunk4);
|
||||
v_r1 = _mm_unpackhi_epi8(layer4_chunk0, layer4_chunk4);
|
||||
v_g0 = _mm_unpacklo_epi8(layer4_chunk1, layer4_chunk5);
|
||||
v_g1 = _mm_unpackhi_epi8(layer4_chunk1, layer4_chunk5);
|
||||
v_b0 = _mm_unpacklo_epi8(layer4_chunk2, layer4_chunk6);
|
||||
v_b1 = _mm_unpackhi_epi8(layer4_chunk2, layer4_chunk6);
|
||||
v_a0 = _mm_unpacklo_epi8(layer4_chunk3, layer4_chunk7);
|
||||
v_a1 = _mm_unpackhi_epi8(layer4_chunk3, layer4_chunk7);
|
||||
}
|
||||
|
||||
inline void _mm_interleave_epi8(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0, __m128i & v_g1)
|
||||
{
|
||||
__m128i v_mask = _mm_set1_epi16(0x00ff);
|
||||
|
||||
__m128i layer4_chunk0 = _mm_packus_epi16(_mm_and_si128(v_r0, v_mask), _mm_and_si128(v_r1, v_mask));
|
||||
__m128i layer4_chunk2 = _mm_packus_epi16(_mm_srli_epi16(v_r0, 8), _mm_srli_epi16(v_r1, 8));
|
||||
__m128i layer4_chunk1 = _mm_packus_epi16(_mm_and_si128(v_g0, v_mask), _mm_and_si128(v_g1, v_mask));
|
||||
__m128i layer4_chunk3 = _mm_packus_epi16(_mm_srli_epi16(v_g0, 8), _mm_srli_epi16(v_g1, 8));
|
||||
|
||||
__m128i layer3_chunk0 = _mm_packus_epi16(_mm_and_si128(layer4_chunk0, v_mask), _mm_and_si128(layer4_chunk1, v_mask));
|
||||
__m128i layer3_chunk2 = _mm_packus_epi16(_mm_srli_epi16(layer4_chunk0, 8), _mm_srli_epi16(layer4_chunk1, 8));
|
||||
__m128i layer3_chunk1 = _mm_packus_epi16(_mm_and_si128(layer4_chunk2, v_mask), _mm_and_si128(layer4_chunk3, v_mask));
|
||||
__m128i layer3_chunk3 = _mm_packus_epi16(_mm_srli_epi16(layer4_chunk2, 8), _mm_srli_epi16(layer4_chunk3, 8));
|
||||
|
||||
__m128i layer2_chunk0 = _mm_packus_epi16(_mm_and_si128(layer3_chunk0, v_mask), _mm_and_si128(layer3_chunk1, v_mask));
|
||||
__m128i layer2_chunk2 = _mm_packus_epi16(_mm_srli_epi16(layer3_chunk0, 8), _mm_srli_epi16(layer3_chunk1, 8));
|
||||
__m128i layer2_chunk1 = _mm_packus_epi16(_mm_and_si128(layer3_chunk2, v_mask), _mm_and_si128(layer3_chunk3, v_mask));
|
||||
__m128i layer2_chunk3 = _mm_packus_epi16(_mm_srli_epi16(layer3_chunk2, 8), _mm_srli_epi16(layer3_chunk3, 8));
|
||||
|
||||
__m128i layer1_chunk0 = _mm_packus_epi16(_mm_and_si128(layer2_chunk0, v_mask), _mm_and_si128(layer2_chunk1, v_mask));
|
||||
__m128i layer1_chunk2 = _mm_packus_epi16(_mm_srli_epi16(layer2_chunk0, 8), _mm_srli_epi16(layer2_chunk1, 8));
|
||||
__m128i layer1_chunk1 = _mm_packus_epi16(_mm_and_si128(layer2_chunk2, v_mask), _mm_and_si128(layer2_chunk3, v_mask));
|
||||
__m128i layer1_chunk3 = _mm_packus_epi16(_mm_srli_epi16(layer2_chunk2, 8), _mm_srli_epi16(layer2_chunk3, 8));
|
||||
|
||||
v_r0 = _mm_packus_epi16(_mm_and_si128(layer1_chunk0, v_mask), _mm_and_si128(layer1_chunk1, v_mask));
|
||||
v_g0 = _mm_packus_epi16(_mm_srli_epi16(layer1_chunk0, 8), _mm_srli_epi16(layer1_chunk1, 8));
|
||||
v_r1 = _mm_packus_epi16(_mm_and_si128(layer1_chunk2, v_mask), _mm_and_si128(layer1_chunk3, v_mask));
|
||||
v_g1 = _mm_packus_epi16(_mm_srli_epi16(layer1_chunk2, 8), _mm_srli_epi16(layer1_chunk3, 8));
|
||||
}
|
||||
|
||||
inline void _mm_interleave_epi8(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0,
|
||||
__m128i & v_g1, __m128i & v_b0, __m128i & v_b1)
|
||||
{
|
||||
__m128i v_mask = _mm_set1_epi16(0x00ff);
|
||||
|
||||
__m128i layer4_chunk0 = _mm_packus_epi16(_mm_and_si128(v_r0, v_mask), _mm_and_si128(v_r1, v_mask));
|
||||
__m128i layer4_chunk3 = _mm_packus_epi16(_mm_srli_epi16(v_r0, 8), _mm_srli_epi16(v_r1, 8));
|
||||
__m128i layer4_chunk1 = _mm_packus_epi16(_mm_and_si128(v_g0, v_mask), _mm_and_si128(v_g1, v_mask));
|
||||
__m128i layer4_chunk4 = _mm_packus_epi16(_mm_srli_epi16(v_g0, 8), _mm_srli_epi16(v_g1, 8));
|
||||
__m128i layer4_chunk2 = _mm_packus_epi16(_mm_and_si128(v_b0, v_mask), _mm_and_si128(v_b1, v_mask));
|
||||
__m128i layer4_chunk5 = _mm_packus_epi16(_mm_srli_epi16(v_b0, 8), _mm_srli_epi16(v_b1, 8));
|
||||
|
||||
__m128i layer3_chunk0 = _mm_packus_epi16(_mm_and_si128(layer4_chunk0, v_mask), _mm_and_si128(layer4_chunk1, v_mask));
|
||||
__m128i layer3_chunk3 = _mm_packus_epi16(_mm_srli_epi16(layer4_chunk0, 8), _mm_srli_epi16(layer4_chunk1, 8));
|
||||
__m128i layer3_chunk1 = _mm_packus_epi16(_mm_and_si128(layer4_chunk2, v_mask), _mm_and_si128(layer4_chunk3, v_mask));
|
||||
__m128i layer3_chunk4 = _mm_packus_epi16(_mm_srli_epi16(layer4_chunk2, 8), _mm_srli_epi16(layer4_chunk3, 8));
|
||||
__m128i layer3_chunk2 = _mm_packus_epi16(_mm_and_si128(layer4_chunk4, v_mask), _mm_and_si128(layer4_chunk5, v_mask));
|
||||
__m128i layer3_chunk5 = _mm_packus_epi16(_mm_srli_epi16(layer4_chunk4, 8), _mm_srli_epi16(layer4_chunk5, 8));
|
||||
|
||||
__m128i layer2_chunk0 = _mm_packus_epi16(_mm_and_si128(layer3_chunk0, v_mask), _mm_and_si128(layer3_chunk1, v_mask));
|
||||
__m128i layer2_chunk3 = _mm_packus_epi16(_mm_srli_epi16(layer3_chunk0, 8), _mm_srli_epi16(layer3_chunk1, 8));
|
||||
__m128i layer2_chunk1 = _mm_packus_epi16(_mm_and_si128(layer3_chunk2, v_mask), _mm_and_si128(layer3_chunk3, v_mask));
|
||||
__m128i layer2_chunk4 = _mm_packus_epi16(_mm_srli_epi16(layer3_chunk2, 8), _mm_srli_epi16(layer3_chunk3, 8));
|
||||
__m128i layer2_chunk2 = _mm_packus_epi16(_mm_and_si128(layer3_chunk4, v_mask), _mm_and_si128(layer3_chunk5, v_mask));
|
||||
__m128i layer2_chunk5 = _mm_packus_epi16(_mm_srli_epi16(layer3_chunk4, 8), _mm_srli_epi16(layer3_chunk5, 8));
|
||||
|
||||
__m128i layer1_chunk0 = _mm_packus_epi16(_mm_and_si128(layer2_chunk0, v_mask), _mm_and_si128(layer2_chunk1, v_mask));
|
||||
__m128i layer1_chunk3 = _mm_packus_epi16(_mm_srli_epi16(layer2_chunk0, 8), _mm_srli_epi16(layer2_chunk1, 8));
|
||||
__m128i layer1_chunk1 = _mm_packus_epi16(_mm_and_si128(layer2_chunk2, v_mask), _mm_and_si128(layer2_chunk3, v_mask));
|
||||
__m128i layer1_chunk4 = _mm_packus_epi16(_mm_srli_epi16(layer2_chunk2, 8), _mm_srli_epi16(layer2_chunk3, 8));
|
||||
__m128i layer1_chunk2 = _mm_packus_epi16(_mm_and_si128(layer2_chunk4, v_mask), _mm_and_si128(layer2_chunk5, v_mask));
|
||||
__m128i layer1_chunk5 = _mm_packus_epi16(_mm_srli_epi16(layer2_chunk4, 8), _mm_srli_epi16(layer2_chunk5, 8));
|
||||
|
||||
v_r0 = _mm_packus_epi16(_mm_and_si128(layer1_chunk0, v_mask), _mm_and_si128(layer1_chunk1, v_mask));
|
||||
v_g1 = _mm_packus_epi16(_mm_srli_epi16(layer1_chunk0, 8), _mm_srli_epi16(layer1_chunk1, 8));
|
||||
v_r1 = _mm_packus_epi16(_mm_and_si128(layer1_chunk2, v_mask), _mm_and_si128(layer1_chunk3, v_mask));
|
||||
v_b0 = _mm_packus_epi16(_mm_srli_epi16(layer1_chunk2, 8), _mm_srli_epi16(layer1_chunk3, 8));
|
||||
v_g0 = _mm_packus_epi16(_mm_and_si128(layer1_chunk4, v_mask), _mm_and_si128(layer1_chunk5, v_mask));
|
||||
v_b1 = _mm_packus_epi16(_mm_srli_epi16(layer1_chunk4, 8), _mm_srli_epi16(layer1_chunk5, 8));
|
||||
}
|
||||
|
||||
inline void _mm_interleave_epi8(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0, __m128i & v_g1,
|
||||
__m128i & v_b0, __m128i & v_b1, __m128i & v_a0, __m128i & v_a1)
|
||||
{
|
||||
__m128i v_mask = _mm_set1_epi16(0x00ff);
|
||||
|
||||
__m128i layer4_chunk0 = _mm_packus_epi16(_mm_and_si128(v_r0, v_mask), _mm_and_si128(v_r1, v_mask));
|
||||
__m128i layer4_chunk4 = _mm_packus_epi16(_mm_srli_epi16(v_r0, 8), _mm_srli_epi16(v_r1, 8));
|
||||
__m128i layer4_chunk1 = _mm_packus_epi16(_mm_and_si128(v_g0, v_mask), _mm_and_si128(v_g1, v_mask));
|
||||
__m128i layer4_chunk5 = _mm_packus_epi16(_mm_srli_epi16(v_g0, 8), _mm_srli_epi16(v_g1, 8));
|
||||
__m128i layer4_chunk2 = _mm_packus_epi16(_mm_and_si128(v_b0, v_mask), _mm_and_si128(v_b1, v_mask));
|
||||
__m128i layer4_chunk6 = _mm_packus_epi16(_mm_srli_epi16(v_b0, 8), _mm_srli_epi16(v_b1, 8));
|
||||
__m128i layer4_chunk3 = _mm_packus_epi16(_mm_and_si128(v_a0, v_mask), _mm_and_si128(v_a1, v_mask));
|
||||
__m128i layer4_chunk7 = _mm_packus_epi16(_mm_srli_epi16(v_a0, 8), _mm_srli_epi16(v_a1, 8));
|
||||
|
||||
__m128i layer3_chunk0 = _mm_packus_epi16(_mm_and_si128(layer4_chunk0, v_mask), _mm_and_si128(layer4_chunk1, v_mask));
|
||||
__m128i layer3_chunk4 = _mm_packus_epi16(_mm_srli_epi16(layer4_chunk0, 8), _mm_srli_epi16(layer4_chunk1, 8));
|
||||
__m128i layer3_chunk1 = _mm_packus_epi16(_mm_and_si128(layer4_chunk2, v_mask), _mm_and_si128(layer4_chunk3, v_mask));
|
||||
__m128i layer3_chunk5 = _mm_packus_epi16(_mm_srli_epi16(layer4_chunk2, 8), _mm_srli_epi16(layer4_chunk3, 8));
|
||||
__m128i layer3_chunk2 = _mm_packus_epi16(_mm_and_si128(layer4_chunk4, v_mask), _mm_and_si128(layer4_chunk5, v_mask));
|
||||
__m128i layer3_chunk6 = _mm_packus_epi16(_mm_srli_epi16(layer4_chunk4, 8), _mm_srli_epi16(layer4_chunk5, 8));
|
||||
__m128i layer3_chunk3 = _mm_packus_epi16(_mm_and_si128(layer4_chunk6, v_mask), _mm_and_si128(layer4_chunk7, v_mask));
|
||||
__m128i layer3_chunk7 = _mm_packus_epi16(_mm_srli_epi16(layer4_chunk6, 8), _mm_srli_epi16(layer4_chunk7, 8));
|
||||
|
||||
__m128i layer2_chunk0 = _mm_packus_epi16(_mm_and_si128(layer3_chunk0, v_mask), _mm_and_si128(layer3_chunk1, v_mask));
|
||||
__m128i layer2_chunk4 = _mm_packus_epi16(_mm_srli_epi16(layer3_chunk0, 8), _mm_srli_epi16(layer3_chunk1, 8));
|
||||
__m128i layer2_chunk1 = _mm_packus_epi16(_mm_and_si128(layer3_chunk2, v_mask), _mm_and_si128(layer3_chunk3, v_mask));
|
||||
__m128i layer2_chunk5 = _mm_packus_epi16(_mm_srli_epi16(layer3_chunk2, 8), _mm_srli_epi16(layer3_chunk3, 8));
|
||||
__m128i layer2_chunk2 = _mm_packus_epi16(_mm_and_si128(layer3_chunk4, v_mask), _mm_and_si128(layer3_chunk5, v_mask));
|
||||
__m128i layer2_chunk6 = _mm_packus_epi16(_mm_srli_epi16(layer3_chunk4, 8), _mm_srli_epi16(layer3_chunk5, 8));
|
||||
__m128i layer2_chunk3 = _mm_packus_epi16(_mm_and_si128(layer3_chunk6, v_mask), _mm_and_si128(layer3_chunk7, v_mask));
|
||||
__m128i layer2_chunk7 = _mm_packus_epi16(_mm_srli_epi16(layer3_chunk6, 8), _mm_srli_epi16(layer3_chunk7, 8));
|
||||
|
||||
__m128i layer1_chunk0 = _mm_packus_epi16(_mm_and_si128(layer2_chunk0, v_mask), _mm_and_si128(layer2_chunk1, v_mask));
|
||||
__m128i layer1_chunk4 = _mm_packus_epi16(_mm_srli_epi16(layer2_chunk0, 8), _mm_srli_epi16(layer2_chunk1, 8));
|
||||
__m128i layer1_chunk1 = _mm_packus_epi16(_mm_and_si128(layer2_chunk2, v_mask), _mm_and_si128(layer2_chunk3, v_mask));
|
||||
__m128i layer1_chunk5 = _mm_packus_epi16(_mm_srli_epi16(layer2_chunk2, 8), _mm_srli_epi16(layer2_chunk3, 8));
|
||||
__m128i layer1_chunk2 = _mm_packus_epi16(_mm_and_si128(layer2_chunk4, v_mask), _mm_and_si128(layer2_chunk5, v_mask));
|
||||
__m128i layer1_chunk6 = _mm_packus_epi16(_mm_srli_epi16(layer2_chunk4, 8), _mm_srli_epi16(layer2_chunk5, 8));
|
||||
__m128i layer1_chunk3 = _mm_packus_epi16(_mm_and_si128(layer2_chunk6, v_mask), _mm_and_si128(layer2_chunk7, v_mask));
|
||||
__m128i layer1_chunk7 = _mm_packus_epi16(_mm_srli_epi16(layer2_chunk6, 8), _mm_srli_epi16(layer2_chunk7, 8));
|
||||
|
||||
v_r0 = _mm_packus_epi16(_mm_and_si128(layer1_chunk0, v_mask), _mm_and_si128(layer1_chunk1, v_mask));
|
||||
v_b0 = _mm_packus_epi16(_mm_srli_epi16(layer1_chunk0, 8), _mm_srli_epi16(layer1_chunk1, 8));
|
||||
v_r1 = _mm_packus_epi16(_mm_and_si128(layer1_chunk2, v_mask), _mm_and_si128(layer1_chunk3, v_mask));
|
||||
v_b1 = _mm_packus_epi16(_mm_srli_epi16(layer1_chunk2, 8), _mm_srli_epi16(layer1_chunk3, 8));
|
||||
v_g0 = _mm_packus_epi16(_mm_and_si128(layer1_chunk4, v_mask), _mm_and_si128(layer1_chunk5, v_mask));
|
||||
v_a0 = _mm_packus_epi16(_mm_srli_epi16(layer1_chunk4, 8), _mm_srli_epi16(layer1_chunk5, 8));
|
||||
v_g1 = _mm_packus_epi16(_mm_and_si128(layer1_chunk6, v_mask), _mm_and_si128(layer1_chunk7, v_mask));
|
||||
v_a1 = _mm_packus_epi16(_mm_srli_epi16(layer1_chunk6, 8), _mm_srli_epi16(layer1_chunk7, 8));
|
||||
}
|
||||
|
||||
inline void _mm_deinterleave_epi16(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0, __m128i & v_g1)
|
||||
{
|
||||
__m128i layer1_chunk0 = _mm_unpacklo_epi16(v_r0, v_g0);
|
||||
__m128i layer1_chunk1 = _mm_unpackhi_epi16(v_r0, v_g0);
|
||||
__m128i layer1_chunk2 = _mm_unpacklo_epi16(v_r1, v_g1);
|
||||
__m128i layer1_chunk3 = _mm_unpackhi_epi16(v_r1, v_g1);
|
||||
|
||||
__m128i layer2_chunk0 = _mm_unpacklo_epi16(layer1_chunk0, layer1_chunk2);
|
||||
__m128i layer2_chunk1 = _mm_unpackhi_epi16(layer1_chunk0, layer1_chunk2);
|
||||
__m128i layer2_chunk2 = _mm_unpacklo_epi16(layer1_chunk1, layer1_chunk3);
|
||||
__m128i layer2_chunk3 = _mm_unpackhi_epi16(layer1_chunk1, layer1_chunk3);
|
||||
|
||||
__m128i layer3_chunk0 = _mm_unpacklo_epi16(layer2_chunk0, layer2_chunk2);
|
||||
__m128i layer3_chunk1 = _mm_unpackhi_epi16(layer2_chunk0, layer2_chunk2);
|
||||
__m128i layer3_chunk2 = _mm_unpacklo_epi16(layer2_chunk1, layer2_chunk3);
|
||||
__m128i layer3_chunk3 = _mm_unpackhi_epi16(layer2_chunk1, layer2_chunk3);
|
||||
|
||||
v_r0 = _mm_unpacklo_epi16(layer3_chunk0, layer3_chunk2);
|
||||
v_r1 = _mm_unpackhi_epi16(layer3_chunk0, layer3_chunk2);
|
||||
v_g0 = _mm_unpacklo_epi16(layer3_chunk1, layer3_chunk3);
|
||||
v_g1 = _mm_unpackhi_epi16(layer3_chunk1, layer3_chunk3);
|
||||
}
|
||||
|
||||
inline void _mm_deinterleave_epi16(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0,
|
||||
__m128i & v_g1, __m128i & v_b0, __m128i & v_b1)
|
||||
{
|
||||
__m128i layer1_chunk0 = _mm_unpacklo_epi16(v_r0, v_g1);
|
||||
__m128i layer1_chunk1 = _mm_unpackhi_epi16(v_r0, v_g1);
|
||||
__m128i layer1_chunk2 = _mm_unpacklo_epi16(v_r1, v_b0);
|
||||
__m128i layer1_chunk3 = _mm_unpackhi_epi16(v_r1, v_b0);
|
||||
__m128i layer1_chunk4 = _mm_unpacklo_epi16(v_g0, v_b1);
|
||||
__m128i layer1_chunk5 = _mm_unpackhi_epi16(v_g0, v_b1);
|
||||
|
||||
__m128i layer2_chunk0 = _mm_unpacklo_epi16(layer1_chunk0, layer1_chunk3);
|
||||
__m128i layer2_chunk1 = _mm_unpackhi_epi16(layer1_chunk0, layer1_chunk3);
|
||||
__m128i layer2_chunk2 = _mm_unpacklo_epi16(layer1_chunk1, layer1_chunk4);
|
||||
__m128i layer2_chunk3 = _mm_unpackhi_epi16(layer1_chunk1, layer1_chunk4);
|
||||
__m128i layer2_chunk4 = _mm_unpacklo_epi16(layer1_chunk2, layer1_chunk5);
|
||||
__m128i layer2_chunk5 = _mm_unpackhi_epi16(layer1_chunk2, layer1_chunk5);
|
||||
|
||||
__m128i layer3_chunk0 = _mm_unpacklo_epi16(layer2_chunk0, layer2_chunk3);
|
||||
__m128i layer3_chunk1 = _mm_unpackhi_epi16(layer2_chunk0, layer2_chunk3);
|
||||
__m128i layer3_chunk2 = _mm_unpacklo_epi16(layer2_chunk1, layer2_chunk4);
|
||||
__m128i layer3_chunk3 = _mm_unpackhi_epi16(layer2_chunk1, layer2_chunk4);
|
||||
__m128i layer3_chunk4 = _mm_unpacklo_epi16(layer2_chunk2, layer2_chunk5);
|
||||
__m128i layer3_chunk5 = _mm_unpackhi_epi16(layer2_chunk2, layer2_chunk5);
|
||||
|
||||
v_r0 = _mm_unpacklo_epi16(layer3_chunk0, layer3_chunk3);
|
||||
v_r1 = _mm_unpackhi_epi16(layer3_chunk0, layer3_chunk3);
|
||||
v_g0 = _mm_unpacklo_epi16(layer3_chunk1, layer3_chunk4);
|
||||
v_g1 = _mm_unpackhi_epi16(layer3_chunk1, layer3_chunk4);
|
||||
v_b0 = _mm_unpacklo_epi16(layer3_chunk2, layer3_chunk5);
|
||||
v_b1 = _mm_unpackhi_epi16(layer3_chunk2, layer3_chunk5);
|
||||
}
|
||||
|
||||
inline void _mm_deinterleave_epi16(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0, __m128i & v_g1,
|
||||
__m128i & v_b0, __m128i & v_b1, __m128i & v_a0, __m128i & v_a1)
|
||||
{
|
||||
__m128i layer1_chunk0 = _mm_unpacklo_epi16(v_r0, v_b0);
|
||||
__m128i layer1_chunk1 = _mm_unpackhi_epi16(v_r0, v_b0);
|
||||
__m128i layer1_chunk2 = _mm_unpacklo_epi16(v_r1, v_b1);
|
||||
__m128i layer1_chunk3 = _mm_unpackhi_epi16(v_r1, v_b1);
|
||||
__m128i layer1_chunk4 = _mm_unpacklo_epi16(v_g0, v_a0);
|
||||
__m128i layer1_chunk5 = _mm_unpackhi_epi16(v_g0, v_a0);
|
||||
__m128i layer1_chunk6 = _mm_unpacklo_epi16(v_g1, v_a1);
|
||||
__m128i layer1_chunk7 = _mm_unpackhi_epi16(v_g1, v_a1);
|
||||
|
||||
__m128i layer2_chunk0 = _mm_unpacklo_epi16(layer1_chunk0, layer1_chunk4);
|
||||
__m128i layer2_chunk1 = _mm_unpackhi_epi16(layer1_chunk0, layer1_chunk4);
|
||||
__m128i layer2_chunk2 = _mm_unpacklo_epi16(layer1_chunk1, layer1_chunk5);
|
||||
__m128i layer2_chunk3 = _mm_unpackhi_epi16(layer1_chunk1, layer1_chunk5);
|
||||
__m128i layer2_chunk4 = _mm_unpacklo_epi16(layer1_chunk2, layer1_chunk6);
|
||||
__m128i layer2_chunk5 = _mm_unpackhi_epi16(layer1_chunk2, layer1_chunk6);
|
||||
__m128i layer2_chunk6 = _mm_unpacklo_epi16(layer1_chunk3, layer1_chunk7);
|
||||
__m128i layer2_chunk7 = _mm_unpackhi_epi16(layer1_chunk3, layer1_chunk7);
|
||||
|
||||
__m128i layer3_chunk0 = _mm_unpacklo_epi16(layer2_chunk0, layer2_chunk4);
|
||||
__m128i layer3_chunk1 = _mm_unpackhi_epi16(layer2_chunk0, layer2_chunk4);
|
||||
__m128i layer3_chunk2 = _mm_unpacklo_epi16(layer2_chunk1, layer2_chunk5);
|
||||
__m128i layer3_chunk3 = _mm_unpackhi_epi16(layer2_chunk1, layer2_chunk5);
|
||||
__m128i layer3_chunk4 = _mm_unpacklo_epi16(layer2_chunk2, layer2_chunk6);
|
||||
__m128i layer3_chunk5 = _mm_unpackhi_epi16(layer2_chunk2, layer2_chunk6);
|
||||
__m128i layer3_chunk6 = _mm_unpacklo_epi16(layer2_chunk3, layer2_chunk7);
|
||||
__m128i layer3_chunk7 = _mm_unpackhi_epi16(layer2_chunk3, layer2_chunk7);
|
||||
|
||||
v_r0 = _mm_unpacklo_epi16(layer3_chunk0, layer3_chunk4);
|
||||
v_r1 = _mm_unpackhi_epi16(layer3_chunk0, layer3_chunk4);
|
||||
v_g0 = _mm_unpacklo_epi16(layer3_chunk1, layer3_chunk5);
|
||||
v_g1 = _mm_unpackhi_epi16(layer3_chunk1, layer3_chunk5);
|
||||
v_b0 = _mm_unpacklo_epi16(layer3_chunk2, layer3_chunk6);
|
||||
v_b1 = _mm_unpackhi_epi16(layer3_chunk2, layer3_chunk6);
|
||||
v_a0 = _mm_unpacklo_epi16(layer3_chunk3, layer3_chunk7);
|
||||
v_a1 = _mm_unpackhi_epi16(layer3_chunk3, layer3_chunk7);
|
||||
}
|
||||
|
||||
#if CV_SSE4_1
|
||||
|
||||
inline void _mm_interleave_epi16(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0, __m128i & v_g1)
|
||||
{
|
||||
__m128i v_mask = _mm_set1_epi32(0x0000ffff);
|
||||
|
||||
__m128i layer3_chunk0 = _mm_packus_epi32(_mm_and_si128(v_r0, v_mask), _mm_and_si128(v_r1, v_mask));
|
||||
__m128i layer3_chunk2 = _mm_packus_epi32(_mm_srli_epi32(v_r0, 16), _mm_srli_epi32(v_r1, 16));
|
||||
__m128i layer3_chunk1 = _mm_packus_epi32(_mm_and_si128(v_g0, v_mask), _mm_and_si128(v_g1, v_mask));
|
||||
__m128i layer3_chunk3 = _mm_packus_epi32(_mm_srli_epi32(v_g0, 16), _mm_srli_epi32(v_g1, 16));
|
||||
|
||||
__m128i layer2_chunk0 = _mm_packus_epi32(_mm_and_si128(layer3_chunk0, v_mask), _mm_and_si128(layer3_chunk1, v_mask));
|
||||
__m128i layer2_chunk2 = _mm_packus_epi32(_mm_srli_epi32(layer3_chunk0, 16), _mm_srli_epi32(layer3_chunk1, 16));
|
||||
__m128i layer2_chunk1 = _mm_packus_epi32(_mm_and_si128(layer3_chunk2, v_mask), _mm_and_si128(layer3_chunk3, v_mask));
|
||||
__m128i layer2_chunk3 = _mm_packus_epi32(_mm_srli_epi32(layer3_chunk2, 16), _mm_srli_epi32(layer3_chunk3, 16));
|
||||
|
||||
__m128i layer1_chunk0 = _mm_packus_epi32(_mm_and_si128(layer2_chunk0, v_mask), _mm_and_si128(layer2_chunk1, v_mask));
|
||||
__m128i layer1_chunk2 = _mm_packus_epi32(_mm_srli_epi32(layer2_chunk0, 16), _mm_srli_epi32(layer2_chunk1, 16));
|
||||
__m128i layer1_chunk1 = _mm_packus_epi32(_mm_and_si128(layer2_chunk2, v_mask), _mm_and_si128(layer2_chunk3, v_mask));
|
||||
__m128i layer1_chunk3 = _mm_packus_epi32(_mm_srli_epi32(layer2_chunk2, 16), _mm_srli_epi32(layer2_chunk3, 16));
|
||||
|
||||
v_r0 = _mm_packus_epi32(_mm_and_si128(layer1_chunk0, v_mask), _mm_and_si128(layer1_chunk1, v_mask));
|
||||
v_g0 = _mm_packus_epi32(_mm_srli_epi32(layer1_chunk0, 16), _mm_srli_epi32(layer1_chunk1, 16));
|
||||
v_r1 = _mm_packus_epi32(_mm_and_si128(layer1_chunk2, v_mask), _mm_and_si128(layer1_chunk3, v_mask));
|
||||
v_g1 = _mm_packus_epi32(_mm_srli_epi32(layer1_chunk2, 16), _mm_srli_epi32(layer1_chunk3, 16));
|
||||
}
|
||||
|
||||
inline void _mm_interleave_epi16(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0,
|
||||
__m128i & v_g1, __m128i & v_b0, __m128i & v_b1)
|
||||
{
|
||||
__m128i v_mask = _mm_set1_epi32(0x0000ffff);
|
||||
|
||||
__m128i layer3_chunk0 = _mm_packus_epi32(_mm_and_si128(v_r0, v_mask), _mm_and_si128(v_r1, v_mask));
|
||||
__m128i layer3_chunk3 = _mm_packus_epi32(_mm_srli_epi32(v_r0, 16), _mm_srli_epi32(v_r1, 16));
|
||||
__m128i layer3_chunk1 = _mm_packus_epi32(_mm_and_si128(v_g0, v_mask), _mm_and_si128(v_g1, v_mask));
|
||||
__m128i layer3_chunk4 = _mm_packus_epi32(_mm_srli_epi32(v_g0, 16), _mm_srli_epi32(v_g1, 16));
|
||||
__m128i layer3_chunk2 = _mm_packus_epi32(_mm_and_si128(v_b0, v_mask), _mm_and_si128(v_b1, v_mask));
|
||||
__m128i layer3_chunk5 = _mm_packus_epi32(_mm_srli_epi32(v_b0, 16), _mm_srli_epi32(v_b1, 16));
|
||||
|
||||
__m128i layer2_chunk0 = _mm_packus_epi32(_mm_and_si128(layer3_chunk0, v_mask), _mm_and_si128(layer3_chunk1, v_mask));
|
||||
__m128i layer2_chunk3 = _mm_packus_epi32(_mm_srli_epi32(layer3_chunk0, 16), _mm_srli_epi32(layer3_chunk1, 16));
|
||||
__m128i layer2_chunk1 = _mm_packus_epi32(_mm_and_si128(layer3_chunk2, v_mask), _mm_and_si128(layer3_chunk3, v_mask));
|
||||
__m128i layer2_chunk4 = _mm_packus_epi32(_mm_srli_epi32(layer3_chunk2, 16), _mm_srli_epi32(layer3_chunk3, 16));
|
||||
__m128i layer2_chunk2 = _mm_packus_epi32(_mm_and_si128(layer3_chunk4, v_mask), _mm_and_si128(layer3_chunk5, v_mask));
|
||||
__m128i layer2_chunk5 = _mm_packus_epi32(_mm_srli_epi32(layer3_chunk4, 16), _mm_srli_epi32(layer3_chunk5, 16));
|
||||
|
||||
__m128i layer1_chunk0 = _mm_packus_epi32(_mm_and_si128(layer2_chunk0, v_mask), _mm_and_si128(layer2_chunk1, v_mask));
|
||||
__m128i layer1_chunk3 = _mm_packus_epi32(_mm_srli_epi32(layer2_chunk0, 16), _mm_srli_epi32(layer2_chunk1, 16));
|
||||
__m128i layer1_chunk1 = _mm_packus_epi32(_mm_and_si128(layer2_chunk2, v_mask), _mm_and_si128(layer2_chunk3, v_mask));
|
||||
__m128i layer1_chunk4 = _mm_packus_epi32(_mm_srli_epi32(layer2_chunk2, 16), _mm_srli_epi32(layer2_chunk3, 16));
|
||||
__m128i layer1_chunk2 = _mm_packus_epi32(_mm_and_si128(layer2_chunk4, v_mask), _mm_and_si128(layer2_chunk5, v_mask));
|
||||
__m128i layer1_chunk5 = _mm_packus_epi32(_mm_srli_epi32(layer2_chunk4, 16), _mm_srli_epi32(layer2_chunk5, 16));
|
||||
|
||||
v_r0 = _mm_packus_epi32(_mm_and_si128(layer1_chunk0, v_mask), _mm_and_si128(layer1_chunk1, v_mask));
|
||||
v_g1 = _mm_packus_epi32(_mm_srli_epi32(layer1_chunk0, 16), _mm_srli_epi32(layer1_chunk1, 16));
|
||||
v_r1 = _mm_packus_epi32(_mm_and_si128(layer1_chunk2, v_mask), _mm_and_si128(layer1_chunk3, v_mask));
|
||||
v_b0 = _mm_packus_epi32(_mm_srli_epi32(layer1_chunk2, 16), _mm_srli_epi32(layer1_chunk3, 16));
|
||||
v_g0 = _mm_packus_epi32(_mm_and_si128(layer1_chunk4, v_mask), _mm_and_si128(layer1_chunk5, v_mask));
|
||||
v_b1 = _mm_packus_epi32(_mm_srli_epi32(layer1_chunk4, 16), _mm_srli_epi32(layer1_chunk5, 16));
|
||||
}
|
||||
|
||||
inline void _mm_interleave_epi16(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0, __m128i & v_g1,
|
||||
__m128i & v_b0, __m128i & v_b1, __m128i & v_a0, __m128i & v_a1)
|
||||
{
|
||||
__m128i v_mask = _mm_set1_epi32(0x0000ffff);
|
||||
|
||||
__m128i layer3_chunk0 = _mm_packus_epi32(_mm_and_si128(v_r0, v_mask), _mm_and_si128(v_r1, v_mask));
|
||||
__m128i layer3_chunk4 = _mm_packus_epi32(_mm_srli_epi32(v_r0, 16), _mm_srli_epi32(v_r1, 16));
|
||||
__m128i layer3_chunk1 = _mm_packus_epi32(_mm_and_si128(v_g0, v_mask), _mm_and_si128(v_g1, v_mask));
|
||||
__m128i layer3_chunk5 = _mm_packus_epi32(_mm_srli_epi32(v_g0, 16), _mm_srli_epi32(v_g1, 16));
|
||||
__m128i layer3_chunk2 = _mm_packus_epi32(_mm_and_si128(v_b0, v_mask), _mm_and_si128(v_b1, v_mask));
|
||||
__m128i layer3_chunk6 = _mm_packus_epi32(_mm_srli_epi32(v_b0, 16), _mm_srli_epi32(v_b1, 16));
|
||||
__m128i layer3_chunk3 = _mm_packus_epi32(_mm_and_si128(v_a0, v_mask), _mm_and_si128(v_a1, v_mask));
|
||||
__m128i layer3_chunk7 = _mm_packus_epi32(_mm_srli_epi32(v_a0, 16), _mm_srli_epi32(v_a1, 16));
|
||||
|
||||
__m128i layer2_chunk0 = _mm_packus_epi32(_mm_and_si128(layer3_chunk0, v_mask), _mm_and_si128(layer3_chunk1, v_mask));
|
||||
__m128i layer2_chunk4 = _mm_packus_epi32(_mm_srli_epi32(layer3_chunk0, 16), _mm_srli_epi32(layer3_chunk1, 16));
|
||||
__m128i layer2_chunk1 = _mm_packus_epi32(_mm_and_si128(layer3_chunk2, v_mask), _mm_and_si128(layer3_chunk3, v_mask));
|
||||
__m128i layer2_chunk5 = _mm_packus_epi32(_mm_srli_epi32(layer3_chunk2, 16), _mm_srli_epi32(layer3_chunk3, 16));
|
||||
__m128i layer2_chunk2 = _mm_packus_epi32(_mm_and_si128(layer3_chunk4, v_mask), _mm_and_si128(layer3_chunk5, v_mask));
|
||||
__m128i layer2_chunk6 = _mm_packus_epi32(_mm_srli_epi32(layer3_chunk4, 16), _mm_srli_epi32(layer3_chunk5, 16));
|
||||
__m128i layer2_chunk3 = _mm_packus_epi32(_mm_and_si128(layer3_chunk6, v_mask), _mm_and_si128(layer3_chunk7, v_mask));
|
||||
__m128i layer2_chunk7 = _mm_packus_epi32(_mm_srli_epi32(layer3_chunk6, 16), _mm_srli_epi32(layer3_chunk7, 16));
|
||||
|
||||
__m128i layer1_chunk0 = _mm_packus_epi32(_mm_and_si128(layer2_chunk0, v_mask), _mm_and_si128(layer2_chunk1, v_mask));
|
||||
__m128i layer1_chunk4 = _mm_packus_epi32(_mm_srli_epi32(layer2_chunk0, 16), _mm_srli_epi32(layer2_chunk1, 16));
|
||||
__m128i layer1_chunk1 = _mm_packus_epi32(_mm_and_si128(layer2_chunk2, v_mask), _mm_and_si128(layer2_chunk3, v_mask));
|
||||
__m128i layer1_chunk5 = _mm_packus_epi32(_mm_srli_epi32(layer2_chunk2, 16), _mm_srli_epi32(layer2_chunk3, 16));
|
||||
__m128i layer1_chunk2 = _mm_packus_epi32(_mm_and_si128(layer2_chunk4, v_mask), _mm_and_si128(layer2_chunk5, v_mask));
|
||||
__m128i layer1_chunk6 = _mm_packus_epi32(_mm_srli_epi32(layer2_chunk4, 16), _mm_srli_epi32(layer2_chunk5, 16));
|
||||
__m128i layer1_chunk3 = _mm_packus_epi32(_mm_and_si128(layer2_chunk6, v_mask), _mm_and_si128(layer2_chunk7, v_mask));
|
||||
__m128i layer1_chunk7 = _mm_packus_epi32(_mm_srli_epi32(layer2_chunk6, 16), _mm_srli_epi32(layer2_chunk7, 16));
|
||||
|
||||
v_r0 = _mm_packus_epi32(_mm_and_si128(layer1_chunk0, v_mask), _mm_and_si128(layer1_chunk1, v_mask));
|
||||
v_b0 = _mm_packus_epi32(_mm_srli_epi32(layer1_chunk0, 16), _mm_srli_epi32(layer1_chunk1, 16));
|
||||
v_r1 = _mm_packus_epi32(_mm_and_si128(layer1_chunk2, v_mask), _mm_and_si128(layer1_chunk3, v_mask));
|
||||
v_b1 = _mm_packus_epi32(_mm_srli_epi32(layer1_chunk2, 16), _mm_srli_epi32(layer1_chunk3, 16));
|
||||
v_g0 = _mm_packus_epi32(_mm_and_si128(layer1_chunk4, v_mask), _mm_and_si128(layer1_chunk5, v_mask));
|
||||
v_a0 = _mm_packus_epi32(_mm_srli_epi32(layer1_chunk4, 16), _mm_srli_epi32(layer1_chunk5, 16));
|
||||
v_g1 = _mm_packus_epi32(_mm_and_si128(layer1_chunk6, v_mask), _mm_and_si128(layer1_chunk7, v_mask));
|
||||
v_a1 = _mm_packus_epi32(_mm_srli_epi32(layer1_chunk6, 16), _mm_srli_epi32(layer1_chunk7, 16));
|
||||
}
|
||||
|
||||
#endif // CV_SSE4_1
|
||||
|
||||
inline void _mm_deinterleave_ps(__m128 & v_r0, __m128 & v_r1, __m128 & v_g0, __m128 & v_g1)
|
||||
{
|
||||
__m128 layer1_chunk0 = _mm_unpacklo_ps(v_r0, v_g0);
|
||||
__m128 layer1_chunk1 = _mm_unpackhi_ps(v_r0, v_g0);
|
||||
__m128 layer1_chunk2 = _mm_unpacklo_ps(v_r1, v_g1);
|
||||
__m128 layer1_chunk3 = _mm_unpackhi_ps(v_r1, v_g1);
|
||||
|
||||
__m128 layer2_chunk0 = _mm_unpacklo_ps(layer1_chunk0, layer1_chunk2);
|
||||
__m128 layer2_chunk1 = _mm_unpackhi_ps(layer1_chunk0, layer1_chunk2);
|
||||
__m128 layer2_chunk2 = _mm_unpacklo_ps(layer1_chunk1, layer1_chunk3);
|
||||
__m128 layer2_chunk3 = _mm_unpackhi_ps(layer1_chunk1, layer1_chunk3);
|
||||
|
||||
v_r0 = _mm_unpacklo_ps(layer2_chunk0, layer2_chunk2);
|
||||
v_r1 = _mm_unpackhi_ps(layer2_chunk0, layer2_chunk2);
|
||||
v_g0 = _mm_unpacklo_ps(layer2_chunk1, layer2_chunk3);
|
||||
v_g1 = _mm_unpackhi_ps(layer2_chunk1, layer2_chunk3);
|
||||
}
|
||||
|
||||
inline void _mm_deinterleave_ps(__m128 & v_r0, __m128 & v_r1, __m128 & v_g0,
|
||||
__m128 & v_g1, __m128 & v_b0, __m128 & v_b1)
|
||||
{
|
||||
__m128 layer1_chunk0 = _mm_unpacklo_ps(v_r0, v_g1);
|
||||
__m128 layer1_chunk1 = _mm_unpackhi_ps(v_r0, v_g1);
|
||||
__m128 layer1_chunk2 = _mm_unpacklo_ps(v_r1, v_b0);
|
||||
__m128 layer1_chunk3 = _mm_unpackhi_ps(v_r1, v_b0);
|
||||
__m128 layer1_chunk4 = _mm_unpacklo_ps(v_g0, v_b1);
|
||||
__m128 layer1_chunk5 = _mm_unpackhi_ps(v_g0, v_b1);
|
||||
|
||||
__m128 layer2_chunk0 = _mm_unpacklo_ps(layer1_chunk0, layer1_chunk3);
|
||||
__m128 layer2_chunk1 = _mm_unpackhi_ps(layer1_chunk0, layer1_chunk3);
|
||||
__m128 layer2_chunk2 = _mm_unpacklo_ps(layer1_chunk1, layer1_chunk4);
|
||||
__m128 layer2_chunk3 = _mm_unpackhi_ps(layer1_chunk1, layer1_chunk4);
|
||||
__m128 layer2_chunk4 = _mm_unpacklo_ps(layer1_chunk2, layer1_chunk5);
|
||||
__m128 layer2_chunk5 = _mm_unpackhi_ps(layer1_chunk2, layer1_chunk5);
|
||||
|
||||
v_r0 = _mm_unpacklo_ps(layer2_chunk0, layer2_chunk3);
|
||||
v_r1 = _mm_unpackhi_ps(layer2_chunk0, layer2_chunk3);
|
||||
v_g0 = _mm_unpacklo_ps(layer2_chunk1, layer2_chunk4);
|
||||
v_g1 = _mm_unpackhi_ps(layer2_chunk1, layer2_chunk4);
|
||||
v_b0 = _mm_unpacklo_ps(layer2_chunk2, layer2_chunk5);
|
||||
v_b1 = _mm_unpackhi_ps(layer2_chunk2, layer2_chunk5);
|
||||
}
|
||||
|
||||
inline void _mm_deinterleave_ps(__m128 & v_r0, __m128 & v_r1, __m128 & v_g0, __m128 & v_g1,
|
||||
__m128 & v_b0, __m128 & v_b1, __m128 & v_a0, __m128 & v_a1)
|
||||
{
|
||||
__m128 layer1_chunk0 = _mm_unpacklo_ps(v_r0, v_b0);
|
||||
__m128 layer1_chunk1 = _mm_unpackhi_ps(v_r0, v_b0);
|
||||
__m128 layer1_chunk2 = _mm_unpacklo_ps(v_r1, v_b1);
|
||||
__m128 layer1_chunk3 = _mm_unpackhi_ps(v_r1, v_b1);
|
||||
__m128 layer1_chunk4 = _mm_unpacklo_ps(v_g0, v_a0);
|
||||
__m128 layer1_chunk5 = _mm_unpackhi_ps(v_g0, v_a0);
|
||||
__m128 layer1_chunk6 = _mm_unpacklo_ps(v_g1, v_a1);
|
||||
__m128 layer1_chunk7 = _mm_unpackhi_ps(v_g1, v_a1);
|
||||
|
||||
__m128 layer2_chunk0 = _mm_unpacklo_ps(layer1_chunk0, layer1_chunk4);
|
||||
__m128 layer2_chunk1 = _mm_unpackhi_ps(layer1_chunk0, layer1_chunk4);
|
||||
__m128 layer2_chunk2 = _mm_unpacklo_ps(layer1_chunk1, layer1_chunk5);
|
||||
__m128 layer2_chunk3 = _mm_unpackhi_ps(layer1_chunk1, layer1_chunk5);
|
||||
__m128 layer2_chunk4 = _mm_unpacklo_ps(layer1_chunk2, layer1_chunk6);
|
||||
__m128 layer2_chunk5 = _mm_unpackhi_ps(layer1_chunk2, layer1_chunk6);
|
||||
__m128 layer2_chunk6 = _mm_unpacklo_ps(layer1_chunk3, layer1_chunk7);
|
||||
__m128 layer2_chunk7 = _mm_unpackhi_ps(layer1_chunk3, layer1_chunk7);
|
||||
|
||||
v_r0 = _mm_unpacklo_ps(layer2_chunk0, layer2_chunk4);
|
||||
v_r1 = _mm_unpackhi_ps(layer2_chunk0, layer2_chunk4);
|
||||
v_g0 = _mm_unpacklo_ps(layer2_chunk1, layer2_chunk5);
|
||||
v_g1 = _mm_unpackhi_ps(layer2_chunk1, layer2_chunk5);
|
||||
v_b0 = _mm_unpacklo_ps(layer2_chunk2, layer2_chunk6);
|
||||
v_b1 = _mm_unpackhi_ps(layer2_chunk2, layer2_chunk6);
|
||||
v_a0 = _mm_unpacklo_ps(layer2_chunk3, layer2_chunk7);
|
||||
v_a1 = _mm_unpackhi_ps(layer2_chunk3, layer2_chunk7);
|
||||
}
|
||||
|
||||
inline void _mm_interleave_ps(__m128 & v_r0, __m128 & v_r1, __m128 & v_g0, __m128 & v_g1)
|
||||
{
|
||||
const int mask_lo = _MM_SHUFFLE(2, 0, 2, 0), mask_hi = _MM_SHUFFLE(3, 1, 3, 1);
|
||||
|
||||
__m128 layer2_chunk0 = _mm_shuffle_ps(v_r0, v_r1, mask_lo);
|
||||
__m128 layer2_chunk2 = _mm_shuffle_ps(v_r0, v_r1, mask_hi);
|
||||
__m128 layer2_chunk1 = _mm_shuffle_ps(v_g0, v_g1, mask_lo);
|
||||
__m128 layer2_chunk3 = _mm_shuffle_ps(v_g0, v_g1, mask_hi);
|
||||
|
||||
__m128 layer1_chunk0 = _mm_shuffle_ps(layer2_chunk0, layer2_chunk1, mask_lo);
|
||||
__m128 layer1_chunk2 = _mm_shuffle_ps(layer2_chunk0, layer2_chunk1, mask_hi);
|
||||
__m128 layer1_chunk1 = _mm_shuffle_ps(layer2_chunk2, layer2_chunk3, mask_lo);
|
||||
__m128 layer1_chunk3 = _mm_shuffle_ps(layer2_chunk2, layer2_chunk3, mask_hi);
|
||||
|
||||
v_r0 = _mm_shuffle_ps(layer1_chunk0, layer1_chunk1, mask_lo);
|
||||
v_g0 = _mm_shuffle_ps(layer1_chunk0, layer1_chunk1, mask_hi);
|
||||
v_r1 = _mm_shuffle_ps(layer1_chunk2, layer1_chunk3, mask_lo);
|
||||
v_g1 = _mm_shuffle_ps(layer1_chunk2, layer1_chunk3, mask_hi);
|
||||
}
|
||||
|
||||
inline void _mm_interleave_ps(__m128 & v_r0, __m128 & v_r1, __m128 & v_g0,
|
||||
__m128 & v_g1, __m128 & v_b0, __m128 & v_b1)
|
||||
{
|
||||
const int mask_lo = _MM_SHUFFLE(2, 0, 2, 0), mask_hi = _MM_SHUFFLE(3, 1, 3, 1);
|
||||
|
||||
__m128 layer2_chunk0 = _mm_shuffle_ps(v_r0, v_r1, mask_lo);
|
||||
__m128 layer2_chunk3 = _mm_shuffle_ps(v_r0, v_r1, mask_hi);
|
||||
__m128 layer2_chunk1 = _mm_shuffle_ps(v_g0, v_g1, mask_lo);
|
||||
__m128 layer2_chunk4 = _mm_shuffle_ps(v_g0, v_g1, mask_hi);
|
||||
__m128 layer2_chunk2 = _mm_shuffle_ps(v_b0, v_b1, mask_lo);
|
||||
__m128 layer2_chunk5 = _mm_shuffle_ps(v_b0, v_b1, mask_hi);
|
||||
|
||||
__m128 layer1_chunk0 = _mm_shuffle_ps(layer2_chunk0, layer2_chunk1, mask_lo);
|
||||
__m128 layer1_chunk3 = _mm_shuffle_ps(layer2_chunk0, layer2_chunk1, mask_hi);
|
||||
__m128 layer1_chunk1 = _mm_shuffle_ps(layer2_chunk2, layer2_chunk3, mask_lo);
|
||||
__m128 layer1_chunk4 = _mm_shuffle_ps(layer2_chunk2, layer2_chunk3, mask_hi);
|
||||
__m128 layer1_chunk2 = _mm_shuffle_ps(layer2_chunk4, layer2_chunk5, mask_lo);
|
||||
__m128 layer1_chunk5 = _mm_shuffle_ps(layer2_chunk4, layer2_chunk5, mask_hi);
|
||||
|
||||
v_r0 = _mm_shuffle_ps(layer1_chunk0, layer1_chunk1, mask_lo);
|
||||
v_g1 = _mm_shuffle_ps(layer1_chunk0, layer1_chunk1, mask_hi);
|
||||
v_r1 = _mm_shuffle_ps(layer1_chunk2, layer1_chunk3, mask_lo);
|
||||
v_b0 = _mm_shuffle_ps(layer1_chunk2, layer1_chunk3, mask_hi);
|
||||
v_g0 = _mm_shuffle_ps(layer1_chunk4, layer1_chunk5, mask_lo);
|
||||
v_b1 = _mm_shuffle_ps(layer1_chunk4, layer1_chunk5, mask_hi);
|
||||
}
|
||||
|
||||
inline void _mm_interleave_ps(__m128 & v_r0, __m128 & v_r1, __m128 & v_g0, __m128 & v_g1,
|
||||
__m128 & v_b0, __m128 & v_b1, __m128 & v_a0, __m128 & v_a1)
|
||||
{
|
||||
const int mask_lo = _MM_SHUFFLE(2, 0, 2, 0), mask_hi = _MM_SHUFFLE(3, 1, 3, 1);
|
||||
|
||||
__m128 layer2_chunk0 = _mm_shuffle_ps(v_r0, v_r1, mask_lo);
|
||||
__m128 layer2_chunk4 = _mm_shuffle_ps(v_r0, v_r1, mask_hi);
|
||||
__m128 layer2_chunk1 = _mm_shuffle_ps(v_g0, v_g1, mask_lo);
|
||||
__m128 layer2_chunk5 = _mm_shuffle_ps(v_g0, v_g1, mask_hi);
|
||||
__m128 layer2_chunk2 = _mm_shuffle_ps(v_b0, v_b1, mask_lo);
|
||||
__m128 layer2_chunk6 = _mm_shuffle_ps(v_b0, v_b1, mask_hi);
|
||||
__m128 layer2_chunk3 = _mm_shuffle_ps(v_a0, v_a1, mask_lo);
|
||||
__m128 layer2_chunk7 = _mm_shuffle_ps(v_a0, v_a1, mask_hi);
|
||||
|
||||
__m128 layer1_chunk0 = _mm_shuffle_ps(layer2_chunk0, layer2_chunk1, mask_lo);
|
||||
__m128 layer1_chunk4 = _mm_shuffle_ps(layer2_chunk0, layer2_chunk1, mask_hi);
|
||||
__m128 layer1_chunk1 = _mm_shuffle_ps(layer2_chunk2, layer2_chunk3, mask_lo);
|
||||
__m128 layer1_chunk5 = _mm_shuffle_ps(layer2_chunk2, layer2_chunk3, mask_hi);
|
||||
__m128 layer1_chunk2 = _mm_shuffle_ps(layer2_chunk4, layer2_chunk5, mask_lo);
|
||||
__m128 layer1_chunk6 = _mm_shuffle_ps(layer2_chunk4, layer2_chunk5, mask_hi);
|
||||
__m128 layer1_chunk3 = _mm_shuffle_ps(layer2_chunk6, layer2_chunk7, mask_lo);
|
||||
__m128 layer1_chunk7 = _mm_shuffle_ps(layer2_chunk6, layer2_chunk7, mask_hi);
|
||||
|
||||
v_r0 = _mm_shuffle_ps(layer1_chunk0, layer1_chunk1, mask_lo);
|
||||
v_b0 = _mm_shuffle_ps(layer1_chunk0, layer1_chunk1, mask_hi);
|
||||
v_r1 = _mm_shuffle_ps(layer1_chunk2, layer1_chunk3, mask_lo);
|
||||
v_b1 = _mm_shuffle_ps(layer1_chunk2, layer1_chunk3, mask_hi);
|
||||
v_g0 = _mm_shuffle_ps(layer1_chunk4, layer1_chunk5, mask_lo);
|
||||
v_a0 = _mm_shuffle_ps(layer1_chunk4, layer1_chunk5, mask_hi);
|
||||
v_g1 = _mm_shuffle_ps(layer1_chunk6, layer1_chunk7, mask_lo);
|
||||
v_a1 = _mm_shuffle_ps(layer1_chunk6, layer1_chunk7, mask_hi);
|
||||
}
|
||||
|
||||
#endif // CV_SSE2
|
||||
|
||||
#endif //__OPENCV_CORE_SSE_UTILS_HPP__
|
@ -13,6 +13,7 @@
|
||||
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
|
||||
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
|
||||
// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
|
||||
// Copyright (C) 2015, Itseez Inc., all rights reserved.
|
||||
// Third party copyrights are property of their respective owners.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without modification,
|
||||
@ -281,16 +282,30 @@ CV_EXPORTS_W int64 getCPUTickCount();
|
||||
remember to keep this list identical to the one in cvdef.h
|
||||
*/
|
||||
enum CpuFeatures {
|
||||
CPU_MMX = 1,
|
||||
CPU_SSE = 2,
|
||||
CPU_SSE2 = 3,
|
||||
CPU_SSE3 = 4,
|
||||
CPU_SSSE3 = 5,
|
||||
CPU_SSE4_1 = 6,
|
||||
CPU_SSE4_2 = 7,
|
||||
CPU_POPCNT = 8,
|
||||
CPU_AVX = 10,
|
||||
CPU_NEON = 11
|
||||
CPU_MMX = 1,
|
||||
CPU_SSE = 2,
|
||||
CPU_SSE2 = 3,
|
||||
CPU_SSE3 = 4,
|
||||
CPU_SSSE3 = 5,
|
||||
CPU_SSE4_1 = 6,
|
||||
CPU_SSE4_2 = 7,
|
||||
CPU_POPCNT = 8,
|
||||
|
||||
CPU_AVX = 10,
|
||||
CPU_AVX2 = 11,
|
||||
CPU_FMA3 = 12,
|
||||
|
||||
CPU_AVX_512F = 13,
|
||||
CPU_AVX_512BW = 14,
|
||||
CPU_AVX_512CD = 15,
|
||||
CPU_AVX_512DQ = 16,
|
||||
CPU_AVX_512ER = 17,
|
||||
CPU_AVX_512IFMA512 = 18,
|
||||
CPU_AVX_512PF = 19,
|
||||
CPU_AVX_512VBMI = 20,
|
||||
CPU_AVX_512VL = 21,
|
||||
|
||||
CPU_NEON = 100
|
||||
};
|
||||
|
||||
/** @brief Returns true if the specified feature is supported by the host hardware.
|
||||
|
@ -242,3 +242,31 @@ PERF_TEST_P(Size_MatType, multiplyScale, TYPICAL_MATS_CORE_ARITHM)
|
||||
|
||||
SANITY_CHECK(c, 1e-8);
|
||||
}
|
||||
|
||||
PERF_TEST_P(Size_MatType, divide, TYPICAL_MATS_CORE_ARITHM)
|
||||
{
|
||||
Size sz = get<0>(GetParam());
|
||||
int type = get<1>(GetParam());
|
||||
cv::Mat a(sz, type), b(sz, type), c(sz, type);
|
||||
double scale = 0.5;
|
||||
|
||||
declare.in(a, b, WARMUP_RNG).out(c);
|
||||
|
||||
TEST_CYCLE() divide(a, b, c, scale);
|
||||
|
||||
SANITY_CHECK_NOTHING();
|
||||
}
|
||||
|
||||
PERF_TEST_P(Size_MatType, reciprocal, TYPICAL_MATS_CORE_ARITHM)
|
||||
{
|
||||
Size sz = get<0>(GetParam());
|
||||
int type = get<1>(GetParam());
|
||||
cv::Mat b(sz, type), c(sz, type);
|
||||
double scale = 0.5;
|
||||
|
||||
declare.in(b, WARMUP_RNG).out(c);
|
||||
|
||||
TEST_CYCLE() divide(scale, b, c);
|
||||
|
||||
SANITY_CHECK_NOTHING();
|
||||
}
|
||||
|
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@ -11,6 +11,7 @@
|
||||
//
|
||||
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
|
||||
// Copyright (C) 2009-2011, Willow Garage Inc., all rights reserved.
|
||||
// Copyright (C) 2014, Itseez Inc., all rights reserved.
|
||||
// Third party copyrights are property of their respective owners.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without modification,
|
||||
|
@ -12,6 +12,7 @@
|
||||
//
|
||||
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
|
||||
// Copyright (C) 2009-2011, Willow Garage Inc., all rights reserved.
|
||||
// Copyright (C) 2014-2015, Itseez Inc., all rights reserved.
|
||||
// Third party copyrights are property of their respective owners.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without modification,
|
||||
@ -593,14 +594,46 @@ void phase( InputArray src1, InputArray src2, OutputArray dst, bool angleInDegre
|
||||
{
|
||||
const double *x = (const double*)ptrs[0], *y = (const double*)ptrs[1];
|
||||
double *angle = (double*)ptrs[2];
|
||||
for( k = 0; k < len; k++ )
|
||||
k = 0;
|
||||
|
||||
#if CV_SSE2
|
||||
if (USE_SSE2)
|
||||
{
|
||||
for ( ; k <= len - 4; k += 4)
|
||||
{
|
||||
__m128 v_dst0 = _mm_movelh_ps(_mm_cvtpd_ps(_mm_loadu_pd(x + k)),
|
||||
_mm_cvtpd_ps(_mm_loadu_pd(x + k + 2)));
|
||||
__m128 v_dst1 = _mm_movelh_ps(_mm_cvtpd_ps(_mm_loadu_pd(y + k)),
|
||||
_mm_cvtpd_ps(_mm_loadu_pd(y + k + 2)));
|
||||
|
||||
_mm_storeu_ps(buf[0] + k, v_dst0);
|
||||
_mm_storeu_ps(buf[1] + k, v_dst1);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
for( ; k < len; k++ )
|
||||
{
|
||||
buf[0][k] = (float)x[k];
|
||||
buf[1][k] = (float)y[k];
|
||||
}
|
||||
|
||||
FastAtan2_32f( buf[1], buf[0], buf[0], len, angleInDegrees );
|
||||
for( k = 0; k < len; k++ )
|
||||
k = 0;
|
||||
|
||||
#if CV_SSE2
|
||||
if (USE_SSE2)
|
||||
{
|
||||
for ( ; k <= len - 4; k += 4)
|
||||
{
|
||||
__m128 v_src = _mm_loadu_ps(buf[0] + k);
|
||||
_mm_storeu_pd(angle + k, _mm_cvtps_pd(v_src));
|
||||
_mm_storeu_pd(angle + k + 2, _mm_cvtps_pd(_mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(v_src), 8))));
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
for( ; k < len; k++ )
|
||||
angle[k] = buf[0][k];
|
||||
}
|
||||
ptrs[0] += len*esz1;
|
||||
@ -698,14 +731,46 @@ void cartToPolar( InputArray src1, InputArray src2,
|
||||
double *angle = (double*)ptrs[3];
|
||||
|
||||
Magnitude_64f(x, y, (double*)ptrs[2], len);
|
||||
for( k = 0; k < len; k++ )
|
||||
k = 0;
|
||||
|
||||
#if CV_SSE2
|
||||
if (USE_SSE2)
|
||||
{
|
||||
for ( ; k <= len - 4; k += 4)
|
||||
{
|
||||
__m128 v_dst0 = _mm_movelh_ps(_mm_cvtpd_ps(_mm_loadu_pd(x + k)),
|
||||
_mm_cvtpd_ps(_mm_loadu_pd(x + k + 2)));
|
||||
__m128 v_dst1 = _mm_movelh_ps(_mm_cvtpd_ps(_mm_loadu_pd(y + k)),
|
||||
_mm_cvtpd_ps(_mm_loadu_pd(y + k + 2)));
|
||||
|
||||
_mm_storeu_ps(buf[0] + k, v_dst0);
|
||||
_mm_storeu_ps(buf[1] + k, v_dst1);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
for( ; k < len; k++ )
|
||||
{
|
||||
buf[0][k] = (float)x[k];
|
||||
buf[1][k] = (float)y[k];
|
||||
}
|
||||
|
||||
FastAtan2_32f( buf[1], buf[0], buf[0], len, angleInDegrees );
|
||||
for( k = 0; k < len; k++ )
|
||||
k = 0;
|
||||
|
||||
#if CV_SSE2
|
||||
if (USE_SSE2)
|
||||
{
|
||||
for ( ; k <= len - 4; k += 4)
|
||||
{
|
||||
__m128 v_src = _mm_loadu_ps(buf[0] + k);
|
||||
_mm_storeu_pd(angle + k, _mm_cvtps_pd(v_src));
|
||||
_mm_storeu_pd(angle + k + 2, _mm_cvtps_pd(_mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(v_src), 8))));
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
for( ; k < len; k++ )
|
||||
angle[k] = buf[0][k];
|
||||
}
|
||||
ptrs[0] += len*esz1;
|
||||
@ -771,14 +836,77 @@ static void SinCos_32f( const float *angle, float *sinval, float* cosval,
|
||||
/*static const double cos_a2 = 1;*/
|
||||
|
||||
double k1;
|
||||
int i;
|
||||
int i = 0;
|
||||
|
||||
if( !angle_in_degrees )
|
||||
k1 = N/(2*CV_PI);
|
||||
else
|
||||
k1 = N/360.;
|
||||
|
||||
for( i = 0; i < len; i++ )
|
||||
#if CV_AVX2
|
||||
if (USE_AVX2)
|
||||
{
|
||||
__m128d v_k1 = _mm_set1_pd(k1);
|
||||
__m128d v_1 = _mm_set1_pd(1);
|
||||
__m128i v_N1 = _mm_set1_epi32(N - 1);
|
||||
__m128i v_N4 = _mm_set1_epi32(N >> 2);
|
||||
__m128d v_sin_a0 = _mm_set1_pd(sin_a0);
|
||||
__m128d v_sin_a2 = _mm_set1_pd(sin_a2);
|
||||
__m128d v_cos_a0 = _mm_set1_pd(cos_a0);
|
||||
|
||||
for ( ; i <= len - 4; i += 4)
|
||||
{
|
||||
__m128 v_angle = _mm_loadu_ps(angle + i);
|
||||
|
||||
// 0-1
|
||||
__m128d v_t = _mm_mul_pd(_mm_cvtps_pd(v_angle), v_k1);
|
||||
__m128i v_it = _mm_cvtpd_epi32(v_t);
|
||||
v_t = _mm_sub_pd(v_t, _mm_cvtepi32_pd(v_it));
|
||||
|
||||
__m128i v_sin_idx = _mm_and_si128(v_it, v_N1);
|
||||
__m128i v_cos_idx = _mm_and_si128(_mm_sub_epi32(v_N4, v_sin_idx), v_N1);
|
||||
|
||||
__m128d v_t2 = _mm_mul_pd(v_t, v_t);
|
||||
__m128d v_sin_b = _mm_mul_pd(_mm_add_pd(_mm_mul_pd(v_sin_a0, v_t2), v_sin_a2), v_t);
|
||||
__m128d v_cos_b = _mm_add_pd(_mm_mul_pd(v_cos_a0, v_t2), v_1);
|
||||
|
||||
__m128d v_sin_a = _mm_i32gather_pd(sin_table, v_sin_idx, 8);
|
||||
__m128d v_cos_a = _mm_i32gather_pd(sin_table, v_cos_idx, 8);
|
||||
|
||||
__m128d v_sin_val_0 = _mm_add_pd(_mm_mul_pd(v_sin_a, v_cos_b),
|
||||
_mm_mul_pd(v_cos_a, v_sin_b));
|
||||
__m128d v_cos_val_0 = _mm_sub_pd(_mm_mul_pd(v_cos_a, v_cos_b),
|
||||
_mm_mul_pd(v_sin_a, v_sin_b));
|
||||
|
||||
// 2-3
|
||||
v_t = _mm_mul_pd(_mm_cvtps_pd(_mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(v_angle), 8))), v_k1);
|
||||
v_it = _mm_cvtpd_epi32(v_t);
|
||||
v_t = _mm_sub_pd(v_t, _mm_cvtepi32_pd(v_it));
|
||||
|
||||
v_sin_idx = _mm_and_si128(v_it, v_N1);
|
||||
v_cos_idx = _mm_and_si128(_mm_sub_epi32(v_N4, v_sin_idx), v_N1);
|
||||
|
||||
v_t2 = _mm_mul_pd(v_t, v_t);
|
||||
v_sin_b = _mm_mul_pd(_mm_add_pd(_mm_mul_pd(v_sin_a0, v_t2), v_sin_a2), v_t);
|
||||
v_cos_b = _mm_add_pd(_mm_mul_pd(v_cos_a0, v_t2), v_1);
|
||||
|
||||
v_sin_a = _mm_i32gather_pd(sin_table, v_sin_idx, 8);
|
||||
v_cos_a = _mm_i32gather_pd(sin_table, v_cos_idx, 8);
|
||||
|
||||
__m128d v_sin_val_1 = _mm_add_pd(_mm_mul_pd(v_sin_a, v_cos_b),
|
||||
_mm_mul_pd(v_cos_a, v_sin_b));
|
||||
__m128d v_cos_val_1 = _mm_sub_pd(_mm_mul_pd(v_cos_a, v_cos_b),
|
||||
_mm_mul_pd(v_sin_a, v_sin_b));
|
||||
|
||||
_mm_storeu_ps(sinval + i, _mm_movelh_ps(_mm_cvtpd_ps(v_sin_val_0),
|
||||
_mm_cvtpd_ps(v_sin_val_1)));
|
||||
_mm_storeu_ps(cosval + i, _mm_movelh_ps(_mm_cvtpd_ps(v_cos_val_0),
|
||||
_mm_cvtpd_ps(v_cos_val_1)));
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
for( ; i < len; i++ )
|
||||
{
|
||||
double t = angle[i]*k1;
|
||||
int it = cvRound(t);
|
||||
@ -914,6 +1042,16 @@ void polarToCart( InputArray src1, InputArray src2,
|
||||
vst1q_f32(x + k, vmulq_f32(vld1q_f32(x + k), v_m));
|
||||
vst1q_f32(y + k, vmulq_f32(vld1q_f32(y + k), v_m));
|
||||
}
|
||||
#elif CV_SSE2
|
||||
if (USE_SSE2)
|
||||
{
|
||||
for( ; k <= len - 4; k += 4 )
|
||||
{
|
||||
__m128 v_m = _mm_loadu_ps(mag + k);
|
||||
_mm_storeu_ps(x + k, _mm_mul_ps(_mm_loadu_ps(x + k), v_m));
|
||||
_mm_storeu_ps(y + k, _mm_mul_ps(_mm_loadu_ps(y + k), v_m));
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
for( ; k < len; k++ )
|
||||
@ -939,10 +1077,10 @@ void polarToCart( InputArray src1, InputArray src2,
|
||||
x[k] = buf[0][k]*m; y[k] = buf[1][k]*m;
|
||||
}
|
||||
else
|
||||
for( k = 0; k < len; k++ )
|
||||
{
|
||||
x[k] = buf[0][k]; y[k] = buf[1][k];
|
||||
}
|
||||
{
|
||||
std::memcpy(x, buf[0], sizeof(float) * len);
|
||||
std::memcpy(y, buf[1], sizeof(float) * len);
|
||||
}
|
||||
}
|
||||
|
||||
if( ptrs[0] )
|
||||
|
@ -12,6 +12,7 @@
|
||||
//
|
||||
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
|
||||
// Copyright (C) 2009-2011, Willow Garage Inc., all rights reserved.
|
||||
// Copyright (C) 2014-2015, Itseez Inc., all rights reserved.
|
||||
// Third party copyrights are property of their respective owners.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without modification,
|
||||
|
@ -192,6 +192,7 @@ struct NoVec
|
||||
extern volatile bool USE_SSE2;
|
||||
extern volatile bool USE_SSE4_2;
|
||||
extern volatile bool USE_AVX;
|
||||
extern volatile bool USE_AVX2;
|
||||
|
||||
enum { BLOCK_SIZE = 1024 };
|
||||
|
||||
|
@ -12,6 +12,7 @@
|
||||
//
|
||||
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
|
||||
// Copyright (C) 2009-2011, Willow Garage Inc., all rights reserved.
|
||||
// Copyright (C) 2014-2015, Itseez Inc., all rights reserved.
|
||||
// Third party copyrights are property of their respective owners.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without modification,
|
||||
@ -72,7 +73,114 @@ struct Sum_SIMD
|
||||
}
|
||||
};
|
||||
|
||||
#if CV_NEON
|
||||
#if CV_SSE2
|
||||
|
||||
template <>
|
||||
struct Sum_SIMD<schar, int>
|
||||
{
|
||||
int operator () (const schar * src0, const uchar * mask, int * dst, int len, int cn) const
|
||||
{
|
||||
if (mask || (cn != 1 && cn != 2 && cn != 4) || !USE_SSE2)
|
||||
return 0;
|
||||
|
||||
int x = 0;
|
||||
__m128i v_zero = _mm_setzero_si128(), v_sum = v_zero;
|
||||
|
||||
for ( ; x <= len - 16; x += 16)
|
||||
{
|
||||
__m128i v_src = _mm_loadu_si128((const __m128i *)(src0 + x));
|
||||
__m128i v_half = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, v_src), 8);
|
||||
|
||||
v_sum = _mm_add_epi32(v_sum, _mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_half), 16));
|
||||
v_sum = _mm_add_epi32(v_sum, _mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_half), 16));
|
||||
|
||||
v_half = _mm_srai_epi16(_mm_unpackhi_epi8(v_zero, v_src), 8);
|
||||
v_sum = _mm_add_epi32(v_sum, _mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_half), 16));
|
||||
v_sum = _mm_add_epi32(v_sum, _mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_half), 16));
|
||||
}
|
||||
|
||||
for ( ; x <= len - 8; x += 8)
|
||||
{
|
||||
__m128i v_src = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, _mm_loadl_epi64((__m128i const *)(src0 + x))), 8);
|
||||
|
||||
v_sum = _mm_add_epi32(v_sum, _mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src), 16));
|
||||
v_sum = _mm_add_epi32(v_sum, _mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src), 16));
|
||||
}
|
||||
|
||||
int CV_DECL_ALIGNED(16) ar[4];
|
||||
_mm_store_si128((__m128i*)ar, v_sum);
|
||||
|
||||
for (int i = 0; i < 4; i += cn)
|
||||
for (int j = 0; j < cn; ++j)
|
||||
dst[j] += ar[j + i];
|
||||
|
||||
return x / cn;
|
||||
}
|
||||
};
|
||||
|
||||
template <>
|
||||
struct Sum_SIMD<int, double>
|
||||
{
|
||||
int operator () (const int * src0, const uchar * mask, double * dst, int len, int cn) const
|
||||
{
|
||||
if (mask || (cn != 1 && cn != 2 && cn != 4) || !USE_SSE2)
|
||||
return 0;
|
||||
|
||||
int x = 0;
|
||||
__m128d v_zero = _mm_setzero_pd(), v_sum0 = v_zero, v_sum1 = v_zero;
|
||||
|
||||
for ( ; x <= len - 4; x += 4)
|
||||
{
|
||||
__m128i v_src = _mm_loadu_si128((__m128i const *)(src0 + x));
|
||||
v_sum0 = _mm_add_pd(v_sum0, _mm_cvtepi32_pd(v_src));
|
||||
v_sum1 = _mm_add_pd(v_sum1, _mm_cvtepi32_pd(_mm_srli_si128(v_src, 8)));
|
||||
}
|
||||
|
||||
double CV_DECL_ALIGNED(16) ar[4];
|
||||
_mm_store_pd(ar, v_sum0);
|
||||
_mm_store_pd(ar + 2, v_sum1);
|
||||
|
||||
for (int i = 0; i < 4; i += cn)
|
||||
for (int j = 0; j < cn; ++j)
|
||||
dst[j] += ar[j + i];
|
||||
|
||||
return x / cn;
|
||||
}
|
||||
};
|
||||
|
||||
template <>
|
||||
struct Sum_SIMD<float, double>
|
||||
{
|
||||
int operator () (const float * src0, const uchar * mask, double * dst, int len, int cn) const
|
||||
{
|
||||
if (mask || (cn != 1 && cn != 2 && cn != 4) || !USE_SSE2)
|
||||
return 0;
|
||||
|
||||
int x = 0;
|
||||
__m128d v_zero = _mm_setzero_pd(), v_sum0 = v_zero, v_sum1 = v_zero;
|
||||
|
||||
for ( ; x <= len - 4; x += 4)
|
||||
{
|
||||
__m128 v_src = _mm_loadu_ps(src0 + x);
|
||||
v_sum0 = _mm_add_pd(v_sum0, _mm_cvtps_pd(v_src));
|
||||
v_src = _mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(v_src), 8));
|
||||
v_sum1 = _mm_add_pd(v_sum1, _mm_cvtps_pd(v_src));
|
||||
}
|
||||
|
||||
double CV_DECL_ALIGNED(16) ar[4];
|
||||
_mm_store_pd(ar, v_sum0);
|
||||
_mm_store_pd(ar + 2, v_sum1);
|
||||
|
||||
for (int i = 0; i < 4; i += cn)
|
||||
for (int j = 0; j < cn; ++j)
|
||||
dst[j] += ar[j + i];
|
||||
|
||||
return x / cn;
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
#elif CV_NEON
|
||||
|
||||
template <>
|
||||
struct Sum_SIMD<uchar, int>
|
||||
@ -396,6 +504,38 @@ static int countNonZero_(const T* src, int len )
|
||||
return nz;
|
||||
}
|
||||
|
||||
#if CV_SSE2
|
||||
|
||||
static const uchar * initPopcountTable()
|
||||
{
|
||||
static uchar tab[256];
|
||||
static volatile bool initialized = false;
|
||||
if( !initialized )
|
||||
{
|
||||
// we compute inverse popcount table,
|
||||
// since we pass (img[x] == 0) mask as index in the table.
|
||||
unsigned int j = 0u;
|
||||
#if CV_POPCNT
|
||||
if (checkHardwareSupport(CV_CPU_POPCNT))
|
||||
for( ; j < 256u; j++ )
|
||||
tab[j] = (uchar)(8 - _mm_popcnt_u32(j));
|
||||
#else
|
||||
for( ; j < 256u; j++ )
|
||||
{
|
||||
int val = 0;
|
||||
for( int mask = 1; mask < 256; mask += mask )
|
||||
val += (j & mask) == 0;
|
||||
tab[j] = (uchar)val;
|
||||
}
|
||||
#endif
|
||||
initialized = true;
|
||||
}
|
||||
|
||||
return tab;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
static int countNonZero8u( const uchar* src, int len )
|
||||
{
|
||||
int i=0, nz = 0;
|
||||
@ -403,21 +543,7 @@ static int countNonZero8u( const uchar* src, int len )
|
||||
if(USE_SSE2)//5x-6x
|
||||
{
|
||||
__m128i pattern = _mm_setzero_si128 ();
|
||||
static uchar tab[256];
|
||||
static volatile bool initialized = false;
|
||||
if( !initialized )
|
||||
{
|
||||
// we compute inverse popcount table,
|
||||
// since we pass (img[x] == 0) mask as index in the table.
|
||||
for( int j = 0; j < 256; j++ )
|
||||
{
|
||||
int val = 0;
|
||||
for( int mask = 1; mask < 256; mask += mask )
|
||||
val += (j & mask) == 0;
|
||||
tab[j] = (uchar)val;
|
||||
}
|
||||
initialized = true;
|
||||
}
|
||||
static const uchar * tab = initPopcountTable();
|
||||
|
||||
for (; i<=len-16; i+=16)
|
||||
{
|
||||
@ -467,7 +593,22 @@ static int countNonZero8u( const uchar* src, int len )
|
||||
static int countNonZero16u( const ushort* src, int len )
|
||||
{
|
||||
int i = 0, nz = 0;
|
||||
#if CV_NEON
|
||||
#if CV_SSE2
|
||||
if (USE_SSE2)
|
||||
{
|
||||
__m128i v_zero = _mm_setzero_si128 ();
|
||||
static const uchar * tab = initPopcountTable();
|
||||
|
||||
for ( ; i <= len - 8; i += 8)
|
||||
{
|
||||
__m128i v_src = _mm_loadu_si128((const __m128i*)(src + i));
|
||||
int val = _mm_movemask_epi8(_mm_packs_epi16(_mm_cmpeq_epi16(v_src, v_zero), v_zero));
|
||||
nz += tab[val];
|
||||
}
|
||||
|
||||
src += i;
|
||||
}
|
||||
#elif CV_NEON
|
||||
int len0 = len & -8, blockSize1 = (1 << 15), blockSize0 = blockSize1 << 6;
|
||||
uint32x4_t v_nz = vdupq_n_u32(0u);
|
||||
uint16x8_t v_zero = vdupq_n_u16(0), v_1 = vdupq_n_u16(1);
|
||||
@ -503,7 +644,27 @@ static int countNonZero16u( const ushort* src, int len )
|
||||
static int countNonZero32s( const int* src, int len )
|
||||
{
|
||||
int i = 0, nz = 0;
|
||||
#if CV_NEON
|
||||
#if CV_SSE2
|
||||
if (USE_SSE2)
|
||||
{
|
||||
__m128i v_zero = _mm_setzero_si128 ();
|
||||
static const uchar * tab = initPopcountTable();
|
||||
|
||||
for ( ; i <= len - 8; i += 8)
|
||||
{
|
||||
__m128i v_src = _mm_loadu_si128((const __m128i*)(src + i));
|
||||
__m128i v_dst0 = _mm_cmpeq_epi32(v_src, v_zero);
|
||||
|
||||
v_src = _mm_loadu_si128((const __m128i*)(src + i + 4));
|
||||
__m128i v_dst1 = _mm_cmpeq_epi32(v_src, v_zero);
|
||||
|
||||
int val = _mm_movemask_epi8(_mm_packs_epi16(_mm_packs_epi32(v_dst0, v_dst1), v_zero));
|
||||
nz += tab[val];
|
||||
}
|
||||
|
||||
src += i;
|
||||
}
|
||||
#elif CV_NEON
|
||||
int len0 = len & -8, blockSize1 = (1 << 15), blockSize0 = blockSize1 << 6;
|
||||
uint32x4_t v_nz = vdupq_n_u32(0u);
|
||||
int32x4_t v_zero = vdupq_n_s32(0.0f);
|
||||
@ -541,7 +702,25 @@ static int countNonZero32s( const int* src, int len )
|
||||
static int countNonZero32f( const float* src, int len )
|
||||
{
|
||||
int i = 0, nz = 0;
|
||||
#if CV_NEON
|
||||
#if CV_SSE2
|
||||
if (USE_SSE2)
|
||||
{
|
||||
__m128i v_zero_i = _mm_setzero_si128();
|
||||
__m128 v_zero_f = _mm_setzero_ps();
|
||||
static const uchar * tab = initPopcountTable();
|
||||
|
||||
for ( ; i <= len - 8; i += 8)
|
||||
{
|
||||
__m128i v_dst0 = _mm_castps_si128(_mm_cmpeq_ps(_mm_loadu_ps(src + i), v_zero_f));
|
||||
__m128i v_dst1 = _mm_castps_si128(_mm_cmpeq_ps(_mm_loadu_ps(src + i + 4), v_zero_f));
|
||||
|
||||
int val = _mm_movemask_epi8(_mm_packs_epi16(_mm_packs_epi32(v_dst0, v_dst1), v_zero_i));
|
||||
nz += tab[val];
|
||||
}
|
||||
|
||||
src += i;
|
||||
}
|
||||
#elif CV_NEON
|
||||
int len0 = len & -8, blockSize1 = (1 << 15), blockSize0 = blockSize1 << 6;
|
||||
uint32x4_t v_nz = vdupq_n_u32(0u);
|
||||
float32x4_t v_zero = vdupq_n_f32(0.0f);
|
||||
@ -577,7 +756,34 @@ static int countNonZero32f( const float* src, int len )
|
||||
}
|
||||
|
||||
static int countNonZero64f( const double* src, int len )
|
||||
{ return countNonZero_(src, len); }
|
||||
{
|
||||
int i = 0, nz = 0;
|
||||
#if CV_SSE2
|
||||
if (USE_SSE2)
|
||||
{
|
||||
__m128i v_zero_i = _mm_setzero_si128();
|
||||
__m128d v_zero_d = _mm_setzero_pd();
|
||||
static const uchar * tab = initPopcountTable();
|
||||
|
||||
for ( ; i <= len - 8; i += 8)
|
||||
{
|
||||
__m128i v_dst0 = _mm_castpd_si128(_mm_cmpeq_pd(_mm_loadu_pd(src + i), v_zero_d));
|
||||
__m128i v_dst1 = _mm_castpd_si128(_mm_cmpeq_pd(_mm_loadu_pd(src + i + 2), v_zero_d));
|
||||
__m128i v_dst2 = _mm_castpd_si128(_mm_cmpeq_pd(_mm_loadu_pd(src + i + 4), v_zero_d));
|
||||
__m128i v_dst3 = _mm_castpd_si128(_mm_cmpeq_pd(_mm_loadu_pd(src + i + 6), v_zero_d));
|
||||
|
||||
v_dst0 = _mm_packs_epi32(v_dst0, v_dst1);
|
||||
v_dst1 = _mm_packs_epi32(v_dst2, v_dst3);
|
||||
|
||||
int val = _mm_movemask_epi8(_mm_packs_epi16(_mm_packs_epi32(v_dst0, v_dst1), v_zero_i));
|
||||
nz += tab[val];
|
||||
}
|
||||
|
||||
src += i;
|
||||
}
|
||||
#endif
|
||||
return nz + countNonZero_(src, len - i);
|
||||
}
|
||||
|
||||
typedef int (*CountNonZeroFunc)(const uchar*, int);
|
||||
|
||||
@ -594,6 +800,137 @@ static CountNonZeroFunc getCountNonZeroTab(int depth)
|
||||
return countNonZeroTab[depth];
|
||||
}
|
||||
|
||||
template <typename T, typename ST, typename SQT>
|
||||
struct SumSqr_SIMD
|
||||
{
|
||||
int operator () (const T *, const uchar *, ST *, SQT *, int, int) const
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
};
|
||||
|
||||
#if CV_SSE2
|
||||
|
||||
template <>
|
||||
struct SumSqr_SIMD<uchar, int, int>
|
||||
{
|
||||
int operator () (const uchar * src0, const uchar * mask, int * sum, int * sqsum, int len, int cn) const
|
||||
{
|
||||
if (mask || (cn != 1 && cn != 2) || !USE_SSE2)
|
||||
return 0;
|
||||
|
||||
int x = 0;
|
||||
__m128i v_zero = _mm_setzero_si128(), v_sum = v_zero, v_sqsum = v_zero;
|
||||
|
||||
for ( ; x <= len - 16; x += 16)
|
||||
{
|
||||
__m128i v_src = _mm_loadu_si128((const __m128i *)(src0 + x));
|
||||
__m128i v_half = _mm_unpacklo_epi8(v_src, v_zero);
|
||||
|
||||
__m128i v_mullo = _mm_mullo_epi16(v_half, v_half);
|
||||
__m128i v_mulhi = _mm_mulhi_epi16(v_half, v_half);
|
||||
v_sum = _mm_add_epi32(v_sum, _mm_unpacklo_epi16(v_half, v_zero));
|
||||
v_sum = _mm_add_epi32(v_sum, _mm_unpackhi_epi16(v_half, v_zero));
|
||||
v_sqsum = _mm_add_epi32(v_sqsum, _mm_unpacklo_epi16(v_mullo, v_mulhi));
|
||||
v_sqsum = _mm_add_epi32(v_sqsum, _mm_unpackhi_epi16(v_mullo, v_mulhi));
|
||||
|
||||
v_half = _mm_unpackhi_epi8(v_src, v_zero);
|
||||
v_mullo = _mm_mullo_epi16(v_half, v_half);
|
||||
v_mulhi = _mm_mulhi_epi16(v_half, v_half);
|
||||
v_sum = _mm_add_epi32(v_sum, _mm_unpacklo_epi16(v_half, v_zero));
|
||||
v_sum = _mm_add_epi32(v_sum, _mm_unpackhi_epi16(v_half, v_zero));
|
||||
v_sqsum = _mm_add_epi32(v_sqsum, _mm_unpacklo_epi16(v_mullo, v_mulhi));
|
||||
v_sqsum = _mm_add_epi32(v_sqsum, _mm_unpackhi_epi16(v_mullo, v_mulhi));
|
||||
}
|
||||
|
||||
for ( ; x <= len - 8; x += 8)
|
||||
{
|
||||
__m128i v_src = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i const *)(src0 + x)), v_zero);
|
||||
|
||||
__m128i v_mullo = _mm_mullo_epi16(v_src, v_src);
|
||||
__m128i v_mulhi = _mm_mulhi_epi16(v_src, v_src);
|
||||
v_sum = _mm_add_epi32(v_sum, _mm_unpacklo_epi16(v_src, v_zero));
|
||||
v_sum = _mm_add_epi32(v_sum, _mm_unpackhi_epi16(v_src, v_zero));
|
||||
v_sqsum = _mm_add_epi32(v_sqsum, _mm_unpacklo_epi16(v_mullo, v_mulhi));
|
||||
v_sqsum = _mm_add_epi32(v_sqsum, _mm_unpackhi_epi16(v_mullo, v_mulhi));
|
||||
}
|
||||
|
||||
int CV_DECL_ALIGNED(16) ar[8];
|
||||
_mm_store_si128((__m128i*)ar, v_sum);
|
||||
_mm_store_si128((__m128i*)(ar + 4), v_sqsum);
|
||||
|
||||
for (int i = 0; i < 4; i += cn)
|
||||
for (int j = 0; j < cn; ++j)
|
||||
{
|
||||
sum[j] += ar[j + i];
|
||||
sqsum[j] += ar[4 + j + i];
|
||||
}
|
||||
|
||||
return x / cn;
|
||||
}
|
||||
};
|
||||
|
||||
template <>
|
||||
struct SumSqr_SIMD<schar, int, int>
|
||||
{
|
||||
int operator () (const schar * src0, const uchar * mask, int * sum, int * sqsum, int len, int cn) const
|
||||
{
|
||||
if (mask || (cn != 1 && cn != 2) || !USE_SSE2)
|
||||
return 0;
|
||||
|
||||
int x = 0;
|
||||
__m128i v_zero = _mm_setzero_si128(), v_sum = v_zero, v_sqsum = v_zero;
|
||||
|
||||
for ( ; x <= len - 16; x += 16)
|
||||
{
|
||||
__m128i v_src = _mm_loadu_si128((const __m128i *)(src0 + x));
|
||||
__m128i v_half = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, v_src), 8);
|
||||
|
||||
__m128i v_mullo = _mm_mullo_epi16(v_half, v_half);
|
||||
__m128i v_mulhi = _mm_mulhi_epi16(v_half, v_half);
|
||||
v_sum = _mm_add_epi32(v_sum, _mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_half), 16));
|
||||
v_sum = _mm_add_epi32(v_sum, _mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_half), 16));
|
||||
v_sqsum = _mm_add_epi32(v_sqsum, _mm_unpacklo_epi16(v_mullo, v_mulhi));
|
||||
v_sqsum = _mm_add_epi32(v_sqsum, _mm_unpackhi_epi16(v_mullo, v_mulhi));
|
||||
|
||||
v_half = _mm_srai_epi16(_mm_unpackhi_epi8(v_zero, v_src), 8);
|
||||
v_mullo = _mm_mullo_epi16(v_half, v_half);
|
||||
v_mulhi = _mm_mulhi_epi16(v_half, v_half);
|
||||
v_sum = _mm_add_epi32(v_sum, _mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_half), 16));
|
||||
v_sum = _mm_add_epi32(v_sum, _mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_half), 16));
|
||||
v_sqsum = _mm_add_epi32(v_sqsum, _mm_unpacklo_epi16(v_mullo, v_mulhi));
|
||||
v_sqsum = _mm_add_epi32(v_sqsum, _mm_unpackhi_epi16(v_mullo, v_mulhi));
|
||||
}
|
||||
|
||||
for ( ; x <= len - 8; x += 8)
|
||||
{
|
||||
__m128i v_src = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, _mm_loadl_epi64((__m128i const *)(src0 + x))), 8);
|
||||
|
||||
__m128i v_mullo = _mm_mullo_epi16(v_src, v_src);
|
||||
__m128i v_mulhi = _mm_mulhi_epi16(v_src, v_src);
|
||||
v_sum = _mm_add_epi32(v_sum, _mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src), 16));
|
||||
v_sum = _mm_add_epi32(v_sum, _mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src), 16));
|
||||
v_sqsum = _mm_add_epi32(v_sqsum, _mm_unpacklo_epi16(v_mullo, v_mulhi));
|
||||
v_sqsum = _mm_add_epi32(v_sqsum, _mm_unpackhi_epi16(v_mullo, v_mulhi));
|
||||
}
|
||||
|
||||
int CV_DECL_ALIGNED(16) ar[8];
|
||||
_mm_store_si128((__m128i*)ar, v_sum);
|
||||
_mm_store_si128((__m128i*)(ar + 4), v_sqsum);
|
||||
|
||||
for (int i = 0; i < 4; i += cn)
|
||||
for (int j = 0; j < cn; ++j)
|
||||
{
|
||||
sum[j] += ar[j + i];
|
||||
sqsum[j] += ar[4 + j + i];
|
||||
}
|
||||
|
||||
return x / cn;
|
||||
}
|
||||
};
|
||||
|
||||
#endif
|
||||
|
||||
template<typename T, typename ST, typename SQT>
|
||||
static int sumsqr_(const T* src0, const uchar* mask, ST* sum, SQT* sqsum, int len, int cn )
|
||||
{
|
||||
@ -601,14 +938,15 @@ static int sumsqr_(const T* src0, const uchar* mask, ST* sum, SQT* sqsum, int le
|
||||
|
||||
if( !mask )
|
||||
{
|
||||
int i;
|
||||
int k = cn % 4;
|
||||
SumSqr_SIMD<T, ST, SQT> vop;
|
||||
int i = vop(src0, mask, sum, sqsum, len, cn), k = cn % 4;
|
||||
src += i * cn;
|
||||
|
||||
if( k == 1 )
|
||||
{
|
||||
ST s0 = sum[0];
|
||||
SQT sq0 = sqsum[0];
|
||||
for( i = 0; i < len; i++, src += cn )
|
||||
for( ; i < len; i++, src += cn )
|
||||
{
|
||||
T v = src[0];
|
||||
s0 += v; sq0 += (SQT)v*v;
|
||||
@ -620,7 +958,7 @@ static int sumsqr_(const T* src0, const uchar* mask, ST* sum, SQT* sqsum, int le
|
||||
{
|
||||
ST s0 = sum[0], s1 = sum[1];
|
||||
SQT sq0 = sqsum[0], sq1 = sqsum[1];
|
||||
for( i = 0; i < len; i++, src += cn )
|
||||
for( ; i < len; i++, src += cn )
|
||||
{
|
||||
T v0 = src[0], v1 = src[1];
|
||||
s0 += v0; sq0 += (SQT)v0*v0;
|
||||
@ -633,7 +971,7 @@ static int sumsqr_(const T* src0, const uchar* mask, ST* sum, SQT* sqsum, int le
|
||||
{
|
||||
ST s0 = sum[0], s1 = sum[1], s2 = sum[2];
|
||||
SQT sq0 = sqsum[0], sq1 = sqsum[1], sq2 = sqsum[2];
|
||||
for( i = 0; i < len; i++, src += cn )
|
||||
for( ; i < len; i++, src += cn )
|
||||
{
|
||||
T v0 = src[0], v1 = src[1], v2 = src[2];
|
||||
s0 += v0; sq0 += (SQT)v0*v0;
|
||||
@ -649,7 +987,7 @@ static int sumsqr_(const T* src0, const uchar* mask, ST* sum, SQT* sqsum, int le
|
||||
src = src0 + k;
|
||||
ST s0 = sum[k], s1 = sum[k+1], s2 = sum[k+2], s3 = sum[k+3];
|
||||
SQT sq0 = sqsum[k], sq1 = sqsum[k+1], sq2 = sqsum[k+2], sq3 = sqsum[k+3];
|
||||
for( i = 0; i < len; i++, src += cn )
|
||||
for( ; i < len; i++, src += cn )
|
||||
{
|
||||
T v0, v1;
|
||||
v0 = src[0], v1 = src[1];
|
||||
@ -924,7 +1262,6 @@ cv::Scalar cv::sum( InputArray _src )
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
SumFunc func = getSumFunc(depth);
|
||||
|
||||
CV_Assert( cn <= 4 && func != 0 );
|
||||
|
@ -12,6 +12,7 @@
|
||||
//
|
||||
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
|
||||
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
|
||||
// Copyright (C) 2015, Itseez Inc., all rights reserved.
|
||||
// Third party copyrights are property of their respective owners.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without modification,
|
||||
@ -89,6 +90,22 @@
|
||||
pop ebx
|
||||
}
|
||||
}
|
||||
static void __cpuidex(int* cpuid_data, int, int)
|
||||
{
|
||||
__asm
|
||||
{
|
||||
push edi
|
||||
mov edi, cpuid_data
|
||||
mov eax, 7
|
||||
mov ecx, 0
|
||||
cpuid
|
||||
mov [edi], eax
|
||||
mov [edi + 4], ebx
|
||||
mov [edi + 8], ecx
|
||||
mov [edi + 12], edx
|
||||
pop edi
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#endif
|
||||
|
||||
@ -208,7 +225,7 @@ struct HWFeatures
|
||||
enum { MAX_FEATURE = CV_HARDWARE_MAX_FEATURE };
|
||||
|
||||
HWFeatures(void)
|
||||
{
|
||||
{
|
||||
memset( have, 0, sizeof(have) );
|
||||
x86_family = 0;
|
||||
}
|
||||
@ -252,10 +269,54 @@ struct HWFeatures
|
||||
f.have[CV_CPU_SSE2] = (cpuid_data[3] & (1<<26)) != 0;
|
||||
f.have[CV_CPU_SSE3] = (cpuid_data[2] & (1<<0)) != 0;
|
||||
f.have[CV_CPU_SSSE3] = (cpuid_data[2] & (1<<9)) != 0;
|
||||
f.have[CV_CPU_FMA3] = (cpuid_data[2] & (1<<12)) != 0;
|
||||
f.have[CV_CPU_SSE4_1] = (cpuid_data[2] & (1<<19)) != 0;
|
||||
f.have[CV_CPU_SSE4_2] = (cpuid_data[2] & (1<<20)) != 0;
|
||||
f.have[CV_CPU_POPCNT] = (cpuid_data[2] & (1<<23)) != 0;
|
||||
f.have[CV_CPU_AVX] = (((cpuid_data[2] & (1<<28)) != 0)&&((cpuid_data[2] & (1<<27)) != 0));//OS uses XSAVE_XRSTORE and CPU support AVX
|
||||
|
||||
// make the second call to the cpuid command in order to get
|
||||
// information about extended features like AVX2
|
||||
#if defined _MSC_VER && (defined _M_IX86 || defined _M_X64)
|
||||
__cpuidex(cpuid_data, 7, 0);
|
||||
#elif defined __GNUC__ && (defined __i386__ || defined __x86_64__)
|
||||
#ifdef __x86_64__
|
||||
asm __volatile__
|
||||
(
|
||||
"movl $7, %%eax\n\t"
|
||||
"movl $0, %%ecx\n\t"
|
||||
"cpuid\n\t"
|
||||
:[eax]"=a"(cpuid_data[0]),[ebx]"=b"(cpuid_data[1]),[ecx]"=c"(cpuid_data[2]),[edx]"=d"(cpuid_data[3])
|
||||
:
|
||||
: "cc"
|
||||
);
|
||||
#else
|
||||
asm volatile
|
||||
(
|
||||
"pushl %%eax\n\t"
|
||||
"pushl %%edx\n\t"
|
||||
"movl $7,%%eax\n\t"
|
||||
"movl $0,%%ecx\n\t"
|
||||
"cpuid\n\t"
|
||||
"popl %%edx\n\t"
|
||||
"popl %%eax\n\t"
|
||||
: "=b"(cpuid_data[1]), "=c"(cpuid_data[2])
|
||||
:
|
||||
: "cc"
|
||||
);
|
||||
#endif
|
||||
#endif
|
||||
f.have[CV_CPU_AVX2] = (cpuid_data[1] & (1<<5)) != 0;
|
||||
|
||||
f.have[CV_CPU_AVX_512F] = (cpuid_data[1] & (1<<16)) != 0;
|
||||
f.have[CV_CPU_AVX_512DQ] = (cpuid_data[1] & (1<<17)) != 0;
|
||||
f.have[CV_CPU_AVX_512IFMA512] = (cpuid_data[1] & (1<<21)) != 0;
|
||||
f.have[CV_CPU_AVX_512PF] = (cpuid_data[1] & (1<<26)) != 0;
|
||||
f.have[CV_CPU_AVX_512ER] = (cpuid_data[1] & (1<<27)) != 0;
|
||||
f.have[CV_CPU_AVX_512CD] = (cpuid_data[1] & (1<<28)) != 0;
|
||||
f.have[CV_CPU_AVX_512BW] = (cpuid_data[1] & (1<<30)) != 0;
|
||||
f.have[CV_CPU_AVX_512VL] = (cpuid_data[1] & (1<<31)) != 0;
|
||||
f.have[CV_CPU_AVX_512VBMI] = (cpuid_data[2] & (1<<1)) != 0;
|
||||
}
|
||||
|
||||
#if defined ANDROID || defined __linux__
|
||||
@ -318,6 +379,7 @@ IPPInitializer ippInitializer;
|
||||
volatile bool USE_SSE2 = featuresEnabled.have[CV_CPU_SSE2];
|
||||
volatile bool USE_SSE4_2 = featuresEnabled.have[CV_CPU_SSE4_2];
|
||||
volatile bool USE_AVX = featuresEnabled.have[CV_CPU_AVX];
|
||||
volatile bool USE_AVX2 = featuresEnabled.have[CV_CPU_AVX2];
|
||||
|
||||
void setUseOptimized( bool flag )
|
||||
{
|
||||
|
@ -10,8 +10,7 @@
|
||||
// License Agreement
|
||||
// For Open Source Computer Vision Library
|
||||
//
|
||||
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
|
||||
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
|
||||
// Copyright (C) 2014, Itseez Inc., all rights reserved.
|
||||
// Third party copyrights are property of their respective owners.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without modification,
|
||||
|
@ -1577,7 +1577,7 @@ PARAM_TEST_CASE(ConvertScaleAbs, MatDepth, Channels, bool)
|
||||
|
||||
Size roiSize = randomSize(1, MAX_VALUE);
|
||||
Border srcBorder = randomBorder(0, use_roi ? MAX_VALUE : 0);
|
||||
randomSubMat(src, src_roi, roiSize, srcBorder, stype, 2, 11); // FIXIT: Test with minV, maxV
|
||||
randomSubMat(src, src_roi, roiSize, srcBorder, stype, -11, 11); // FIXIT: Test with minV, maxV
|
||||
|
||||
Border dstBorder = randomBorder(0, use_roi ? MAX_VALUE : 0);
|
||||
randomSubMat(dst, dst_roi, roiSize, dstBorder, dtype, 5, 16);
|
||||
|
@ -12,6 +12,7 @@
|
||||
//
|
||||
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
|
||||
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
|
||||
// Copyright (C) 2014, Itseez Inc., all rights reserved.
|
||||
// Third party copyrights are property of their respective owners.
|
||||
/
|
||||
// Redistribution and use in source and binary forms, with or without modification,
|
||||
|
@ -11,6 +11,7 @@
|
||||
// For Open Source Computer Vision Library
|
||||
//
|
||||
// Copyright (C) 2000, Intel Corporation, all rights reserved.
|
||||
// Copyright (C) 2014, Itseez Inc., all rights reserved.
|
||||
// Third party copyrights are property of their respective owners.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without modification,
|
||||
|
@ -11,6 +11,7 @@
|
||||
// For Open Source Computer Vision Library
|
||||
//
|
||||
// Copyright (C) 2013, NVIDIA Corporation, all rights reserved.
|
||||
// Copyright (C) 2014, Itseez Inc., all rights reserved.
|
||||
// Third party copyrights are property of their respective owners.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without modification,
|
||||
|
File diff suppressed because it is too large
Load Diff
@ -12,6 +12,7 @@
|
||||
//
|
||||
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
|
||||
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
|
||||
// Copyright (C) 2014-2015, Itseez Inc., all rights reserved.
|
||||
// Third party copyrights are property of their respective owners.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without modification,
|
||||
@ -270,6 +271,8 @@ cornerEigenValsVecs( const Mat& src, Mat& eigenv, int block_size,
|
||||
#ifdef HAVE_TEGRA_OPTIMIZATION
|
||||
if (tegra::cornerEigenValsVecs(src, eigenv, block_size, aperture_size, op_type, k, borderType))
|
||||
return;
|
||||
#elif CV_SSE2
|
||||
bool haveSSE2 = checkHardwareSupport(CV_CPU_SSE2);
|
||||
#endif
|
||||
|
||||
int depth = src.depth();
|
||||
@ -318,6 +321,33 @@ cornerEigenValsVecs( const Mat& src, Mat& eigenv, int block_size,
|
||||
|
||||
vst3q_f32(cov_data + j * 3, v_dst);
|
||||
}
|
||||
#elif CV_SSE2
|
||||
if (haveSSE2)
|
||||
{
|
||||
for( ; j <= size.width - 8; j += 8 )
|
||||
{
|
||||
__m128 v_dx_0 = _mm_loadu_ps(dxdata + j);
|
||||
__m128 v_dx_1 = _mm_loadu_ps(dxdata + j + 4);
|
||||
__m128 v_dy_0 = _mm_loadu_ps(dydata + j);
|
||||
__m128 v_dy_1 = _mm_loadu_ps(dydata + j + 4);
|
||||
|
||||
__m128 v_dx2_0 = _mm_mul_ps(v_dx_0, v_dx_0);
|
||||
__m128 v_dxy_0 = _mm_mul_ps(v_dx_0, v_dy_0);
|
||||
__m128 v_dy2_0 = _mm_mul_ps(v_dy_0, v_dy_0);
|
||||
__m128 v_dx2_1 = _mm_mul_ps(v_dx_1, v_dx_1);
|
||||
__m128 v_dxy_1 = _mm_mul_ps(v_dx_1, v_dy_1);
|
||||
__m128 v_dy2_1 = _mm_mul_ps(v_dy_1, v_dy_1);
|
||||
|
||||
_mm_interleave_ps(v_dx2_0, v_dx2_1, v_dxy_0, v_dxy_1, v_dy2_0, v_dy2_1);
|
||||
|
||||
_mm_storeu_ps(cov_data + j * 3, v_dx2_0);
|
||||
_mm_storeu_ps(cov_data + j * 3 + 4, v_dx2_1);
|
||||
_mm_storeu_ps(cov_data + j * 3 + 8, v_dxy_0);
|
||||
_mm_storeu_ps(cov_data + j * 3 + 12, v_dxy_1);
|
||||
_mm_storeu_ps(cov_data + j * 3 + 16, v_dy2_0);
|
||||
_mm_storeu_ps(cov_data + j * 3 + 20, v_dy2_1);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
for( ; j < size.width; j++ )
|
||||
|
@ -12,6 +12,7 @@
|
||||
//
|
||||
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
|
||||
// Copyright (C) 2009-2010, Willow Garage Inc., all rights reserved.
|
||||
// Copyright (C) 2014, Itseez Inc., all rights reserved.
|
||||
// Third party copyrights are property of their respective owners.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without modification,
|
||||
|
@ -2284,15 +2284,20 @@ double cv::compareHist( InputArray _H1, InputArray _H2, int method )
|
||||
|
||||
CV_Assert( it.planes[0].isContinuous() && it.planes[1].isContinuous() );
|
||||
|
||||
#if CV_SSE2
|
||||
bool haveSIMD = checkHardwareSupport(CV_CPU_SSE2);
|
||||
#endif
|
||||
|
||||
for( size_t i = 0; i < it.nplanes; i++, ++it )
|
||||
{
|
||||
const float* h1 = it.planes[0].ptr<float>();
|
||||
const float* h2 = it.planes[1].ptr<float>();
|
||||
len = it.planes[0].rows*it.planes[0].cols*H1.channels();
|
||||
j = 0;
|
||||
|
||||
if( (method == CV_COMP_CHISQR) || (method == CV_COMP_CHISQR_ALT))
|
||||
{
|
||||
for( j = 0; j < len; j++ )
|
||||
for( ; j < len; j++ )
|
||||
{
|
||||
double a = h1[j] - h2[j];
|
||||
double b = (method == CV_COMP_CHISQR) ? h1[j] : h1[j] + h2[j];
|
||||
@ -2302,7 +2307,51 @@ double cv::compareHist( InputArray _H1, InputArray _H2, int method )
|
||||
}
|
||||
else if( method == CV_COMP_CORREL )
|
||||
{
|
||||
for( j = 0; j < len; j++ )
|
||||
#if CV_SSE2
|
||||
if (haveSIMD)
|
||||
{
|
||||
__m128d v_s1 = _mm_setzero_pd(), v_s2 = v_s1;
|
||||
__m128d v_s11 = v_s1, v_s22 = v_s1, v_s12 = v_s1;
|
||||
|
||||
for ( ; j <= len - 4; j += 4)
|
||||
{
|
||||
__m128 v_a = _mm_loadu_ps(h1 + j);
|
||||
__m128 v_b = _mm_loadu_ps(h2 + j);
|
||||
|
||||
// 0-1
|
||||
__m128d v_ad = _mm_cvtps_pd(v_a);
|
||||
__m128d v_bd = _mm_cvtps_pd(v_b);
|
||||
v_s12 = _mm_add_pd(v_s12, _mm_mul_pd(v_ad, v_bd));
|
||||
v_s11 = _mm_add_pd(v_s11, _mm_mul_pd(v_ad, v_ad));
|
||||
v_s22 = _mm_add_pd(v_s22, _mm_mul_pd(v_bd, v_bd));
|
||||
v_s1 = _mm_add_pd(v_s1, v_ad);
|
||||
v_s2 = _mm_add_pd(v_s2, v_bd);
|
||||
|
||||
// 2-3
|
||||
v_ad = _mm_cvtps_pd(_mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(v_a), 8)));
|
||||
v_bd = _mm_cvtps_pd(_mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(v_b), 8)));
|
||||
v_s12 = _mm_add_pd(v_s12, _mm_mul_pd(v_ad, v_bd));
|
||||
v_s11 = _mm_add_pd(v_s11, _mm_mul_pd(v_ad, v_ad));
|
||||
v_s22 = _mm_add_pd(v_s22, _mm_mul_pd(v_bd, v_bd));
|
||||
v_s1 = _mm_add_pd(v_s1, v_ad);
|
||||
v_s2 = _mm_add_pd(v_s2, v_bd);
|
||||
}
|
||||
|
||||
double CV_DECL_ALIGNED(16) ar[10];
|
||||
_mm_store_pd(ar, v_s12);
|
||||
_mm_store_pd(ar + 2, v_s11);
|
||||
_mm_store_pd(ar + 4, v_s22);
|
||||
_mm_store_pd(ar + 6, v_s1);
|
||||
_mm_store_pd(ar + 8, v_s2);
|
||||
|
||||
s12 += ar[0] + ar[1];
|
||||
s11 += ar[2] + ar[3];
|
||||
s22 += ar[4] + ar[5];
|
||||
s1 += ar[6] + ar[7];
|
||||
s2 += ar[8] + ar[9];
|
||||
}
|
||||
#endif
|
||||
for( ; j < len; j++ )
|
||||
{
|
||||
double a = h1[j];
|
||||
double b = h2[j];
|
||||
@ -2316,7 +2365,6 @@ double cv::compareHist( InputArray _H1, InputArray _H2, int method )
|
||||
}
|
||||
else if( method == CV_COMP_INTERSECT )
|
||||
{
|
||||
j = 0;
|
||||
#if CV_NEON
|
||||
float32x4_t v_result = vdupq_n_f32(0.0f);
|
||||
for( ; j <= len - 4; j += 4 )
|
||||
@ -2324,13 +2372,61 @@ double cv::compareHist( InputArray _H1, InputArray _H2, int method )
|
||||
float CV_DECL_ALIGNED(16) ar[4];
|
||||
vst1q_f32(ar, v_result);
|
||||
result += ar[0] + ar[1] + ar[2] + ar[3];
|
||||
#elif CV_SSE2
|
||||
if (haveSIMD)
|
||||
{
|
||||
__m128d v_result = _mm_setzero_pd();
|
||||
for ( ; j <= len - 4; j += 4)
|
||||
{
|
||||
__m128 v_src = _mm_min_ps(_mm_loadu_ps(h1 + j),
|
||||
_mm_loadu_ps(h2 + j));
|
||||
v_result = _mm_add_pd(v_result, _mm_cvtps_pd(v_src));
|
||||
v_src = _mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(v_src), 8));
|
||||
v_result = _mm_add_pd(v_result, _mm_cvtps_pd(v_src));
|
||||
}
|
||||
|
||||
double CV_DECL_ALIGNED(16) ar[2];
|
||||
_mm_store_pd(ar, v_result);
|
||||
result += ar[0] + ar[1];
|
||||
}
|
||||
#endif
|
||||
for( ; j < len; j++ )
|
||||
result += std::min(h1[j], h2[j]);
|
||||
}
|
||||
else if( method == CV_COMP_BHATTACHARYYA )
|
||||
{
|
||||
for( j = 0; j < len; j++ )
|
||||
#if CV_SSE2
|
||||
if (haveSIMD)
|
||||
{
|
||||
__m128d v_s1 = _mm_setzero_pd(), v_s2 = v_s1, v_result = v_s1;
|
||||
for ( ; j <= len - 4; j += 4)
|
||||
{
|
||||
__m128 v_a = _mm_loadu_ps(h1 + j);
|
||||
__m128 v_b = _mm_loadu_ps(h2 + j);
|
||||
|
||||
__m128d v_ad = _mm_cvtps_pd(v_a);
|
||||
__m128d v_bd = _mm_cvtps_pd(v_b);
|
||||
v_s1 = _mm_add_pd(v_s1, v_ad);
|
||||
v_s2 = _mm_add_pd(v_s2, v_bd);
|
||||
v_result = _mm_add_pd(v_result, _mm_sqrt_pd(_mm_mul_pd(v_ad, v_bd)));
|
||||
|
||||
v_ad = _mm_cvtps_pd(_mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(v_a), 8)));
|
||||
v_bd = _mm_cvtps_pd(_mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(v_b), 8)));
|
||||
v_s1 = _mm_add_pd(v_s1, v_ad);
|
||||
v_s2 = _mm_add_pd(v_s2, v_bd);
|
||||
v_result = _mm_add_pd(v_result, _mm_sqrt_pd(_mm_mul_pd(v_ad, v_bd)));
|
||||
}
|
||||
|
||||
double CV_DECL_ALIGNED(16) ar[6];
|
||||
_mm_store_pd(ar, v_s1);
|
||||
_mm_store_pd(ar + 2, v_s2);
|
||||
_mm_store_pd(ar + 4, v_result);
|
||||
s1 += ar[0] + ar[1];
|
||||
s2 += ar[2] + ar[3];
|
||||
result += ar[4] + ar[5];
|
||||
}
|
||||
#endif
|
||||
for( ; j < len; j++ )
|
||||
{
|
||||
double a = h1[j];
|
||||
double b = h2[j];
|
||||
@ -2341,7 +2437,7 @@ double cv::compareHist( InputArray _H1, InputArray _H2, int method )
|
||||
}
|
||||
else if( method == CV_COMP_KL_DIV )
|
||||
{
|
||||
for( j = 0; j < len; j++ )
|
||||
for( ; j < len; j++ )
|
||||
{
|
||||
double p = h1[j];
|
||||
double q = h2[j];
|
||||
|
@ -12,6 +12,7 @@
|
||||
//
|
||||
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
|
||||
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
|
||||
// Copyright (C) 2014-2015, Itseez Inc., all rights reserved.
|
||||
// Third party copyrights are property of their respective owners.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without modification,
|
||||
@ -1962,9 +1963,9 @@ private:
|
||||
struct ResizeAreaFastVec_SIMD_32f
|
||||
{
|
||||
ResizeAreaFastVec_SIMD_32f(int _scale_x, int _scale_y, int _cn, int _step) :
|
||||
scale_x(_scale_x), scale_y(_scale_y), cn(_cn), step(_step)
|
||||
cn(_cn), step(_step)
|
||||
{
|
||||
fast_mode = scale_x == 2 && scale_y == 2 && (cn == 1 || cn == 3 || cn == 4);
|
||||
fast_mode = _scale_x == 2 && _scale_y == 2 && (cn == 1 || cn == 4);
|
||||
}
|
||||
|
||||
int operator() (const float * S, float * D, int w) const
|
||||
@ -2004,7 +2005,6 @@ struct ResizeAreaFastVec_SIMD_32f
|
||||
}
|
||||
|
||||
private:
|
||||
int scale_x, scale_y;
|
||||
int cn;
|
||||
bool fast_mode;
|
||||
int step;
|
||||
@ -2199,8 +2199,146 @@ private:
|
||||
bool use_simd;
|
||||
};
|
||||
|
||||
typedef ResizeAreaFastNoVec<short, short> ResizeAreaFastVec_SIMD_16s;
|
||||
typedef ResizeAreaFastNoVec<float, float> ResizeAreaFastVec_SIMD_32f;
|
||||
class ResizeAreaFastVec_SIMD_16s
|
||||
{
|
||||
public:
|
||||
ResizeAreaFastVec_SIMD_16s(int _cn, int _step) :
|
||||
cn(_cn), step(_step)
|
||||
{
|
||||
use_simd = checkHardwareSupport(CV_CPU_SSE2);
|
||||
}
|
||||
|
||||
int operator() (const short* S, short* D, int w) const
|
||||
{
|
||||
if (!use_simd)
|
||||
return 0;
|
||||
|
||||
int dx = 0;
|
||||
const short* S0 = (const short*)S;
|
||||
const short* S1 = (const short*)((const uchar*)(S) + step);
|
||||
__m128i masklow = _mm_set1_epi32(0x0000ffff);
|
||||
__m128i zero = _mm_setzero_si128();
|
||||
__m128i delta2 = _mm_set1_epi32(2);
|
||||
|
||||
if (cn == 1)
|
||||
{
|
||||
for ( ; dx <= w - 4; dx += 4, S0 += 8, S1 += 8, D += 4)
|
||||
{
|
||||
__m128i r0 = _mm_loadu_si128((const __m128i*)S0);
|
||||
__m128i r1 = _mm_loadu_si128((const __m128i*)S1);
|
||||
|
||||
__m128i s0 = _mm_add_epi32(_mm_srai_epi32(r0, 16),
|
||||
_mm_srai_epi32(_mm_slli_epi32(_mm_and_si128(r0, masklow), 16), 16));
|
||||
__m128i s1 = _mm_add_epi32(_mm_srai_epi32(r1, 16),
|
||||
_mm_srai_epi32(_mm_slli_epi32(_mm_and_si128(r1, masklow), 16), 16));
|
||||
s0 = _mm_add_epi32(_mm_add_epi32(s0, s1), delta2);
|
||||
s0 = _mm_srai_epi32(s0, 2);
|
||||
s0 = _mm_packs_epi32(s0, zero);
|
||||
|
||||
_mm_storel_epi64((__m128i*)D, s0);
|
||||
}
|
||||
}
|
||||
else if (cn == 3)
|
||||
for ( ; dx <= w - 4; dx += 3, S0 += 6, S1 += 6, D += 3)
|
||||
{
|
||||
__m128i r0 = _mm_loadu_si128((const __m128i*)S0);
|
||||
__m128i r1 = _mm_loadu_si128((const __m128i*)S1);
|
||||
|
||||
__m128i r0_16l = _mm_srai_epi32(_mm_unpacklo_epi16(zero, r0), 16);
|
||||
__m128i r0_16h = _mm_srai_epi32(_mm_unpacklo_epi16(zero, _mm_srli_si128(r0, 6)), 16);
|
||||
__m128i r1_16l = _mm_srai_epi32(_mm_unpacklo_epi16(zero, r1), 16);
|
||||
__m128i r1_16h = _mm_srai_epi32(_mm_unpacklo_epi16(zero, _mm_srli_si128(r1, 6)), 16);
|
||||
|
||||
__m128i s0 = _mm_add_epi32(r0_16l, r0_16h);
|
||||
__m128i s1 = _mm_add_epi32(r1_16l, r1_16h);
|
||||
s0 = _mm_add_epi32(delta2, _mm_add_epi32(s0, s1));
|
||||
s0 = _mm_packs_epi32(_mm_srai_epi32(s0, 2), zero);
|
||||
_mm_storel_epi64((__m128i*)D, s0);
|
||||
}
|
||||
else
|
||||
{
|
||||
CV_Assert(cn == 4);
|
||||
for ( ; dx <= w - 4; dx += 4, S0 += 8, S1 += 8, D += 4)
|
||||
{
|
||||
__m128i r0 = _mm_loadu_si128((const __m128i*)S0);
|
||||
__m128i r1 = _mm_loadu_si128((const __m128i*)S1);
|
||||
|
||||
__m128i r0_32l = _mm_srai_epi32(_mm_unpacklo_epi16(zero, r0), 16);
|
||||
__m128i r0_32h = _mm_srai_epi32(_mm_unpackhi_epi16(zero, r0), 16);
|
||||
__m128i r1_32l = _mm_srai_epi32(_mm_unpacklo_epi16(zero, r1), 16);
|
||||
__m128i r1_32h = _mm_srai_epi32(_mm_unpackhi_epi16(zero, r1), 16);
|
||||
|
||||
__m128i s0 = _mm_add_epi32(r0_32l, r0_32h);
|
||||
__m128i s1 = _mm_add_epi32(r1_32l, r1_32h);
|
||||
s0 = _mm_add_epi32(s1, _mm_add_epi32(s0, delta2));
|
||||
s0 = _mm_packs_epi32(_mm_srai_epi32(s0, 2), zero);
|
||||
_mm_storel_epi64((__m128i*)D, s0);
|
||||
}
|
||||
}
|
||||
|
||||
return dx;
|
||||
}
|
||||
|
||||
private:
|
||||
int cn;
|
||||
int step;
|
||||
bool use_simd;
|
||||
};
|
||||
|
||||
struct ResizeAreaFastVec_SIMD_32f
|
||||
{
|
||||
ResizeAreaFastVec_SIMD_32f(int _scale_x, int _scale_y, int _cn, int _step) :
|
||||
cn(_cn), step(_step)
|
||||
{
|
||||
fast_mode = _scale_x == 2 && _scale_y == 2 && (cn == 1 || cn == 4);
|
||||
fast_mode = fast_mode && checkHardwareSupport(CV_CPU_SSE2);
|
||||
}
|
||||
|
||||
int operator() (const float * S, float * D, int w) const
|
||||
{
|
||||
if (!fast_mode)
|
||||
return 0;
|
||||
|
||||
const float * S0 = S, * S1 = (const float *)((const uchar *)(S0) + step);
|
||||
int dx = 0;
|
||||
|
||||
__m128 v_025 = _mm_set1_ps(0.25f);
|
||||
|
||||
if (cn == 1)
|
||||
{
|
||||
const int shuffle_lo = _MM_SHUFFLE(2, 0, 2, 0), shuffle_hi = _MM_SHUFFLE(3, 1, 3, 1);
|
||||
for ( ; dx <= w - 4; dx += 4, S0 += 8, S1 += 8, D += 4)
|
||||
{
|
||||
__m128 v_row00 = _mm_loadu_ps(S0), v_row01 = _mm_loadu_ps(S0 + 4),
|
||||
v_row10 = _mm_loadu_ps(S1), v_row11 = _mm_loadu_ps(S1 + 4);
|
||||
|
||||
__m128 v_dst0 = _mm_add_ps(_mm_shuffle_ps(v_row00, v_row01, shuffle_lo),
|
||||
_mm_shuffle_ps(v_row00, v_row01, shuffle_hi));
|
||||
__m128 v_dst1 = _mm_add_ps(_mm_shuffle_ps(v_row10, v_row11, shuffle_lo),
|
||||
_mm_shuffle_ps(v_row10, v_row11, shuffle_hi));
|
||||
|
||||
_mm_storeu_ps(D, _mm_mul_ps(_mm_add_ps(v_dst0, v_dst1), v_025));
|
||||
}
|
||||
}
|
||||
else if (cn == 4)
|
||||
{
|
||||
for ( ; dx <= w - 4; dx += 4, S0 += 8, S1 += 8, D += 4)
|
||||
{
|
||||
__m128 v_dst0 = _mm_add_ps(_mm_loadu_ps(S0), _mm_loadu_ps(S0 + 4));
|
||||
__m128 v_dst1 = _mm_add_ps(_mm_loadu_ps(S1), _mm_loadu_ps(S1 + 4));
|
||||
|
||||
_mm_storeu_ps(D, _mm_mul_ps(_mm_add_ps(v_dst0, v_dst1), v_025));
|
||||
}
|
||||
}
|
||||
|
||||
return dx;
|
||||
}
|
||||
|
||||
private:
|
||||
int cn;
|
||||
bool fast_mode;
|
||||
int step;
|
||||
};
|
||||
|
||||
#else
|
||||
|
||||
@ -4678,6 +4816,13 @@ void cv::convertMaps( InputArray _map1, InputArray _map2,
|
||||
size.height = 1;
|
||||
}
|
||||
|
||||
#if CV_SSE2
|
||||
bool useSSE2 = checkHardwareSupport(CV_CPU_SSE2);
|
||||
#endif
|
||||
#if CV_SSE4_1
|
||||
bool useSSE4_1 = checkHardwareSupport(CV_CPU_SSE4_1);
|
||||
#endif
|
||||
|
||||
const float scale = 1.f/INTER_TAB_SIZE;
|
||||
int x, y;
|
||||
for( y = 0; y < size.height; y++ )
|
||||
@ -4708,6 +4853,29 @@ void cv::convertMaps( InputArray _map1, InputArray _map2,
|
||||
|
||||
vst2q_s16(dst1 + (x << 1), v_dst);
|
||||
}
|
||||
#elif CV_SSE4_1
|
||||
if (useSSE4_1)
|
||||
{
|
||||
for( ; x <= size.width - 16; x += 16 )
|
||||
{
|
||||
__m128i v_dst0 = _mm_packs_epi32(_mm_cvtps_epi32(_mm_loadu_ps(src1f + x)),
|
||||
_mm_cvtps_epi32(_mm_loadu_ps(src1f + x + 4)));
|
||||
__m128i v_dst1 = _mm_packs_epi32(_mm_cvtps_epi32(_mm_loadu_ps(src1f + x + 8)),
|
||||
_mm_cvtps_epi32(_mm_loadu_ps(src1f + x + 12)));
|
||||
|
||||
__m128i v_dst2 = _mm_packs_epi32(_mm_cvtps_epi32(_mm_loadu_ps(src2f + x)),
|
||||
_mm_cvtps_epi32(_mm_loadu_ps(src2f + x + 4)));
|
||||
__m128i v_dst3 = _mm_packs_epi32(_mm_cvtps_epi32(_mm_loadu_ps(src2f + x + 8)),
|
||||
_mm_cvtps_epi32(_mm_loadu_ps(src2f + x + 12)));
|
||||
|
||||
_mm_interleave_epi16(v_dst0, v_dst1, v_dst2, v_dst3);
|
||||
|
||||
_mm_storeu_si128((__m128i *)(dst1 + x * 2), v_dst0);
|
||||
_mm_storeu_si128((__m128i *)(dst1 + x * 2 + 8), v_dst1);
|
||||
_mm_storeu_si128((__m128i *)(dst1 + x * 2 + 16), v_dst2);
|
||||
_mm_storeu_si128((__m128i *)(dst1 + x * 2 + 24), v_dst3);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
for( ; x < size.width; x++ )
|
||||
{
|
||||
@ -4742,6 +4910,52 @@ void cv::convertMaps( InputArray _map1, InputArray _map2,
|
||||
vandq_s32(v_ix1, v_mask)));
|
||||
vst1q_u16(dst2 + x, vcombine_u16(v_dst0, v_dst1));
|
||||
}
|
||||
#elif CV_SSE4_1
|
||||
if (useSSE4_1)
|
||||
{
|
||||
__m128 v_its = _mm_set1_ps(INTER_TAB_SIZE);
|
||||
__m128i v_its1 = _mm_set1_epi32(INTER_TAB_SIZE-1);
|
||||
|
||||
for( ; x <= size.width - 16; x += 16 )
|
||||
{
|
||||
__m128i v_ix0 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(src1f + x), v_its));
|
||||
__m128i v_ix1 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(src1f + x + 4), v_its));
|
||||
__m128i v_iy0 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(src2f + x), v_its));
|
||||
__m128i v_iy1 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(src2f + x + 4), v_its));
|
||||
|
||||
__m128i v_dst10 = _mm_packs_epi32(_mm_srai_epi32(v_ix0, INTER_BITS),
|
||||
_mm_srai_epi32(v_ix1, INTER_BITS));
|
||||
__m128i v_dst12 = _mm_packs_epi32(_mm_srai_epi32(v_iy0, INTER_BITS),
|
||||
_mm_srai_epi32(v_iy1, INTER_BITS));
|
||||
__m128i v_dst20 = _mm_add_epi32(_mm_slli_epi32(_mm_and_si128(v_iy0, v_its1), INTER_BITS),
|
||||
_mm_and_si128(v_ix0, v_its1));
|
||||
__m128i v_dst21 = _mm_add_epi32(_mm_slli_epi32(_mm_and_si128(v_iy1, v_its1), INTER_BITS),
|
||||
_mm_and_si128(v_ix1, v_its1));
|
||||
_mm_storeu_si128((__m128i *)(dst2 + x), _mm_packus_epi32(v_dst20, v_dst21));
|
||||
|
||||
v_ix0 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(src1f + x + 8), v_its));
|
||||
v_ix1 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(src1f + x + 12), v_its));
|
||||
v_iy0 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(src2f + x + 8), v_its));
|
||||
v_iy1 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(src2f + x + 12), v_its));
|
||||
|
||||
__m128i v_dst11 = _mm_packs_epi32(_mm_srai_epi32(v_ix0, INTER_BITS),
|
||||
_mm_srai_epi32(v_ix1, INTER_BITS));
|
||||
__m128i v_dst13 = _mm_packs_epi32(_mm_srai_epi32(v_iy0, INTER_BITS),
|
||||
_mm_srai_epi32(v_iy1, INTER_BITS));
|
||||
v_dst20 = _mm_add_epi32(_mm_slli_epi32(_mm_and_si128(v_iy0, v_its1), INTER_BITS),
|
||||
_mm_and_si128(v_ix0, v_its1));
|
||||
v_dst21 = _mm_add_epi32(_mm_slli_epi32(_mm_and_si128(v_iy1, v_its1), INTER_BITS),
|
||||
_mm_and_si128(v_ix1, v_its1));
|
||||
_mm_storeu_si128((__m128i *)(dst2 + x + 8), _mm_packus_epi32(v_dst20, v_dst21));
|
||||
|
||||
_mm_interleave_epi16(v_dst10, v_dst11, v_dst12, v_dst13);
|
||||
|
||||
_mm_storeu_si128((__m128i *)(dst1 + x * 2), v_dst10);
|
||||
_mm_storeu_si128((__m128i *)(dst1 + x * 2 + 8), v_dst11);
|
||||
_mm_storeu_si128((__m128i *)(dst1 + x * 2 + 16), v_dst12);
|
||||
_mm_storeu_si128((__m128i *)(dst1 + x * 2 + 24), v_dst13);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
for( ; x < size.width; x++ )
|
||||
{
|
||||
@ -4761,6 +4975,12 @@ void cv::convertMaps( InputArray _map1, InputArray _map2,
|
||||
for( ; x <= (size.width << 1) - 8; x += 8 )
|
||||
vst1q_s16(dst1 + x, vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(vld1q_f32(src1f + x))),
|
||||
vqmovn_s32(cv_vrndq_s32_f32(vld1q_f32(src1f + x + 4)))));
|
||||
#elif CV_SSE2
|
||||
for( ; x <= (size.width << 1) - 8; x += 8 )
|
||||
{
|
||||
_mm_storeu_si128((__m128i *)(dst1 + x), _mm_packs_epi32(_mm_cvtps_epi32(_mm_loadu_ps(src1f + x)),
|
||||
_mm_cvtps_epi32(_mm_loadu_ps(src1f + x + 4))));
|
||||
}
|
||||
#endif
|
||||
for( ; x < size.width; x++ )
|
||||
{
|
||||
@ -4796,6 +5016,30 @@ void cv::convertMaps( InputArray _map1, InputArray _map2,
|
||||
vandq_s32(v_ix1, v_mask)));
|
||||
vst1q_u16(dst2 + x, vcombine_u16(v_dst0, v_dst1));
|
||||
}
|
||||
#elif CV_SSE4_1
|
||||
if (useSSE4_1)
|
||||
{
|
||||
__m128 v_its = _mm_set1_ps(INTER_TAB_SIZE);
|
||||
__m128i v_its1 = _mm_set1_epi32(INTER_TAB_SIZE-1);
|
||||
__m128i v_y_mask = _mm_set1_epi32((INTER_TAB_SIZE-1) << 16);
|
||||
|
||||
for( ; x <= size.width - 4; x += 4 )
|
||||
{
|
||||
__m128i v_src0 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(src1f + x * 2), v_its));
|
||||
__m128i v_src1 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(src1f + x * 2 + 4), v_its));
|
||||
|
||||
__m128i v_dst1 = _mm_packs_epi32(_mm_srai_epi32(v_src0, INTER_BITS),
|
||||
_mm_srai_epi32(v_src1, INTER_BITS));
|
||||
_mm_storeu_si128((__m128i *)(dst1 + x * 2), v_dst1);
|
||||
|
||||
// x0 y0 x1 y1 . . .
|
||||
v_src0 = _mm_packs_epi32(_mm_and_si128(v_src0, v_its1),
|
||||
_mm_and_si128(v_src1, v_its1));
|
||||
__m128i v_dst2 = _mm_or_si128(_mm_srli_epi32(_mm_and_si128(v_src0, v_y_mask), 16 - INTER_BITS), // y0 0 y1 0 . . .
|
||||
_mm_and_si128(v_src0, v_its1)); // 0 x0 0 x1 . . .
|
||||
_mm_storel_epi64((__m128i *)(dst2 + x), _mm_packus_epi32(v_dst2, v_dst2));
|
||||
}
|
||||
}
|
||||
#endif
|
||||
for( ; x < size.width; x++ )
|
||||
{
|
||||
@ -4841,6 +5085,44 @@ void cv::convertMaps( InputArray _map1, InputArray _map2,
|
||||
vst1q_f32(dst1f + x + 4, v_dst1);
|
||||
vst1q_f32(dst2f + x + 4, v_dst2);
|
||||
}
|
||||
#elif CV_SSE2
|
||||
__m128i v_mask2 = _mm_set1_epi16(INTER_TAB_SIZE2-1);
|
||||
__m128i v_zero = _mm_setzero_si128(), v_mask = _mm_set1_epi32(INTER_TAB_SIZE-1);
|
||||
__m128 v_scale = _mm_set1_ps(scale);
|
||||
|
||||
for( ; x <= size.width - 16; x += 16)
|
||||
{
|
||||
__m128i v_src10 = _mm_loadu_si128((__m128i const *)(src1 + x * 2));
|
||||
__m128i v_src11 = _mm_loadu_si128((__m128i const *)(src1 + x * 2 + 8));
|
||||
__m128i v_src20 = _mm_loadu_si128((__m128i const *)(src1 + x * 2 + 16));
|
||||
__m128i v_src21 = _mm_loadu_si128((__m128i const *)(src1 + x * 2 + 24));
|
||||
|
||||
_mm_deinterleave_epi16(v_src10, v_src11, v_src20, v_src21);
|
||||
|
||||
__m128i v_fxy = src2 ? _mm_and_si128(_mm_loadu_si128((__m128i const *)(src2 + x)), v_mask2) : v_zero;
|
||||
__m128i v_fxy_p = _mm_unpacklo_epi16(v_fxy, v_zero);
|
||||
_mm_storeu_ps(dst1f + x, _mm_add_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src10), 16)),
|
||||
_mm_mul_ps(v_scale, _mm_cvtepi32_ps(_mm_and_si128(v_fxy_p, v_mask)))));
|
||||
_mm_storeu_ps(dst2f + x, _mm_add_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src20), 16)),
|
||||
_mm_mul_ps(v_scale, _mm_cvtepi32_ps(_mm_srli_epi32(v_fxy_p, INTER_BITS)))));
|
||||
v_fxy_p = _mm_unpackhi_epi16(v_fxy, v_zero);
|
||||
_mm_storeu_ps(dst1f + x + 4, _mm_add_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src10), 16)),
|
||||
_mm_mul_ps(v_scale, _mm_cvtepi32_ps(_mm_and_si128(v_fxy_p, v_mask)))));
|
||||
_mm_storeu_ps(dst2f + x + 4, _mm_add_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src20), 16)),
|
||||
_mm_mul_ps(v_scale, _mm_cvtepi32_ps(_mm_srli_epi32(v_fxy_p, INTER_BITS)))));
|
||||
|
||||
v_fxy = src2 ? _mm_and_si128(_mm_loadu_si128((__m128i const *)(src2 + x + 8)), v_mask2) : v_zero;
|
||||
v_fxy_p = _mm_unpackhi_epi16(v_fxy, v_zero);
|
||||
_mm_storeu_ps(dst1f + x + 8, _mm_add_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src11), 16)),
|
||||
_mm_mul_ps(v_scale, _mm_cvtepi32_ps(_mm_and_si128(v_fxy_p, v_mask)))));
|
||||
_mm_storeu_ps(dst2f + x + 8, _mm_add_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src21), 16)),
|
||||
_mm_mul_ps(v_scale, _mm_cvtepi32_ps(_mm_srli_epi32(v_fxy_p, INTER_BITS)))));
|
||||
v_fxy_p = _mm_unpackhi_epi16(v_fxy, v_zero);
|
||||
_mm_storeu_ps(dst1f + x + 12, _mm_add_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src11), 16)),
|
||||
_mm_mul_ps(v_scale, _mm_cvtepi32_ps(_mm_and_si128(v_fxy_p, v_mask)))));
|
||||
_mm_storeu_ps(dst2f + x + 12, _mm_add_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src21), 16)),
|
||||
_mm_mul_ps(v_scale, _mm_cvtepi32_ps(_mm_srli_epi32(v_fxy_p, INTER_BITS)))));
|
||||
}
|
||||
#endif
|
||||
for( ; x < size.width; x++ )
|
||||
{
|
||||
@ -4882,6 +5164,27 @@ void cv::convertMaps( InputArray _map1, InputArray _map2,
|
||||
v_scale, vcvtq_f32_s32(vshrq_n_s32(v_fxy2, INTER_BITS)));
|
||||
vst2q_f32(dst1f + (x << 1) + 8, v_dst);
|
||||
}
|
||||
#elif CV_SSE2
|
||||
if (useSSE2)
|
||||
{
|
||||
__m128i v_mask2 = _mm_set1_epi16(INTER_TAB_SIZE2-1);
|
||||
__m128i v_zero = _mm_set1_epi32(0), v_mask = _mm_set1_epi32(INTER_TAB_SIZE-1);
|
||||
__m128 v_scale = _mm_set1_ps(scale);
|
||||
|
||||
for ( ; x <= size.width - 8; x += 8)
|
||||
{
|
||||
__m128i v_src = _mm_loadu_si128((__m128i const *)(src1 + x * 2));
|
||||
__m128i v_fxy = src2 ? _mm_and_si128(_mm_loadu_si128((__m128i const *)(src2 + x)), v_mask2) : v_zero;
|
||||
__m128i v_fxy1 = _mm_and_si128(v_fxy, v_mask);
|
||||
__m128i v_fxy2 = _mm_srli_epi16(v_fxy, INTER_BITS);
|
||||
|
||||
__m128 v_add = _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v_fxy1, v_fxy2)), v_scale);
|
||||
_mm_storeu_ps(dst1f + x * 2, _mm_add_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src, v_zero)), v_add));
|
||||
|
||||
v_add = _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v_fxy1, v_fxy2)), v_scale);
|
||||
_mm_storeu_ps(dst1f + x * 2, _mm_add_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src, v_zero)), v_add));
|
||||
}
|
||||
}
|
||||
#endif
|
||||
for( ; x < size.width; x++ )
|
||||
{
|
||||
@ -4919,7 +5222,10 @@ public:
|
||||
const int AB_SCALE = 1 << AB_BITS;
|
||||
int round_delta = interpolation == INTER_NEAREST ? AB_SCALE/2 : AB_SCALE/INTER_TAB_SIZE/2, x, y, x1, y1;
|
||||
#if CV_SSE2
|
||||
bool useSIMD = checkHardwareSupport(CV_CPU_SSE2);
|
||||
bool useSSE2 = checkHardwareSupport(CV_CPU_SSE2);
|
||||
#endif
|
||||
#if CV_SSE4_1
|
||||
bool useSSE4_1 = checkHardwareSupport(CV_CPU_SSE4_1);
|
||||
#endif
|
||||
|
||||
int bh0 = std::min(BLOCK_SZ/2, dst.rows);
|
||||
@ -4957,6 +5263,31 @@ public:
|
||||
|
||||
vst2q_s16(xy + (x1 << 1), v_dst);
|
||||
}
|
||||
#elif CV_SSE4_1
|
||||
if (useSSE4_1)
|
||||
{
|
||||
__m128i v_X0 = _mm_set1_epi32(X0);
|
||||
__m128i v_Y0 = _mm_set1_epi32(Y0);
|
||||
for ( ; x1 <= bw - 16; x1 += 16)
|
||||
{
|
||||
__m128i v_x0 = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(v_X0, _mm_loadu_si128((__m128i const *)(adelta + x + x1))), AB_BITS),
|
||||
_mm_srai_epi32(_mm_add_epi32(v_X0, _mm_loadu_si128((__m128i const *)(adelta + x + x1 + 4))), AB_BITS));
|
||||
__m128i v_x1 = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(v_X0, _mm_loadu_si128((__m128i const *)(adelta + x + x1 + 8))), AB_BITS),
|
||||
_mm_srai_epi32(_mm_add_epi32(v_X0, _mm_loadu_si128((__m128i const *)(adelta + x + x1 + 12))), AB_BITS));
|
||||
|
||||
__m128i v_y0 = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(v_Y0, _mm_loadu_si128((__m128i const *)(bdelta + x + x1))), AB_BITS),
|
||||
_mm_srai_epi32(_mm_add_epi32(v_Y0, _mm_loadu_si128((__m128i const *)(bdelta + x + x1 + 4))), AB_BITS));
|
||||
__m128i v_y1 = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(v_Y0, _mm_loadu_si128((__m128i const *)(bdelta + x + x1 + 8))), AB_BITS),
|
||||
_mm_srai_epi32(_mm_add_epi32(v_Y0, _mm_loadu_si128((__m128i const *)(bdelta + x + x1 + 12))), AB_BITS));
|
||||
|
||||
_mm_interleave_epi16(v_x0, v_x1, v_y0, v_y1);
|
||||
|
||||
_mm_storeu_si128((__m128i *)(xy + x1 * 2), v_x0);
|
||||
_mm_storeu_si128((__m128i *)(xy + x1 * 2 + 8), v_x1);
|
||||
_mm_storeu_si128((__m128i *)(xy + x1 * 2 + 16), v_y0);
|
||||
_mm_storeu_si128((__m128i *)(xy + x1 * 2 + 24), v_y1);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
for( ; x1 < bw; x1++ )
|
||||
{
|
||||
@ -4971,7 +5302,7 @@ public:
|
||||
short* alpha = A + y1*bw;
|
||||
x1 = 0;
|
||||
#if CV_SSE2
|
||||
if( useSIMD )
|
||||
if( useSSE2 )
|
||||
{
|
||||
__m128i fxy_mask = _mm_set1_epi32(INTER_TAB_SIZE - 1);
|
||||
__m128i XX = _mm_set1_epi32(X0), YY = _mm_set1_epi32(Y0);
|
||||
@ -5364,6 +5695,20 @@ public:
|
||||
int bw0 = std::min(BLOCK_SZ*BLOCK_SZ/bh0, width);
|
||||
bh0 = std::min(BLOCK_SZ*BLOCK_SZ/bw0, height);
|
||||
|
||||
#if CV_SSE4_1
|
||||
bool haveSSE4_1 = checkHardwareSupport(CV_CPU_SSE4_1);
|
||||
__m128d v_M0 = _mm_set1_pd(M[0]);
|
||||
__m128d v_M3 = _mm_set1_pd(M[3]);
|
||||
__m128d v_M6 = _mm_set1_pd(M[6]);
|
||||
__m128d v_intmax = _mm_set1_pd((double)INT_MAX);
|
||||
__m128d v_intmin = _mm_set1_pd((double)INT_MIN);
|
||||
__m128d v_2 = _mm_set1_pd(2),
|
||||
v_zero = _mm_setzero_pd(),
|
||||
v_1 = _mm_set1_pd(1),
|
||||
v_its = _mm_set1_pd(INTER_TAB_SIZE);
|
||||
__m128i v_itsi1 = _mm_set1_epi32(INTER_TAB_SIZE - 1);
|
||||
#endif
|
||||
|
||||
for( y = range.start; y < range.end; y += bh0 )
|
||||
{
|
||||
for( x = 0; x < width; x += bw0 )
|
||||
@ -5382,7 +5727,120 @@ public:
|
||||
double W0 = M[6]*x + M[7]*(y + y1) + M[8];
|
||||
|
||||
if( interpolation == INTER_NEAREST )
|
||||
for( x1 = 0; x1 < bw; x1++ )
|
||||
{
|
||||
x1 = 0;
|
||||
|
||||
#if CV_SSE4_1
|
||||
if (haveSSE4_1)
|
||||
{
|
||||
__m128d v_X0d = _mm_set1_pd(X0);
|
||||
__m128d v_Y0d = _mm_set1_pd(Y0);
|
||||
__m128d v_W0 = _mm_set1_pd(W0);
|
||||
__m128d v_x1 = _mm_set_pd(1, 0);
|
||||
|
||||
for( ; x1 <= bw - 16; x1 += 16 )
|
||||
{
|
||||
// 0-3
|
||||
__m128i v_X0, v_Y0;
|
||||
{
|
||||
__m128d v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0);
|
||||
v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_1, v_W));
|
||||
__m128d v_fX0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W)));
|
||||
__m128d v_fY0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W)));
|
||||
v_x1 = _mm_add_pd(v_x1, v_2);
|
||||
|
||||
v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0);
|
||||
v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_1, v_W));
|
||||
__m128d v_fX1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W)));
|
||||
__m128d v_fY1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W)));
|
||||
v_x1 = _mm_add_pd(v_x1, v_2);
|
||||
|
||||
v_X0 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fX0)),
|
||||
_mm_castsi128_ps(_mm_cvtpd_epi32(v_fX1))));
|
||||
v_Y0 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fY0)),
|
||||
_mm_castsi128_ps(_mm_cvtpd_epi32(v_fY1))));
|
||||
}
|
||||
|
||||
// 4-8
|
||||
__m128i v_X1, v_Y1;
|
||||
{
|
||||
__m128d v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0);
|
||||
v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_1, v_W));
|
||||
__m128d v_fX0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W)));
|
||||
__m128d v_fY0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W)));
|
||||
v_x1 = _mm_add_pd(v_x1, v_2);
|
||||
|
||||
v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0);
|
||||
v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_1, v_W));
|
||||
__m128d v_fX1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W)));
|
||||
__m128d v_fY1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W)));
|
||||
v_x1 = _mm_add_pd(v_x1, v_2);
|
||||
|
||||
v_X1 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fX0)),
|
||||
_mm_castsi128_ps(_mm_cvtpd_epi32(v_fX1))));
|
||||
v_Y1 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fY0)),
|
||||
_mm_castsi128_ps(_mm_cvtpd_epi32(v_fY1))));
|
||||
}
|
||||
|
||||
// 8-11
|
||||
__m128i v_X2, v_Y2;
|
||||
{
|
||||
__m128d v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0);
|
||||
v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_1, v_W));
|
||||
__m128d v_fX0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W)));
|
||||
__m128d v_fY0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W)));
|
||||
v_x1 = _mm_add_pd(v_x1, v_2);
|
||||
|
||||
v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0);
|
||||
v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_1, v_W));
|
||||
__m128d v_fX1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W)));
|
||||
__m128d v_fY1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W)));
|
||||
v_x1 = _mm_add_pd(v_x1, v_2);
|
||||
|
||||
v_X2 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fX0)),
|
||||
_mm_castsi128_ps(_mm_cvtpd_epi32(v_fX1))));
|
||||
v_Y2 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fY0)),
|
||||
_mm_castsi128_ps(_mm_cvtpd_epi32(v_fY1))));
|
||||
}
|
||||
|
||||
// 12-15
|
||||
__m128i v_X3, v_Y3;
|
||||
{
|
||||
__m128d v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0);
|
||||
v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_1, v_W));
|
||||
__m128d v_fX0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W)));
|
||||
__m128d v_fY0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W)));
|
||||
v_x1 = _mm_add_pd(v_x1, v_2);
|
||||
|
||||
v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0);
|
||||
v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_1, v_W));
|
||||
__m128d v_fX1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W)));
|
||||
__m128d v_fY1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W)));
|
||||
v_x1 = _mm_add_pd(v_x1, v_2);
|
||||
|
||||
v_X3 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fX0)),
|
||||
_mm_castsi128_ps(_mm_cvtpd_epi32(v_fX1))));
|
||||
v_Y3 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fY0)),
|
||||
_mm_castsi128_ps(_mm_cvtpd_epi32(v_fY1))));
|
||||
}
|
||||
|
||||
// convert to 16s
|
||||
v_X0 = _mm_packs_epi32(v_X0, v_X1);
|
||||
v_X1 = _mm_packs_epi32(v_X2, v_X3);
|
||||
v_Y0 = _mm_packs_epi32(v_Y0, v_Y1);
|
||||
v_Y1 = _mm_packs_epi32(v_Y2, v_Y3);
|
||||
|
||||
_mm_interleave_epi16(v_X0, v_X1, v_Y0, v_Y1);
|
||||
|
||||
_mm_storeu_si128((__m128i *)(xy + x1 * 2), v_X0);
|
||||
_mm_storeu_si128((__m128i *)(xy + x1 * 2 + 8), v_X1);
|
||||
_mm_storeu_si128((__m128i *)(xy + x1 * 2 + 16), v_Y0);
|
||||
_mm_storeu_si128((__m128i *)(xy + x1 * 2 + 24), v_Y1);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
for( ; x1 < bw; x1++ )
|
||||
{
|
||||
double W = W0 + M[6]*x1;
|
||||
W = W ? 1./W : 0;
|
||||
@ -5394,10 +5852,136 @@ public:
|
||||
xy[x1*2] = saturate_cast<short>(X);
|
||||
xy[x1*2+1] = saturate_cast<short>(Y);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
short* alpha = A + y1*bw;
|
||||
for( x1 = 0; x1 < bw; x1++ )
|
||||
x1 = 0;
|
||||
|
||||
#if CV_SSE4_1
|
||||
if (haveSSE4_1)
|
||||
{
|
||||
__m128d v_X0d = _mm_set1_pd(X0);
|
||||
__m128d v_Y0d = _mm_set1_pd(Y0);
|
||||
__m128d v_W0 = _mm_set1_pd(W0);
|
||||
__m128d v_x1 = _mm_set_pd(1, 0);
|
||||
|
||||
for( ; x1 <= bw - 16; x1 += 16 )
|
||||
{
|
||||
// 0-3
|
||||
__m128i v_X0, v_Y0;
|
||||
{
|
||||
__m128d v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0);
|
||||
v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_its, v_W));
|
||||
__m128d v_fX0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W)));
|
||||
__m128d v_fY0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W)));
|
||||
v_x1 = _mm_add_pd(v_x1, v_2);
|
||||
|
||||
v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0);
|
||||
v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_its, v_W));
|
||||
__m128d v_fX1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W)));
|
||||
__m128d v_fY1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W)));
|
||||
v_x1 = _mm_add_pd(v_x1, v_2);
|
||||
|
||||
v_X0 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fX0)),
|
||||
_mm_castsi128_ps(_mm_cvtpd_epi32(v_fX1))));
|
||||
v_Y0 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fY0)),
|
||||
_mm_castsi128_ps(_mm_cvtpd_epi32(v_fY1))));
|
||||
}
|
||||
|
||||
// 4-8
|
||||
__m128i v_X1, v_Y1;
|
||||
{
|
||||
__m128d v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0);
|
||||
v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_its, v_W));
|
||||
__m128d v_fX0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W)));
|
||||
__m128d v_fY0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W)));
|
||||
v_x1 = _mm_add_pd(v_x1, v_2);
|
||||
|
||||
v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0);
|
||||
v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_its, v_W));
|
||||
__m128d v_fX1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W)));
|
||||
__m128d v_fY1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W)));
|
||||
v_x1 = _mm_add_pd(v_x1, v_2);
|
||||
|
||||
v_X1 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fX0)),
|
||||
_mm_castsi128_ps(_mm_cvtpd_epi32(v_fX1))));
|
||||
v_Y1 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fY0)),
|
||||
_mm_castsi128_ps(_mm_cvtpd_epi32(v_fY1))));
|
||||
}
|
||||
|
||||
// 8-11
|
||||
__m128i v_X2, v_Y2;
|
||||
{
|
||||
__m128d v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0);
|
||||
v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_its, v_W));
|
||||
__m128d v_fX0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W)));
|
||||
__m128d v_fY0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W)));
|
||||
v_x1 = _mm_add_pd(v_x1, v_2);
|
||||
|
||||
v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0);
|
||||
v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_its, v_W));
|
||||
__m128d v_fX1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W)));
|
||||
__m128d v_fY1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W)));
|
||||
v_x1 = _mm_add_pd(v_x1, v_2);
|
||||
|
||||
v_X2 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fX0)),
|
||||
_mm_castsi128_ps(_mm_cvtpd_epi32(v_fX1))));
|
||||
v_Y2 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fY0)),
|
||||
_mm_castsi128_ps(_mm_cvtpd_epi32(v_fY1))));
|
||||
}
|
||||
|
||||
// 12-15
|
||||
__m128i v_X3, v_Y3;
|
||||
{
|
||||
__m128d v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0);
|
||||
v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_its, v_W));
|
||||
__m128d v_fX0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W)));
|
||||
__m128d v_fY0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W)));
|
||||
v_x1 = _mm_add_pd(v_x1, v_2);
|
||||
|
||||
v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0);
|
||||
v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_its, v_W));
|
||||
__m128d v_fX1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W)));
|
||||
__m128d v_fY1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W)));
|
||||
v_x1 = _mm_add_pd(v_x1, v_2);
|
||||
|
||||
v_X3 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fX0)),
|
||||
_mm_castsi128_ps(_mm_cvtpd_epi32(v_fX1))));
|
||||
v_Y3 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fY0)),
|
||||
_mm_castsi128_ps(_mm_cvtpd_epi32(v_fY1))));
|
||||
}
|
||||
|
||||
// store alpha
|
||||
__m128i v_alpha0 = _mm_add_epi32(_mm_slli_epi32(_mm_and_si128(v_Y0, v_itsi1), INTER_BITS),
|
||||
_mm_and_si128(v_X0, v_itsi1));
|
||||
__m128i v_alpha1 = _mm_add_epi32(_mm_slli_epi32(_mm_and_si128(v_Y1, v_itsi1), INTER_BITS),
|
||||
_mm_and_si128(v_X1, v_itsi1));
|
||||
_mm_storeu_si128((__m128i *)(alpha + x1), _mm_packs_epi32(v_alpha0, v_alpha1));
|
||||
|
||||
v_alpha0 = _mm_add_epi32(_mm_slli_epi32(_mm_and_si128(v_Y2, v_itsi1), INTER_BITS),
|
||||
_mm_and_si128(v_X2, v_itsi1));
|
||||
v_alpha1 = _mm_add_epi32(_mm_slli_epi32(_mm_and_si128(v_Y3, v_itsi1), INTER_BITS),
|
||||
_mm_and_si128(v_X3, v_itsi1));
|
||||
_mm_storeu_si128((__m128i *)(alpha + x1 + 8), _mm_packs_epi32(v_alpha0, v_alpha1));
|
||||
|
||||
// convert to 16s
|
||||
v_X0 = _mm_packs_epi32(_mm_srai_epi32(v_X0, INTER_BITS), _mm_srai_epi32(v_X1, INTER_BITS));
|
||||
v_X1 = _mm_packs_epi32(_mm_srai_epi32(v_X2, INTER_BITS), _mm_srai_epi32(v_X3, INTER_BITS));
|
||||
v_Y0 = _mm_packs_epi32(_mm_srai_epi32(v_Y0, INTER_BITS), _mm_srai_epi32(v_Y1, INTER_BITS));
|
||||
v_Y1 = _mm_packs_epi32(_mm_srai_epi32(v_Y2, INTER_BITS), _mm_srai_epi32(v_Y3, INTER_BITS));
|
||||
|
||||
_mm_interleave_epi16(v_X0, v_X1, v_Y0, v_Y1);
|
||||
|
||||
_mm_storeu_si128((__m128i *)(xy + x1 * 2), v_X0);
|
||||
_mm_storeu_si128((__m128i *)(xy + x1 * 2 + 8), v_X1);
|
||||
_mm_storeu_si128((__m128i *)(xy + x1 * 2 + 16), v_Y0);
|
||||
_mm_storeu_si128((__m128i *)(xy + x1 * 2 + 24), v_Y1);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
for( ; x1 < bw; x1++ )
|
||||
{
|
||||
double W = W0 + M[6]*x1;
|
||||
W = W ? INTER_TAB_SIZE/W : 0;
|
||||
|
@ -12,6 +12,7 @@
|
||||
//
|
||||
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
|
||||
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
|
||||
// Copyright (C) 2014-2015, Itseez Inc., all rights reserved.
|
||||
// Third party copyrights are property of their respective owners.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without modification,
|
||||
@ -183,13 +184,336 @@ struct PyrDownVec_32f
|
||||
}
|
||||
};
|
||||
|
||||
typedef PyrDownNoVec<int, ushort> PyrDownVec_32s16u;
|
||||
typedef PyrDownNoVec<int, short> PyrDownVec_32s16s;
|
||||
#if CV_SSE4_1
|
||||
|
||||
struct PyrDownVec_32s16u
|
||||
{
|
||||
PyrDownVec_32s16u()
|
||||
{
|
||||
haveSSE = checkHardwareSupport(CV_CPU_SSE4_1);
|
||||
}
|
||||
|
||||
int operator()(int** src, ushort* dst, int, int width) const
|
||||
{
|
||||
int x = 0;
|
||||
|
||||
if (!haveSSE)
|
||||
return x;
|
||||
|
||||
const int *row0 = src[0], *row1 = src[1], *row2 = src[2], *row3 = src[3], *row4 = src[4];
|
||||
__m128i v_delta = _mm_set1_epi32(128);
|
||||
|
||||
for( ; x <= width - 8; x += 8 )
|
||||
{
|
||||
__m128i v_r00 = _mm_loadu_si128((__m128i const *)(row0 + x)),
|
||||
v_r01 = _mm_loadu_si128((__m128i const *)(row0 + x + 4));
|
||||
__m128i v_r10 = _mm_loadu_si128((__m128i const *)(row1 + x)),
|
||||
v_r11 = _mm_loadu_si128((__m128i const *)(row1 + x + 4));
|
||||
__m128i v_r20 = _mm_loadu_si128((__m128i const *)(row2 + x)),
|
||||
v_r21 = _mm_loadu_si128((__m128i const *)(row2 + x + 4));
|
||||
__m128i v_r30 = _mm_loadu_si128((__m128i const *)(row3 + x)),
|
||||
v_r31 = _mm_loadu_si128((__m128i const *)(row3 + x + 4));
|
||||
__m128i v_r40 = _mm_loadu_si128((__m128i const *)(row4 + x)),
|
||||
v_r41 = _mm_loadu_si128((__m128i const *)(row4 + x + 4));
|
||||
|
||||
v_r00 = _mm_add_epi32(_mm_add_epi32(v_r00, v_r40), _mm_add_epi32(v_r20, v_r20));
|
||||
v_r10 = _mm_add_epi32(_mm_add_epi32(v_r10, v_r20), v_r30);
|
||||
|
||||
v_r10 = _mm_slli_epi32(v_r10, 2);
|
||||
__m128i v_dst0 = _mm_srli_epi32(_mm_add_epi32(_mm_add_epi32(v_r00, v_r10), v_delta), 8);
|
||||
|
||||
v_r01 = _mm_add_epi32(_mm_add_epi32(v_r01, v_r41), _mm_add_epi32(v_r21, v_r21));
|
||||
v_r11 = _mm_add_epi32(_mm_add_epi32(v_r11, v_r21), v_r31);
|
||||
v_r11 = _mm_slli_epi32(v_r11, 2);
|
||||
__m128i v_dst1 = _mm_srli_epi32(_mm_add_epi32(_mm_add_epi32(v_r01, v_r11), v_delta), 8);
|
||||
|
||||
_mm_storeu_si128((__m128i *)(dst + x), _mm_packus_epi32(v_dst0, v_dst1));
|
||||
}
|
||||
|
||||
return x;
|
||||
}
|
||||
|
||||
bool haveSSE;
|
||||
};
|
||||
|
||||
#else
|
||||
|
||||
typedef PyrDownNoVec<int, ushort> PyrDownVec_32s16u;
|
||||
|
||||
#endif // CV_SSE4_1
|
||||
|
||||
struct PyrDownVec_32s16s
|
||||
{
|
||||
PyrDownVec_32s16s()
|
||||
{
|
||||
haveSSE = checkHardwareSupport(CV_CPU_SSE2);
|
||||
}
|
||||
|
||||
int operator()(int** src, short* dst, int, int width) const
|
||||
{
|
||||
int x = 0;
|
||||
|
||||
if (!haveSSE)
|
||||
return x;
|
||||
|
||||
const int *row0 = src[0], *row1 = src[1], *row2 = src[2], *row3 = src[3], *row4 = src[4];
|
||||
__m128i v_delta = _mm_set1_epi32(128);
|
||||
|
||||
for( ; x <= width - 8; x += 8 )
|
||||
{
|
||||
__m128i v_r00 = _mm_loadu_si128((__m128i const *)(row0 + x)),
|
||||
v_r01 = _mm_loadu_si128((__m128i const *)(row0 + x + 4));
|
||||
__m128i v_r10 = _mm_loadu_si128((__m128i const *)(row1 + x)),
|
||||
v_r11 = _mm_loadu_si128((__m128i const *)(row1 + x + 4));
|
||||
__m128i v_r20 = _mm_loadu_si128((__m128i const *)(row2 + x)),
|
||||
v_r21 = _mm_loadu_si128((__m128i const *)(row2 + x + 4));
|
||||
__m128i v_r30 = _mm_loadu_si128((__m128i const *)(row3 + x)),
|
||||
v_r31 = _mm_loadu_si128((__m128i const *)(row3 + x + 4));
|
||||
__m128i v_r40 = _mm_loadu_si128((__m128i const *)(row4 + x)),
|
||||
v_r41 = _mm_loadu_si128((__m128i const *)(row4 + x + 4));
|
||||
|
||||
v_r00 = _mm_add_epi32(_mm_add_epi32(v_r00, v_r40), _mm_add_epi32(v_r20, v_r20));
|
||||
v_r10 = _mm_add_epi32(_mm_add_epi32(v_r10, v_r20), v_r30);
|
||||
|
||||
v_r10 = _mm_slli_epi32(v_r10, 2);
|
||||
__m128i v_dst0 = _mm_srai_epi32(_mm_add_epi32(_mm_add_epi32(v_r00, v_r10), v_delta), 8);
|
||||
|
||||
v_r01 = _mm_add_epi32(_mm_add_epi32(v_r01, v_r41), _mm_add_epi32(v_r21, v_r21));
|
||||
v_r11 = _mm_add_epi32(_mm_add_epi32(v_r11, v_r21), v_r31);
|
||||
v_r11 = _mm_slli_epi32(v_r11, 2);
|
||||
__m128i v_dst1 = _mm_srai_epi32(_mm_add_epi32(_mm_add_epi32(v_r01, v_r11), v_delta), 8);
|
||||
|
||||
_mm_storeu_si128((__m128i *)(dst + x), _mm_packs_epi32(v_dst0, v_dst1));
|
||||
}
|
||||
|
||||
return x;
|
||||
}
|
||||
|
||||
bool haveSSE;
|
||||
};
|
||||
|
||||
struct PyrUpVec_32s8u
|
||||
{
|
||||
int operator()(int** src, uchar** dst, int, int width) const
|
||||
{
|
||||
int x = 0;
|
||||
|
||||
if (!checkHardwareSupport(CV_CPU_SSE2))
|
||||
return x;
|
||||
|
||||
uchar *dst0 = dst[0], *dst1 = dst[1];
|
||||
const uint *row0 = (uint *)src[0], *row1 = (uint *)src[1], *row2 = (uint *)src[2];
|
||||
__m128i v_delta = _mm_set1_epi16(32), v_zero = _mm_setzero_si128();
|
||||
|
||||
for( ; x <= width - 16; x += 16 )
|
||||
{
|
||||
__m128i v_r0 = _mm_packs_epi32(_mm_loadu_si128((__m128i const *)(row0 + x)),
|
||||
_mm_loadu_si128((__m128i const *)(row0 + x + 4)));
|
||||
__m128i v_r1 = _mm_packs_epi32(_mm_loadu_si128((__m128i const *)(row1 + x)),
|
||||
_mm_loadu_si128((__m128i const *)(row1 + x + 4)));
|
||||
__m128i v_r2 = _mm_packs_epi32(_mm_loadu_si128((__m128i const *)(row2 + x)),
|
||||
_mm_loadu_si128((__m128i const *)(row2 + x + 4)));
|
||||
|
||||
__m128i v_2r1 = _mm_adds_epu16(v_r1, v_r1), v_4r1 = _mm_adds_epu16(v_2r1, v_2r1);
|
||||
__m128i v_dst00 = _mm_adds_epu16(_mm_adds_epu16(v_r0, v_r2), _mm_adds_epu16(v_2r1, v_4r1));
|
||||
__m128i v_dst10 = _mm_slli_epi16(_mm_adds_epu16(v_r1, v_r2), 2);
|
||||
|
||||
v_r0 = _mm_packs_epi32(_mm_loadu_si128((__m128i const *)(row0 + x + 8)),
|
||||
_mm_loadu_si128((__m128i const *)(row0 + x + 12)));
|
||||
v_r1 = _mm_packs_epi32(_mm_loadu_si128((__m128i const *)(row1 + x + 8)),
|
||||
_mm_loadu_si128((__m128i const *)(row1 + x + 12)));
|
||||
v_r2 = _mm_packs_epi32(_mm_loadu_si128((__m128i const *)(row2 + x + 8)),
|
||||
_mm_loadu_si128((__m128i const *)(row2 + x + 12)));
|
||||
|
||||
v_2r1 = _mm_adds_epu16(v_r1, v_r1), v_4r1 = _mm_adds_epu16(v_2r1, v_2r1);
|
||||
__m128i v_dst01 = _mm_adds_epu16(_mm_adds_epu16(v_r0, v_r2), _mm_adds_epu16(v_2r1, v_4r1));
|
||||
__m128i v_dst11 = _mm_slli_epi16(_mm_adds_epu16(v_r1, v_r2), 2);
|
||||
|
||||
_mm_storeu_si128((__m128i *)(dst0 + x), _mm_packus_epi16(_mm_srli_epi16(_mm_adds_epu16(v_dst00, v_delta), 6),
|
||||
_mm_srli_epi16(_mm_adds_epu16(v_dst01, v_delta), 6)));
|
||||
_mm_storeu_si128((__m128i *)(dst1 + x), _mm_packus_epi16(_mm_srli_epi16(_mm_adds_epu16(v_dst10, v_delta), 6),
|
||||
_mm_srli_epi16(_mm_adds_epu16(v_dst11, v_delta), 6)));
|
||||
}
|
||||
|
||||
for( ; x <= width - 8; x += 8 )
|
||||
{
|
||||
__m128i v_r0 = _mm_packs_epi32(_mm_loadu_si128((__m128i const *)(row0 + x)),
|
||||
_mm_loadu_si128((__m128i const *)(row0 + x + 4)));
|
||||
__m128i v_r1 = _mm_packs_epi32(_mm_loadu_si128((__m128i const *)(row1 + x)),
|
||||
_mm_loadu_si128((__m128i const *)(row1 + x + 4)));
|
||||
__m128i v_r2 = _mm_packs_epi32(_mm_loadu_si128((__m128i const *)(row2 + x)),
|
||||
_mm_loadu_si128((__m128i const *)(row2 + x + 4)));
|
||||
|
||||
__m128i v_2r1 = _mm_adds_epu16(v_r1, v_r1), v_4r1 = _mm_adds_epu16(v_2r1, v_2r1);
|
||||
__m128i v_dst0 = _mm_adds_epu16(_mm_adds_epu16(v_r0, v_r2), _mm_adds_epu16(v_2r1, v_4r1));
|
||||
__m128i v_dst1 = _mm_slli_epi16(_mm_adds_epu16(v_r1, v_r2), 2);
|
||||
|
||||
_mm_storel_epi64((__m128i *)(dst0 + x), _mm_packus_epi16(_mm_srli_epi16(_mm_adds_epu16(v_dst0, v_delta), 6), v_zero));
|
||||
_mm_storel_epi64((__m128i *)(dst1 + x), _mm_packus_epi16(_mm_srli_epi16(_mm_adds_epu16(v_dst1, v_delta), 6), v_zero));
|
||||
}
|
||||
|
||||
return x;
|
||||
}
|
||||
};
|
||||
|
||||
struct PyrUpVec_32s16s
|
||||
{
|
||||
int operator()(int** src, short** dst, int, int width) const
|
||||
{
|
||||
int x = 0;
|
||||
|
||||
if (!checkHardwareSupport(CV_CPU_SSE2))
|
||||
return x;
|
||||
|
||||
short *dst0 = dst[0], *dst1 = dst[1];
|
||||
const uint *row0 = (uint *)src[0], *row1 = (uint *)src[1], *row2 = (uint *)src[2];
|
||||
__m128i v_delta = _mm_set1_epi32(32), v_zero = _mm_setzero_si128();
|
||||
|
||||
for( ; x <= width - 8; x += 8 )
|
||||
{
|
||||
__m128i v_r0 = _mm_loadu_si128((__m128i const *)(row0 + x)),
|
||||
v_r1 = _mm_loadu_si128((__m128i const *)(row1 + x)),
|
||||
v_r2 = _mm_loadu_si128((__m128i const *)(row2 + x));
|
||||
__m128i v_2r1 = _mm_slli_epi32(v_r1, 1), v_4r1 = _mm_slli_epi32(v_r1, 2);
|
||||
__m128i v_dst00 = _mm_add_epi32(_mm_add_epi32(v_r0, v_r2), _mm_add_epi32(v_2r1, v_4r1));
|
||||
__m128i v_dst10 = _mm_slli_epi32(_mm_add_epi32(v_r1, v_r2), 2);
|
||||
|
||||
v_r0 = _mm_loadu_si128((__m128i const *)(row0 + x + 4));
|
||||
v_r1 = _mm_loadu_si128((__m128i const *)(row1 + x + 4));
|
||||
v_r2 = _mm_loadu_si128((__m128i const *)(row2 + x + 4));
|
||||
v_2r1 = _mm_slli_epi32(v_r1, 1);
|
||||
v_4r1 = _mm_slli_epi32(v_r1, 2);
|
||||
__m128i v_dst01 = _mm_add_epi32(_mm_add_epi32(v_r0, v_r2), _mm_add_epi32(v_2r1, v_4r1));
|
||||
__m128i v_dst11 = _mm_slli_epi32(_mm_add_epi32(v_r1, v_r2), 2);
|
||||
|
||||
_mm_storeu_si128((__m128i *)(dst0 + x),
|
||||
_mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(v_dst00, v_delta), 6),
|
||||
_mm_srai_epi32(_mm_add_epi32(v_dst01, v_delta), 6)));
|
||||
_mm_storeu_si128((__m128i *)(dst1 + x),
|
||||
_mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(v_dst10, v_delta), 6),
|
||||
_mm_srai_epi32(_mm_add_epi32(v_dst11, v_delta), 6)));
|
||||
}
|
||||
|
||||
for( ; x <= width - 4; x += 4 )
|
||||
{
|
||||
__m128i v_r0 = _mm_loadu_si128((__m128i const *)(row0 + x)),
|
||||
v_r1 = _mm_loadu_si128((__m128i const *)(row1 + x)),
|
||||
v_r2 = _mm_loadu_si128((__m128i const *)(row2 + x));
|
||||
__m128i v_2r1 = _mm_slli_epi32(v_r1, 1), v_4r1 = _mm_slli_epi32(v_r1, 2);
|
||||
|
||||
__m128i v_dst0 = _mm_add_epi32(_mm_add_epi32(v_r0, v_r2), _mm_add_epi32(v_2r1, v_4r1));
|
||||
__m128i v_dst1 = _mm_slli_epi32(_mm_add_epi32(v_r1, v_r2), 2);
|
||||
|
||||
_mm_storel_epi64((__m128i *)(dst0 + x),
|
||||
_mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(v_dst0, v_delta), 6), v_zero));
|
||||
_mm_storel_epi64((__m128i *)(dst1 + x),
|
||||
_mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(v_dst1, v_delta), 6), v_zero));
|
||||
}
|
||||
|
||||
return x;
|
||||
}
|
||||
};
|
||||
|
||||
#if CV_SSE4_1
|
||||
|
||||
struct PyrUpVec_32s16u
|
||||
{
|
||||
int operator()(int** src, ushort** dst, int, int width) const
|
||||
{
|
||||
int x = 0;
|
||||
|
||||
if (!checkHardwareSupport(CV_CPU_SSE4_1))
|
||||
return x;
|
||||
|
||||
ushort *dst0 = dst[0], *dst1 = dst[1];
|
||||
const uint *row0 = (uint *)src[0], *row1 = (uint *)src[1], *row2 = (uint *)src[2];
|
||||
__m128i v_delta = _mm_set1_epi32(32), v_zero = _mm_setzero_si128();
|
||||
|
||||
for( ; x <= width - 8; x += 8 )
|
||||
{
|
||||
__m128i v_r0 = _mm_loadu_si128((__m128i const *)(row0 + x)),
|
||||
v_r1 = _mm_loadu_si128((__m128i const *)(row1 + x)),
|
||||
v_r2 = _mm_loadu_si128((__m128i const *)(row2 + x));
|
||||
__m128i v_2r1 = _mm_slli_epi32(v_r1, 1), v_4r1 = _mm_slli_epi32(v_r1, 2);
|
||||
__m128i v_dst00 = _mm_add_epi32(_mm_add_epi32(v_r0, v_r2), _mm_add_epi32(v_2r1, v_4r1));
|
||||
__m128i v_dst10 = _mm_slli_epi32(_mm_add_epi32(v_r1, v_r2), 2);
|
||||
|
||||
v_r0 = _mm_loadu_si128((__m128i const *)(row0 + x + 4));
|
||||
v_r1 = _mm_loadu_si128((__m128i const *)(row1 + x + 4));
|
||||
v_r2 = _mm_loadu_si128((__m128i const *)(row2 + x + 4));
|
||||
v_2r1 = _mm_slli_epi32(v_r1, 1);
|
||||
v_4r1 = _mm_slli_epi32(v_r1, 2);
|
||||
__m128i v_dst01 = _mm_add_epi32(_mm_add_epi32(v_r0, v_r2), _mm_add_epi32(v_2r1, v_4r1));
|
||||
__m128i v_dst11 = _mm_slli_epi32(_mm_add_epi32(v_r1, v_r2), 2);
|
||||
|
||||
_mm_storeu_si128((__m128i *)(dst0 + x),
|
||||
_mm_packus_epi32(_mm_srli_epi32(_mm_add_epi32(v_dst00, v_delta), 6),
|
||||
_mm_srli_epi32(_mm_add_epi32(v_dst01, v_delta), 6)));
|
||||
_mm_storeu_si128((__m128i *)(dst1 + x),
|
||||
_mm_packus_epi32(_mm_srli_epi32(_mm_add_epi32(v_dst10, v_delta), 6),
|
||||
_mm_srli_epi32(_mm_add_epi32(v_dst11, v_delta), 6)));
|
||||
}
|
||||
|
||||
for( ; x <= width - 4; x += 4 )
|
||||
{
|
||||
__m128i v_r0 = _mm_loadu_si128((__m128i const *)(row0 + x)),
|
||||
v_r1 = _mm_loadu_si128((__m128i const *)(row1 + x)),
|
||||
v_r2 = _mm_loadu_si128((__m128i const *)(row2 + x));
|
||||
__m128i v_2r1 = _mm_slli_epi32(v_r1, 1), v_4r1 = _mm_slli_epi32(v_r1, 2);
|
||||
|
||||
__m128i v_dst0 = _mm_add_epi32(_mm_add_epi32(v_r0, v_r2), _mm_add_epi32(v_2r1, v_4r1));
|
||||
__m128i v_dst1 = _mm_slli_epi32(_mm_add_epi32(v_r1, v_r2), 2);
|
||||
|
||||
_mm_storel_epi64((__m128i *)(dst0 + x),
|
||||
_mm_packus_epi32(_mm_srli_epi32(_mm_add_epi32(v_dst0, v_delta), 6), v_zero));
|
||||
_mm_storel_epi64((__m128i *)(dst1 + x),
|
||||
_mm_packus_epi32(_mm_srli_epi32(_mm_add_epi32(v_dst1, v_delta), 6), v_zero));
|
||||
}
|
||||
|
||||
return x;
|
||||
}
|
||||
};
|
||||
|
||||
#else
|
||||
|
||||
typedef PyrUpNoVec<int, uchar> PyrUpVec_32s8u;
|
||||
typedef PyrUpNoVec<int, short> PyrUpVec_32s16s;
|
||||
typedef PyrUpNoVec<int, ushort> PyrUpVec_32s16u;
|
||||
typedef PyrUpNoVec<float, float> PyrUpVec_32f;
|
||||
|
||||
#endif // CV_SSE4_1
|
||||
|
||||
struct PyrUpVec_32f
|
||||
{
|
||||
int operator()(float** src, float** dst, int, int width) const
|
||||
{
|
||||
int x = 0;
|
||||
|
||||
if (!checkHardwareSupport(CV_CPU_SSE2))
|
||||
return x;
|
||||
|
||||
const float *row0 = src[0], *row1 = src[1], *row2 = src[2];
|
||||
float *dst0 = dst[0], *dst1 = dst[1];
|
||||
__m128 v_6 = _mm_set1_ps(6.0f), v_scale = _mm_set1_ps(1.f/64.0f),
|
||||
v_scale4 = _mm_mul_ps(v_scale, _mm_set1_ps(4.0f));
|
||||
|
||||
for( ; x <= width - 8; x += 8 )
|
||||
{
|
||||
__m128 v_r0 = _mm_loadu_ps(row0 + x);
|
||||
__m128 v_r1 = _mm_loadu_ps(row1 + x);
|
||||
__m128 v_r2 = _mm_loadu_ps(row2 + x);
|
||||
|
||||
_mm_storeu_ps(dst1 + x, _mm_mul_ps(v_scale4, _mm_add_ps(v_r1, v_r2)));
|
||||
_mm_storeu_ps(dst0 + x, _mm_mul_ps(v_scale, _mm_add_ps(_mm_add_ps(v_r0, _mm_mul_ps(v_6, v_r1)), v_r2)));
|
||||
|
||||
v_r0 = _mm_loadu_ps(row0 + x + 4);
|
||||
v_r1 = _mm_loadu_ps(row1 + x + 4);
|
||||
v_r2 = _mm_loadu_ps(row2 + x + 4);
|
||||
|
||||
_mm_storeu_ps(dst1 + x + 4, _mm_mul_ps(v_scale4, _mm_add_ps(v_r1, v_r2)));
|
||||
_mm_storeu_ps(dst0 + x + 4, _mm_mul_ps(v_scale, _mm_add_ps(_mm_add_ps(v_r0, _mm_mul_ps(v_6, v_r1)), v_r2)));
|
||||
}
|
||||
|
||||
return x;
|
||||
}
|
||||
};
|
||||
|
||||
#elif CV_NEON
|
||||
|
||||
|
@ -12,6 +12,7 @@
|
||||
//
|
||||
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
|
||||
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
|
||||
// Copyright (C) 2014-2015, Itseez Inc., all rights reserved.
|
||||
// Third party copyrights are property of their respective owners.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without modification,
|
||||
@ -713,6 +714,156 @@ struct ColumnSum<int, ushort> :
|
||||
std::vector<int> sum;
|
||||
};
|
||||
|
||||
template<>
|
||||
struct ColumnSum<int, int> :
|
||||
public BaseColumnFilter
|
||||
{
|
||||
ColumnSum( int _ksize, int _anchor, double _scale ) :
|
||||
BaseColumnFilter()
|
||||
{
|
||||
ksize = _ksize;
|
||||
anchor = _anchor;
|
||||
scale = _scale;
|
||||
sumCount = 0;
|
||||
}
|
||||
|
||||
virtual void reset() { sumCount = 0; }
|
||||
|
||||
virtual void operator()(const uchar** src, uchar* dst, int dststep, int count, int width)
|
||||
{
|
||||
int i;
|
||||
int* SUM;
|
||||
bool haveScale = scale != 1;
|
||||
double _scale = scale;
|
||||
|
||||
#if CV_SSE2
|
||||
bool haveSSE2 = checkHardwareSupport(CV_CPU_SSE2);
|
||||
#endif
|
||||
|
||||
if( width != (int)sum.size() )
|
||||
{
|
||||
sum.resize(width);
|
||||
sumCount = 0;
|
||||
}
|
||||
SUM = &sum[0];
|
||||
if( sumCount == 0 )
|
||||
{
|
||||
memset((void*)SUM, 0, width*sizeof(int));
|
||||
for( ; sumCount < ksize - 1; sumCount++, src++ )
|
||||
{
|
||||
const int* Sp = (const int*)src[0];
|
||||
i = 0;
|
||||
#if CV_SSE2
|
||||
if(haveSSE2)
|
||||
{
|
||||
for( ; i <= width-4; i+=4 )
|
||||
{
|
||||
__m128i _sum = _mm_loadu_si128((const __m128i*)(SUM+i));
|
||||
__m128i _sp = _mm_loadu_si128((const __m128i*)(Sp+i));
|
||||
_mm_storeu_si128((__m128i*)(SUM+i),_mm_add_epi32(_sum, _sp));
|
||||
}
|
||||
}
|
||||
#elif CV_NEON
|
||||
for( ; i <= width - 4; i+=4 )
|
||||
vst1q_s32(SUM + i, vaddq_s32(vld1q_s32(SUM + i), vld1q_s32(Sp + i)));
|
||||
#endif
|
||||
for( ; i < width; i++ )
|
||||
SUM[i] += Sp[i];
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
CV_Assert( sumCount == ksize-1 );
|
||||
src += ksize-1;
|
||||
}
|
||||
|
||||
for( ; count--; src++ )
|
||||
{
|
||||
const int* Sp = (const int*)src[0];
|
||||
const int* Sm = (const int*)src[1-ksize];
|
||||
int* D = (int*)dst;
|
||||
if( haveScale )
|
||||
{
|
||||
i = 0;
|
||||
#if CV_SSE2
|
||||
if(haveSSE2)
|
||||
{
|
||||
const __m128 scale4 = _mm_set1_ps((float)_scale);
|
||||
for( ; i <= width-4; i+=4 )
|
||||
{
|
||||
__m128i _sm = _mm_loadu_si128((const __m128i*)(Sm+i));
|
||||
|
||||
__m128i _s0 = _mm_add_epi32(_mm_loadu_si128((const __m128i*)(SUM+i)),
|
||||
_mm_loadu_si128((const __m128i*)(Sp+i)));
|
||||
|
||||
__m128i _s0T = _mm_cvtps_epi32(_mm_mul_ps(scale4, _mm_cvtepi32_ps(_s0)));
|
||||
|
||||
_mm_storeu_si128((__m128i*)(D+i), _s0T);
|
||||
_mm_storeu_si128((__m128i*)(SUM+i),_mm_sub_epi32(_s0,_sm));
|
||||
}
|
||||
}
|
||||
#elif CV_NEON
|
||||
float32x4_t v_scale = vdupq_n_f32((float)_scale);
|
||||
for( ; i <= width-4; i+=4 )
|
||||
{
|
||||
int32x4_t v_s0 = vaddq_s32(vld1q_s32(SUM + i), vld1q_s32(Sp + i));
|
||||
|
||||
int32x4_t v_s0d = cv_vrndq_s32_f32(vmulq_f32(vcvtq_f32_s32(v_s0), v_scale));
|
||||
vst1q_s32(D + i, v_s0d);
|
||||
|
||||
vst1q_s32(SUM + i, vsubq_s32(v_s0, vld1q_s32(Sm + i)));
|
||||
}
|
||||
#endif
|
||||
for( ; i < width; i++ )
|
||||
{
|
||||
int s0 = SUM[i] + Sp[i];
|
||||
D[i] = saturate_cast<int>(s0*_scale);
|
||||
SUM[i] = s0 - Sm[i];
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
i = 0;
|
||||
#if CV_SSE2
|
||||
if(haveSSE2)
|
||||
{
|
||||
for( ; i <= width-4; i+=4 )
|
||||
{
|
||||
__m128i _sm = _mm_loadu_si128((const __m128i*)(Sm+i));
|
||||
__m128i _s0 = _mm_add_epi32(_mm_loadu_si128((const __m128i*)(SUM+i)),
|
||||
_mm_loadu_si128((const __m128i*)(Sp+i)));
|
||||
|
||||
_mm_storeu_si128((__m128i*)(D+i), _s0);
|
||||
_mm_storeu_si128((__m128i*)(SUM+i), _mm_sub_epi32(_s0,_sm));
|
||||
}
|
||||
}
|
||||
#elif CV_NEON
|
||||
for( ; i <= width-4; i+=4 )
|
||||
{
|
||||
int32x4_t v_s0 = vaddq_s32(vld1q_s32(SUM + i), vld1q_s32(Sp + i));
|
||||
|
||||
vst1q_s32(D + i, v_s0);
|
||||
vst1q_s32(SUM + i, vsubq_s32(v_s0, vld1q_s32(Sm + i)));
|
||||
}
|
||||
#endif
|
||||
|
||||
for( ; i < width; i++ )
|
||||
{
|
||||
int s0 = SUM[i] + Sp[i];
|
||||
D[i] = s0;
|
||||
SUM[i] = s0 - Sm[i];
|
||||
}
|
||||
}
|
||||
dst += dststep;
|
||||
}
|
||||
}
|
||||
|
||||
double scale;
|
||||
int sumCount;
|
||||
std::vector<int> sum;
|
||||
};
|
||||
|
||||
|
||||
template<>
|
||||
struct ColumnSum<int, float> :
|
||||
public BaseColumnFilter
|
||||
|
@ -12,6 +12,7 @@
|
||||
//
|
||||
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
|
||||
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
|
||||
// Copyright (C) 2014, Itseez Inc., all rights reserved.
|
||||
// Third party copyrights are property of their respective owners.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without modification,
|
||||
|
@ -1595,7 +1595,10 @@ void resizeArea(const cv::Mat & src, cv::Mat & dst)
|
||||
TEST(Resize, Area_half)
|
||||
{
|
||||
const int size = 1000;
|
||||
int types[] = { CV_8UC1, CV_8UC4, CV_16UC1, CV_16UC4, CV_16SC1, CV_16SC4, CV_32FC1, CV_32FC4 };
|
||||
int types[] = { CV_8UC1, CV_8UC4,
|
||||
CV_16UC1, CV_16UC4,
|
||||
CV_16SC1, CV_16SC3, CV_16SC4,
|
||||
CV_32FC1, CV_32FC4 };
|
||||
|
||||
cv::RNG rng(17);
|
||||
|
||||
|
@ -64,6 +64,7 @@ TEST(Photo_SeamlessClone_normal, regression)
|
||||
string original_path1 = folder + "source1.png";
|
||||
string original_path2 = folder + "destination1.png";
|
||||
string original_path3 = folder + "mask.png";
|
||||
string reference_path = folder + "reference.png";
|
||||
|
||||
Mat source = imread(original_path1, IMREAD_COLOR);
|
||||
Mat destination = imread(original_path2, IMREAD_COLOR);
|
||||
@ -79,8 +80,8 @@ TEST(Photo_SeamlessClone_normal, regression)
|
||||
p.y = destination.size().height/2;
|
||||
seamlessClone(source, destination, mask, p, result, 1);
|
||||
|
||||
|
||||
Mat reference = imread(folder + "reference.png");
|
||||
Mat reference = imread(reference_path);
|
||||
ASSERT_FALSE(reference.empty()) << "Could not load reference image " << reference_path;
|
||||
|
||||
SAVE(result);
|
||||
|
||||
@ -94,6 +95,7 @@ TEST(Photo_SeamlessClone_mixed, regression)
|
||||
string original_path1 = folder + "source1.png";
|
||||
string original_path2 = folder + "destination1.png";
|
||||
string original_path3 = folder + "mask.png";
|
||||
string reference_path = folder + "reference.png";
|
||||
|
||||
Mat source = imread(original_path1, IMREAD_COLOR);
|
||||
Mat destination = imread(original_path2, IMREAD_COLOR);
|
||||
@ -111,7 +113,9 @@ TEST(Photo_SeamlessClone_mixed, regression)
|
||||
|
||||
SAVE(result);
|
||||
|
||||
Mat reference = imread(folder + "reference.png");
|
||||
Mat reference = imread(reference_path);
|
||||
ASSERT_FALSE(reference.empty()) << "Could not load reference image " << reference_path;
|
||||
|
||||
double error = cvtest::norm(reference, result, NORM_L1);
|
||||
EXPECT_LE(error, numerical_precision);
|
||||
|
||||
@ -123,6 +127,7 @@ TEST(Photo_SeamlessClone_featureExchange, regression)
|
||||
string original_path1 = folder + "source1.png";
|
||||
string original_path2 = folder + "destination1.png";
|
||||
string original_path3 = folder + "mask.png";
|
||||
string reference_path = folder + "reference.png";
|
||||
|
||||
Mat source = imread(original_path1, IMREAD_COLOR);
|
||||
Mat destination = imread(original_path2, IMREAD_COLOR);
|
||||
@ -140,7 +145,9 @@ TEST(Photo_SeamlessClone_featureExchange, regression)
|
||||
|
||||
SAVE(result);
|
||||
|
||||
Mat reference = imread(folder + "reference.png");
|
||||
Mat reference = imread(reference_path);
|
||||
ASSERT_FALSE(reference.empty()) << "Could not load reference image " << reference_path;
|
||||
|
||||
double error = cvtest::norm(reference, result, NORM_L1);
|
||||
EXPECT_LE(error, numerical_precision);
|
||||
|
||||
@ -151,6 +158,7 @@ TEST(Photo_SeamlessClone_colorChange, regression)
|
||||
string folder = string(cvtest::TS::ptr()->get_data_path()) + "cloning/color_change/";
|
||||
string original_path1 = folder + "source1.png";
|
||||
string original_path2 = folder + "mask.png";
|
||||
string reference_path = folder + "reference.png";
|
||||
|
||||
Mat source = imread(original_path1, IMREAD_COLOR);
|
||||
Mat mask = imread(original_path2, IMREAD_COLOR);
|
||||
@ -163,7 +171,9 @@ TEST(Photo_SeamlessClone_colorChange, regression)
|
||||
|
||||
SAVE(result);
|
||||
|
||||
Mat reference = imread(folder + "reference.png");
|
||||
Mat reference = imread(reference_path);
|
||||
ASSERT_FALSE(reference.empty()) << "Could not load reference image " << reference_path;
|
||||
|
||||
double error = cvtest::norm(reference, result, NORM_L1);
|
||||
EXPECT_LE(error, numerical_precision);
|
||||
|
||||
@ -174,6 +184,7 @@ TEST(Photo_SeamlessClone_illuminationChange, regression)
|
||||
string folder = string(cvtest::TS::ptr()->get_data_path()) + "cloning/Illumination_Change/";
|
||||
string original_path1 = folder + "source1.png";
|
||||
string original_path2 = folder + "mask.png";
|
||||
string reference_path = folder + "reference.png";
|
||||
|
||||
Mat source = imread(original_path1, IMREAD_COLOR);
|
||||
Mat mask = imread(original_path2, IMREAD_COLOR);
|
||||
@ -186,7 +197,7 @@ TEST(Photo_SeamlessClone_illuminationChange, regression)
|
||||
|
||||
SAVE(result);
|
||||
|
||||
Mat reference = imread(folder + "reference.png");
|
||||
Mat reference = imread(reference_path);
|
||||
double error = cvtest::norm(reference, result, NORM_L1);
|
||||
EXPECT_LE(error, numerical_precision);
|
||||
|
||||
@ -197,6 +208,7 @@ TEST(Photo_SeamlessClone_textureFlattening, regression)
|
||||
string folder = string(cvtest::TS::ptr()->get_data_path()) + "cloning/Texture_Flattening/";
|
||||
string original_path1 = folder + "source1.png";
|
||||
string original_path2 = folder + "mask.png";
|
||||
string reference_path = folder + "reference.png";
|
||||
|
||||
Mat source = imread(original_path1, IMREAD_COLOR);
|
||||
Mat mask = imread(original_path2, IMREAD_COLOR);
|
||||
@ -209,7 +221,9 @@ TEST(Photo_SeamlessClone_textureFlattening, regression)
|
||||
|
||||
SAVE(result);
|
||||
|
||||
Mat reference = imread(folder + "reference.png");
|
||||
Mat reference = imread(reference_path);
|
||||
ASSERT_FALSE(reference.empty()) << "Could not load reference image " << reference_path;
|
||||
|
||||
double error = cvtest::norm(reference, result, NORM_L1);
|
||||
EXPECT_LE(error, numerical_precision);
|
||||
|
||||
|
@ -2998,6 +2998,12 @@ void printVersionInfo(bool useStdOut)
|
||||
|
||||
std::string cpu_features;
|
||||
|
||||
#if CV_POPCNT
|
||||
if (checkHardwareSupport(CV_CPU_POPCNT)) cpu_features += " popcnt";
|
||||
#endif
|
||||
#if CV_MMX
|
||||
if (checkHardwareSupport(CV_CPU_MMX)) cpu_features += " mmx";
|
||||
#endif
|
||||
#if CV_SSE
|
||||
if (checkHardwareSupport(CV_CPU_SSE)) cpu_features += " sse";
|
||||
#endif
|
||||
@ -3019,6 +3025,39 @@ void printVersionInfo(bool useStdOut)
|
||||
#if CV_AVX
|
||||
if (checkHardwareSupport(CV_CPU_AVX)) cpu_features += " avx";
|
||||
#endif
|
||||
#if CV_AVX2
|
||||
if (checkHardwareSupport(CV_CPU_AVX2)) cpu_features += " avx2";
|
||||
#endif
|
||||
#if CV_FMA3
|
||||
if (checkHardwareSupport(CV_CPU_FMA3)) cpu_features += " fma3";
|
||||
#endif
|
||||
#if CV_AVX_512F
|
||||
if (checkHardwareSupport(CV_CPU_AVX_512F) cpu_features += " avx-512f";
|
||||
#endif
|
||||
#if CV_AVX_512BW
|
||||
if (checkHardwareSupport(CV_CPU_AVX_512BW) cpu_features += " avx-512bw";
|
||||
#endif
|
||||
#if CV_AVX_512CD
|
||||
if (checkHardwareSupport(CV_CPU_AVX_512CD) cpu_features += " avx-512cd";
|
||||
#endif
|
||||
#if CV_AVX_512DQ
|
||||
if (checkHardwareSupport(CV_CPU_AVX_512DQ) cpu_features += " avx-512dq";
|
||||
#endif
|
||||
#if CV_AVX_512ER
|
||||
if (checkHardwareSupport(CV_CPU_AVX_512ER) cpu_features += " avx-512er";
|
||||
#endif
|
||||
#if CV_AVX_512IFMA512
|
||||
if (checkHardwareSupport(CV_CPU_AVX_512IFMA512) cpu_features += " avx-512ifma512";
|
||||
#endif
|
||||
#if CV_AVX_512PF
|
||||
if (checkHardwareSupport(CV_CPU_AVX_512PF) cpu_features += " avx-512pf";
|
||||
#endif
|
||||
#if CV_AVX_512VBMI
|
||||
if (checkHardwareSupport(CV_CPU_AVX_512VBMI) cpu_features += " avx-512vbmi";
|
||||
#endif
|
||||
#if CV_AVX_512VL
|
||||
if (checkHardwareSupport(CV_CPU_AVX_512VL) cpu_features += " avx-512vl";
|
||||
#endif
|
||||
#if CV_NEON
|
||||
if (checkHardwareSupport(CV_CPU_NEON)) cpu_features += " neon";
|
||||
#endif
|
||||
|
Loading…
x
Reference in New Issue
Block a user