From a275489f0a5e3fc033ba7221f0829872a0893663 Mon Sep 17 00:00:00 2001 From: Maksim Shabunin Date: Thu, 27 Aug 2015 16:53:44 +0300 Subject: [PATCH 1/2] HAL universal intrinsics tests and documentation --- modules/hal/include/opencv2/hal.hpp | 13 + modules/hal/include/opencv2/hal/defs.h | 20 +- modules/hal/include/opencv2/hal/intrin.hpp | 28 + .../hal/include/opencv2/hal/intrin_cpp.hpp | 1069 +++++++++++++++-- .../hal/include/opencv2/hal/intrin_neon.hpp | 31 +- .../hal/include/opencv2/hal/intrin_sse.hpp | 66 +- modules/hal/test/test_intrin.cpp | 864 +++++++++++++ modules/hal/test/test_intrin_utils.hpp | 234 ++++ modules/hal/test/test_main.cpp | 3 + modules/hal/test/test_precomp.hpp | 11 + 10 files changed, 2226 insertions(+), 113 deletions(-) create mode 100644 modules/hal/test/test_intrin.cpp create mode 100644 modules/hal/test/test_intrin_utils.hpp create mode 100644 modules/hal/test/test_main.cpp create mode 100644 modules/hal/test/test_precomp.hpp diff --git a/modules/hal/include/opencv2/hal.hpp b/modules/hal/include/opencv2/hal.hpp index 95d1ac66c..9d448757d 100644 --- a/modules/hal/include/opencv2/hal.hpp +++ b/modules/hal/include/opencv2/hal.hpp @@ -49,10 +49,21 @@ /** @defgroup hal Hardware Acceleration Layer + @{ + @defgroup hal_intrin Universal intrinsics + @{ + @defgroup hal_intrin_impl Private implementation helpers + @} + @defgroup hal_utils Platform-dependent utils + @} */ + namespace cv { namespace hal { +//! @addtogroup hal +//! @{ + namespace Error { enum @@ -93,6 +104,8 @@ void sqrt(const double* src, double* dst, int len); void invSqrt(const float* src, float* dst, int len); void invSqrt(const double* src, double* dst, int len); +//! @} + }} //cv::hal #endif //__OPENCV_HAL_HPP__ diff --git a/modules/hal/include/opencv2/hal/defs.h b/modules/hal/include/opencv2/hal/defs.h index 1c30073a0..f7d5f3556 100644 --- a/modules/hal/include/opencv2/hal/defs.h +++ b/modules/hal/include/opencv2/hal/defs.h @@ -45,6 +45,9 @@ #ifndef __OPENCV_DEF_H__ #define __OPENCV_DEF_H__ +//! @addtogroup hal_utils +//! @{ + #if !defined _CRT_SECURE_NO_DEPRECATE && defined _MSC_VER && _MSC_VER > 1300 # define _CRT_SECURE_NO_DEPRECATE /* to avoid multiple Visual Studio warnings */ #endif @@ -335,9 +338,6 @@ Cv64suf; # include "tegra_round.hpp" #endif -//! @addtogroup core_utils -//! @{ - #if CV_VFP // 1. general scheme #define ARM_ROUND(_value, _asm_string) \ @@ -567,15 +567,19 @@ CV_INLINE int cvIsInf( float value ) return (ieee754.u & 0x7fffffff) == 0x7f800000; } +//! @} + #include namespace cv { +//! @addtogroup hal_utils +//! @{ + /////////////// saturate_cast (used in image & signal processing) /////////////////// -/** - Template function for accurate conversion from one primitive type to another. +/** @brief Template function for accurate conversion from one primitive type to another. The functions saturate_cast resemble the standard C++ cast operations, such as static_cast\() and others. They perform an efficient and accurate conversion from one primitive type to another @@ -618,8 +622,6 @@ template static inline _Tp saturate_cast(int64 v) { return _Tp( /** @overload */ template static inline _Tp saturate_cast(uint64 v) { return _Tp(v); } -//! @cond IGNORED - template<> inline uchar saturate_cast(schar v) { return (uchar)std::max((int)v, 0); } template<> inline uchar saturate_cast(ushort v) { return (uchar)std::min((unsigned)v, (unsigned)UCHAR_MAX); } template<> inline uchar saturate_cast(int v) { return (uchar)((unsigned)v <= UCHAR_MAX ? v : v > 0 ? 
UCHAR_MAX : 0); }
@@ -664,12 +666,10 @@ template<> inline int saturate_cast<int>(double v) { return cvRound(v)
 template<> inline unsigned saturate_cast<unsigned>(float v) { return cvRound(v); }
 template<> inline unsigned saturate_cast<unsigned>(double v) { return cvRound(v); }
 
-//! @endcond
+//! @}
 
 }
 
 #endif // __cplusplus
 
-//! @} core_utils
-
 #endif //__OPENCV_HAL_H__
diff --git a/modules/hal/include/opencv2/hal/intrin.hpp b/modules/hal/include/opencv2/hal/intrin.hpp
index 767c5780d..c8d59c964 100644
--- a/modules/hal/include/opencv2/hal/intrin.hpp
+++ b/modules/hal/include/opencv2/hal/intrin.hpp
@@ -48,6 +48,7 @@
 #include
 #include
 #include
+#include "opencv2/hal/defs.h"
 
 #define OPENCV_HAL_ADD(a, b) ((a) + (b))
 #define OPENCV_HAL_AND(a, b) ((a) & (b))
@@ -59,6 +60,10 @@
 // access from within opencv code more accessible
 namespace cv {
 
+//! @addtogroup hal_intrin
+//! @{
+
+//! @cond IGNORED
 template<typename _Tp> struct V_TypeTraits
 {
     typedef _Tp int_type;
@@ -82,6 +87,7 @@ template<> struct V_TypeTraits
     typedef int sum_type;
 
     typedef ushort w_type;
+    typedef unsigned q_type;
 
     enum { delta = 128, shift = 8 };
 
@@ -99,6 +105,7 @@ template<> struct V_TypeTraits
     typedef int sum_type;
 
     typedef short w_type;
+    typedef int q_type;
 
     enum { delta = 128, shift = 8 };
 
@@ -265,8 +272,22 @@ template<> struct V_TypeTraits
     }
 };
 
+template<typename T> struct V_SIMD128Traits
+{
+    enum { nlanes = 16 / sizeof(T) };
+};
+
+//! @endcond
+
+//! @}
+
 }
 
+#ifdef CV_DOXYGEN
+#   undef CV_SSE2
+#   undef CV_NEON
+#endif
+
 #if CV_SSE2
 
 #include "opencv2/hal/intrin_sse.hpp"
@@ -281,12 +302,19 @@ template<> struct V_TypeTraits
 
 #endif
 
+//! @addtogroup hal_intrin
+//! @{
+
 #ifndef CV_SIMD128
+//! Set to 1 if the current compiler supports vector extensions (NEON or SSE is enabled)
#define CV_SIMD128 0
 #endif
 
 #ifndef CV_SIMD128_64F
+//! Set to 1 if the current intrinsics implementation supports 64-bit float vectors
#define CV_SIMD128_64F 0
 #endif
 
+//! @}
+
 #endif
diff --git a/modules/hal/include/opencv2/hal/intrin_cpp.hpp b/modules/hal/include/opencv2/hal/intrin_cpp.hpp
index 683305cc2..e1b1044a3 100644
--- a/modules/hal/include/opencv2/hal/intrin_cpp.hpp
+++ b/modules/hal/include/opencv2/hal/intrin_cpp.hpp
@@ -45,25 +45,233 @@
 #ifndef __OPENCV_HAL_INTRIN_CPP_HPP__
 #define __OPENCV_HAL_INTRIN_CPP_HPP__
 
+#include <limits>
+#include <cstring>
+
 namespace cv
 {
 
+/** @addtogroup hal_intrin
+
+"Universal intrinsics" is a set of types and functions intended to simplify vectorization of code
+across different platforms. Currently there are two supported SIMD extensions: __SSE/SSE2__ on x86
+architectures and __NEON__ on ARM architectures; both allow working with 128-bit registers
+containing packed values of different types. If no SIMD extension is available during compilation,
+a fallback C++ implementation of the intrinsics is chosen, and the code still works as expected,
+although it may be slower.
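+
+A minimal usage sketch (a hypothetical function; the buffer length is assumed to be a multiple
+of 4; the same loop body compiles to SSE, NEON or the C++ fallback):
+@code{.cpp}
+// add two float arrays, four lanes per iteration
+void add_arrays(const float* a, const float* b, float* dst, int len)
+{
+    for( int i = 0; i < len; i += 4 )
+    {
+        v_float32x4 x = v_load(a + i);
+        v_float32x4 y = v_load(b + i);
+        v_store(dst + i, x + y);
+    }
+}
+@endcode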
+
+### Types
+
+There are several types representing a 128-bit register as a vector of packed values; each type is
+implemented as a structure wrapping a single SIMD register.
+
+- cv::v_uint8x16 and cv::v_int8x16: sixteen 8-bit integer values (unsigned/signed) - char
+- cv::v_uint16x8 and cv::v_int16x8: eight 16-bit integer values (unsigned/signed) - short
+- cv::v_uint32x4 and cv::v_int32x4: four 32-bit integer values (unsigned/signed) - int
+- cv::v_uint64x2 and cv::v_int64x2: two 64-bit integer values (unsigned/signed) - int64
+- cv::v_float32x4: four 32-bit floating point values (signed) - float
+- cv::v_float64x2: two 64-bit floating point values (signed) - double
+
+@note
+cv::v_float64x2 is not implemented in the NEON variant; if you want to use this type, don't forget
+to check the CV_SIMD128_64F preprocessor definition:
+@code
+#if CV_SIMD128_64F
+//...
+#endif
+@endcode
+
+### Load and store operations
+
+These operations allow you to set the contents of a register explicitly or load them from a memory
+block, and to save the contents of a register back to a memory block.
+
+- Constructors:
+@ref v_reg::v_reg(const _Tp *ptr) "from memory",
+@ref v_reg::v_reg(_Tp s0, _Tp s1) "from two values", ...
+- Other create methods:
+@ref v_setall_s8, @ref v_setall_u8, ...,
+@ref v_setzero_u8, @ref v_setzero_s8, ...
+- Memory operations:
+@ref v_load, @ref v_load_aligned, @ref v_load_halves,
+@ref v_store, @ref v_store_aligned,
+@ref v_store_high, @ref v_store_low
+
+### Value reordering
+
+These operations reorder or recombine elements of one or multiple vectors.
+
+- Interleave, deinterleave (3 and 4 channels): @ref v_load_deinterleave, @ref v_store_interleave
+- Expand: @ref v_load_expand, @ref v_load_expand_q, @ref v_expand
+- Pack: @ref v_pack, @ref v_pack_u, @ref v_rshr_pack, @ref v_rshr_pack_u,
+@ref v_pack_store, @ref v_pack_u_store, @ref v_rshr_pack_store, @ref v_rshr_pack_u_store
+- Recombine: @ref v_zip, @ref v_recombine, @ref v_combine_low, @ref v_combine_high
+- Extract: @ref v_extract
+
+
+### Arithmetic, bitwise and comparison operations
+
+Element-wise binary and unary operations.
+
+- Arithmetic:
+@ref operator+(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) "+",
+@ref operator-(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) "-",
+@ref operator*(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) "*",
+@ref operator/(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) "/",
+@ref v_mul_expand
+
+- Non-saturating arithmetic: @ref v_add_wrap, @ref v_sub_wrap
+
+- Bitwise shifts:
+@ref operator<<(const v_reg<_Tp, n>& a, int s) "<<",
+@ref operator>>(const v_reg<_Tp, n>& a, int s) ">>",
+@ref v_shl, @ref v_shr
+
+- Bitwise logic:
+@ref operator&(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) "&",
+@ref operator|(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) "|",
+@ref operator^(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) "^",
+@ref operator~(const v_reg<_Tp, n>& a) "~"
+
+- Comparison:
+@ref operator>(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) ">",
+@ref operator>=(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) ">=",
+@ref operator<(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) "<",
+@ref operator<=(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) "<=",
+@ref operator==(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) "==",
+@ref operator!=(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) "!="
+
+- min/max: @ref v_min, @ref v_max
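+
+For example, a comparison produces a per-lane all-ones/all-zeros mask that can drive @ref v_select
+(a small sketch with hypothetical values):
+@code{.cpp}
+v_int32x4 a(1, 5, 3, 7), b(4, 4, 4, 4);
+v_int32x4 mask = a > b;             // {0, -1, 0, -1}
+v_int32x4 c = v_select(mask, b, a); // {1, 4, 3, 4} - the lane-wise minimum
+@endcode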
+
+### Reduce and mask
+
+Most of these operations return only one value.
+
+- Reduce: @ref v_reduce_min, @ref v_reduce_max, @ref v_reduce_sum
+- Mask: @ref v_signmask, @ref v_check_all, @ref v_check_any, @ref v_select
+
+### Other math
+
+- Some frequent operations: @ref v_sqrt, @ref v_invsqrt, @ref v_magnitude, @ref v_sqr_magnitude
+- Absolute values: @ref v_abs, @ref v_absdiff
+
+### Conversions
+
+Different type conversions and casts:
+
+- Rounding: @ref v_round, @ref v_floor, @ref v_ceil, @ref v_trunc
+- To float: @ref v_cvt_f32, @ref v_cvt_f64
+- Reinterpret: @ref v_reinterpret_as_u8, @ref v_reinterpret_as_s8, ...
+
+### Matrix operations
+
+In these operations vectors represent matrix rows/columns: @ref v_dotprod, @ref v_matmul, @ref v_transpose4x4
+
+### Usability
+
+Most operations are implemented only for some subset of the available types; the following matrices
+show the applicability of different operations to the types.
+
+Regular integers:
+
+| Operations\\Types | uint 8x16 | int 8x16 | uint 16x8 | int 16x8 | uint 32x4 | int 32x4 |
+|-------------------|:-:|:-:|:-:|:-:|:-:|:-:|
+|load, store        | x | x | x | x | x | x |
+|interleave         | x | x | x | x | x | x |
+|expand             | x | x | x | x | x | x |
+|expand_q           | x | x |   |   |   |   |
+|add, sub           | x | x | x | x | x | x |
+|add_wrap, sub_wrap | x | x | x | x |   |   |
+|mul                |   |   | x | x | x | x |
+|mul_expand         |   |   | x | x | x |   |
+|compare            | x | x | x | x | x | x |
+|shift              |   |   | x | x | x | x |
+|dotprod            |   |   |   | x |   |   |
+|logical            | x | x | x | x | x | x |
+|min, max           | x | x | x | x | x | x |
+|absdiff            | x | x | x | x | x | x |
+|reduce             |   |   |   |   | x | x |
+|mask               | x | x | x | x | x | x |
+|pack               | x | x | x | x | x | x |
+|pack_u             | x |   | x |   |   |   |
+|unpack             | x | x | x | x | x | x |
+|extract            | x | x | x | x | x | x |
+|cvt_flt32          |   |   |   |   |   | x |
+|cvt_flt64          |   |   |   |   |   | x |
+|transpose4x4       |   |   |   |   | x | x |
+
+Big integers:
+
+| Operations\\Types | uint 64x2 | int 64x2 |
+|-------------------|:-:|:-:|
+|load, store        | x | x |
+|add, sub           | x | x |
+|shift              | x | x |
+|logical            | x | x |
+|extract            | x | x |
+
+Floating point:
+
+| Operations\\Types | float 32x4 | float 64x2 |
+|-------------------|:-:|:-:|
+|load, store        | x | x |
+|interleave         | x |   |
+|add, sub           | x | x |
+|mul                | x | x |
+|div                | x | x |
+|compare            | x | x |
+|min, max           | x | x |
+|absdiff            | x | x |
+|reduce             | x |   |
+|mask               | x | x |
+|unpack             | x | x |
+|cvt_flt32          |   | x |
+|cvt_flt64          | x |   |
+|sqrt, abs          | x | x |
+|float math         | x | x |
+|transpose4x4       | x |   |
+
+
+@{ */
+
 template<typename _Tp, int n> struct v_reg
 {
+//! @cond IGNORED
     typedef _Tp lane_type;
     typedef v_reg<typename V_TypeTraits<_Tp>::int_type, n> int_vec;
     typedef v_reg<typename V_TypeTraits<_Tp>::abs_type, n> abs_vec;
     enum { nlanes = n };
+//! @endcond
+
+    /** @brief Constructor
+
+    Initializes register with data from memory
+    @param ptr pointer to memory block with data for register */
     explicit v_reg(const _Tp* ptr) { for( int i = 0; i < n; i++ ) s[i] = ptr[i]; }
+
+    /** @brief Constructor
+
+    Initializes register with two 64-bit values */
     v_reg(_Tp s0, _Tp s1) { s[0] = s0; s[1] = s1; }
+
+    /** @brief Constructor
+
+    Initializes register with four 32-bit values */
     v_reg(_Tp s0, _Tp s1, _Tp s2, _Tp s3) { s[0] = s0; s[1] = s1; s[2] = s2; s[3] = s3; }
+
+    /** @brief Constructor
+
+    Initializes register with eight 16-bit values */
     v_reg(_Tp s0, _Tp s1, _Tp s2, _Tp s3,
           _Tp s4, _Tp s5, _Tp s6, _Tp s7)
     {
         s[0] = s0; s[1] = s1; s[2] = s2; s[3] = s3;
         s[4] = s4; s[5] = s5; s[6] = s6; s[7] = s7;
     }
+
+    /** @brief Constructor
+
+    Initializes register with sixteen 8-bit values */
     v_reg(_Tp s0, _Tp s1, _Tp s2, _Tp s3,
           _Tp s4, _Tp s5, _Tp s6, _Tp s7,
           _Tp s8, _Tp s9, _Tp s10, _Tp s11,
@@ -75,15 +283,31 @@ template<typename _Tp, int n> struct v_reg
         s[12] = s12; s[13] = s13; s[14] = s14; s[15] = s15;
     }
 
+    /** @brief Default constructor
+
+    Does not initialize anything. */
     v_reg() {}
+
+    /** @brief Copy constructor */
     v_reg(const v_reg<_Tp, n> & r)
     {
         for( int i = 0; i < n; i++ )
             s[i] = r.s[i];
     }
 
+    /** @brief Access first value
 
-    _Tp get(const int i) const { return s[i]; }
+    Returns the value of the first lane according to the register type, for example:
+    @code{.cpp}
+    v_int32x4 r(1, 2, 3, 4);
+    int v = r.get0(); // returns 1
+    v_uint64x2 r(1, 2);
+    uint64_t v = r.get0(); // returns 1
+    @endcode
+    */
     _Tp get0() const { return s[0]; }
+
+//! @cond IGNORED
+    _Tp get(const int i) const { return s[i]; }
     v_reg<_Tp, n> high() const
     {
         v_reg<_Tp, n> c;
@@ -116,13 +340,37 @@ template<typename _Tp, int n> struct v_reg
     {
         size_t bytes = std::min(sizeof(_Tp2)*n2, sizeof(_Tp)*n);
         v_reg<_Tp2, n2> c;
-        memcpy(&c.s[0], &s[0], bytes);
+        std::memcpy(&c.s[0], &s[0], bytes);
         return c;
     }
 
     _Tp s[n];
+//! @endcond
 };
 
+/** @brief Sixteen 8-bit unsigned integer values */
+typedef v_reg<uchar, 16> v_uint8x16;
+/** @brief Sixteen 8-bit signed integer values */
+typedef v_reg<schar, 16> v_int8x16;
+/** @brief Eight 16-bit unsigned integer values */
+typedef v_reg<ushort, 8> v_uint16x8;
+/** @brief Eight 16-bit signed integer values */
+typedef v_reg<short, 8> v_int16x8;
+/** @brief Four 32-bit unsigned integer values */
+typedef v_reg<unsigned, 4> v_uint32x4;
+/** @brief Four 32-bit signed integer values */
+typedef v_reg<int, 4> v_int32x4;
+/** @brief Four 32-bit floating point values (single precision) */
+typedef v_reg<float, 4> v_float32x4;
+/** @brief Two 64-bit floating point values (double precision) */
+typedef v_reg<double, 2> v_float64x2;
+/** @brief Two 64-bit unsigned integer values */
+typedef v_reg<uint64, 2> v_uint64x2;
+/** @brief Two 64-bit signed integer values */
+typedef v_reg<int64, 2> v_int64x2;
+
+//! @brief Helper macro
+//! @ingroup hal_intrin_impl
 #define OPENCV_HAL_IMPL_BIN_OP(bin_op) \
 template<typename _Tp, int n> inline v_reg<_Tp, n> \
     operator bin_op (const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
 { \
@@ -140,11 +388,28 @@ template<typename _Tp, int n> inline v_reg<_Tp, n>& \
     return a; \
 }
 
+/** @brief Add values
+
+For all types. */
 OPENCV_HAL_IMPL_BIN_OP(+)
+
+/** @brief Subtract values
+
+For all types. */
 OPENCV_HAL_IMPL_BIN_OP(-)
+
+/** @brief Multiply values
+
+For 16- and 32-bit integer types and floating types. */
 OPENCV_HAL_IMPL_BIN_OP(*)
+
+/** @brief Divide values
+
+For floating types only. */
 OPENCV_HAL_IMPL_BIN_OP(/)
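+
+// Usage sketch for the operators defined above (hypothetical values; all lane-wise):
+//     v_float32x4 a(1.f, 2.f, 3.f, 4.f), b(4.f, 3.f, 2.f, 1.f);
+//     v_float32x4 c = a + b; // {5, 5, 5, 5}
+
+//! @brief Helper macro
+//!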
@ingroup hal_intrin_impl #define OPENCV_HAL_IMPL_BIT_OP(bit_op) \ template inline v_reg<_Tp, n> operator bit_op \ (const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \ @@ -166,10 +431,24 @@ template inline v_reg<_Tp, n>& operator \ return a; \ } +/** @brief Bitwise AND + +Only for integer types. */ OPENCV_HAL_IMPL_BIT_OP(&) + +/** @brief Bitwise OR + +Only for integer types. */ OPENCV_HAL_IMPL_BIT_OP(|) + +/** @brief Bitwise XOR + +Only for integer types.*/ OPENCV_HAL_IMPL_BIT_OP(^) +/** @brief Bitwise NOT + +Only for integer types.*/ template inline v_reg<_Tp, n> operator ~ (const v_reg<_Tp, n>& a) { v_reg<_Tp, n> c; @@ -178,6 +457,8 @@ template inline v_reg<_Tp, n> operator ~ (const v_reg<_Tp, return c; } +//! @brief Helper macro +//! @ingroup hal_intrin_impl #define OPENCV_HAL_IMPL_MATH_FUNC(func, cfunc, _Tp2) \ template inline v_reg<_Tp2, n> func(const v_reg<_Tp, n>& a) \ { \ @@ -187,27 +468,59 @@ template inline v_reg<_Tp2, n> func(const v_reg<_Tp, n>& a) return c; \ } +/** @brief Square root of elements + +Only for floating point types.*/ OPENCV_HAL_IMPL_MATH_FUNC(v_sqrt, std::sqrt, _Tp) + +//! @cond IGNORED OPENCV_HAL_IMPL_MATH_FUNC(v_sin, std::sin, _Tp) OPENCV_HAL_IMPL_MATH_FUNC(v_cos, std::cos, _Tp) OPENCV_HAL_IMPL_MATH_FUNC(v_exp, std::exp, _Tp) OPENCV_HAL_IMPL_MATH_FUNC(v_log, std::log, _Tp) +//! @endcond + +/** @brief Absolute value of elements + +Only for floating point types.*/ OPENCV_HAL_IMPL_MATH_FUNC(v_abs, (typename V_TypeTraits<_Tp>::abs_type)std::abs, typename V_TypeTraits<_Tp>::abs_type) + +/** @brief Round elements + +Only for floating point types.*/ OPENCV_HAL_IMPL_MATH_FUNC(v_round, cvRound, int) + +/** @brief Floor elements + +Only for floating point types.*/ OPENCV_HAL_IMPL_MATH_FUNC(v_floor, cvFloor, int) + +/** @brief Ceil elements + +Only for floating point types.*/ OPENCV_HAL_IMPL_MATH_FUNC(v_ceil, cvCeil, int) + +/** @brief Truncate elements + +Only for floating point types.*/ OPENCV_HAL_IMPL_MATH_FUNC(v_trunc, int, int) -#define OPENCV_HAL_IMPL_MINMAX_FUNC(func, hfunc, cfunc) \ +//! @brief Helper macro +//! @ingroup hal_intrin_impl +#define OPENCV_HAL_IMPL_MINMAX_FUNC(func, cfunc) \ template inline v_reg<_Tp, n> func(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \ { \ v_reg<_Tp, n> c; \ for( int i = 0; i < n; i++ ) \ c.s[i] = cfunc(a.s[i], b.s[i]); \ return c; \ -} \ -template inline _Tp hfunc(const v_reg<_Tp, n>& a) \ +} + +//! @brief Helper macro +//! @ingroup hal_intrin_impl +#define OPENCV_HAL_IMPL_REDUCE_MINMAX_FUNC(func, cfunc) \ +template inline _Tp func(const v_reg<_Tp, n>& a) \ { \ _Tp c = a.s[0]; \ for( int i = 1; i < n; i++ ) \ @@ -215,9 +528,49 @@ template inline _Tp hfunc(const v_reg<_Tp, n>& a) \ return c; \ } -OPENCV_HAL_IMPL_MINMAX_FUNC(v_min, v_reduce_min, std::min) -OPENCV_HAL_IMPL_MINMAX_FUNC(v_max, v_reduce_max, std::max) +/** @brief Choose min values for each pair +Scheme: +@code +{A1 A2 ...} +{B1 B2 ...} +-------------- +{min(A1,B1) min(A2,B2) ...} +@endcode +For all types except 64-bit integer. */ +OPENCV_HAL_IMPL_MINMAX_FUNC(v_min, std::min) + +/** @brief Choose max values for each pair + +Scheme: +@code +{A1 A2 ...} +{B1 B2 ...} +-------------- +{max(A1,B1) max(A2,B2) ...} +@endcode +For all types except 64-bit integer. */ +OPENCV_HAL_IMPL_MINMAX_FUNC(v_max, std::max) + +/** @brief Find one min value + +Scheme: +@code +{A1 A2 A3 ...} => min(A1,A2,A3,...) +@endcode +For 32-bit integer and 32-bit floating point types. 
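+Example (hypothetical values):
+@code{.cpp}
+v_int32x4 a(4, 1, 3, 2);
+int m = v_reduce_min(a); // m == 1
+@endcode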
*/ +OPENCV_HAL_IMPL_REDUCE_MINMAX_FUNC(v_reduce_min, std::min) + +/** @brief Find one max value + +Scheme: +@code +{A1 A2 A3 ...} => max(A1,A2,A3,...) +@endcode +For 32-bit integer and 32-bit floating point types. */ +OPENCV_HAL_IMPL_REDUCE_MINMAX_FUNC(v_reduce_max, std::max) + +//! @cond IGNORED template inline void v_minmax( const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b, v_reg<_Tp, n>& minval, v_reg<_Tp, n>& maxval ) @@ -228,8 +581,10 @@ inline void v_minmax( const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b, maxval.s[i] = std::max(a.s[i], b.s[i]); } } +//! @endcond - +//! @brief Helper macro +//! @ingroup hal_intrin_impl #define OPENCV_HAL_IMPL_CMP_OP(cmp_op) \ template \ inline v_reg<_Tp, n> operator cmp_op(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \ @@ -241,13 +596,38 @@ inline v_reg<_Tp, n> operator cmp_op(const v_reg<_Tp, n>& a, const v_reg<_Tp, n> return c; \ } +/** @brief Less-than comparison + +For all types except 64-bit integer values. */ OPENCV_HAL_IMPL_CMP_OP(<) + +/** @brief Greater-than comparison + +For all types except 64-bit integer values. */ OPENCV_HAL_IMPL_CMP_OP(>) + +/** @brief Less-than or equal comparison + +For all types except 64-bit integer values. */ OPENCV_HAL_IMPL_CMP_OP(<=) + +/** @brief Greater-than or equal comparison + +For all types except 64-bit integer values. */ OPENCV_HAL_IMPL_CMP_OP(>=) + +/** @brief Equal comparison + +For all types except 64-bit integer values. */ OPENCV_HAL_IMPL_CMP_OP(==) + +/** @brief Not equal comparison + +For all types except 64-bit integer values. */ OPENCV_HAL_IMPL_CMP_OP(!=) +//! @brief Helper macro +//! @ingroup hal_intrin_impl #define OPENCV_HAL_IMPL_ADD_SUB_OP(func, bin_op, cast_op, _Tp2) \ template \ inline v_reg<_Tp2, n> func(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \ @@ -259,10 +639,73 @@ inline v_reg<_Tp2, n> func(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \ return c; \ } -OPENCV_HAL_IMPL_ADD_SUB_OP(v_add_wrap, +, (_Tp), _Tp) -OPENCV_HAL_IMPL_ADD_SUB_OP(v_sub_wrap, -, (_Tp), _Tp) -OPENCV_HAL_IMPL_ADD_SUB_OP(v_absdiff, -, (rtype)std::abs, typename V_TypeTraits<_Tp>::abs_type) +/** @brief Add values without saturation +For 8- and 16-bit integer values. */ +OPENCV_HAL_IMPL_ADD_SUB_OP(v_add_wrap, +, (_Tp), _Tp) + +/** @brief Subtract values without saturation + +For 8- and 16-bit integer values. */ +OPENCV_HAL_IMPL_ADD_SUB_OP(v_sub_wrap, -, (_Tp), _Tp) + +//! @cond IGNORED +template inline T _absdiff(T a, T b) +{ + return a > b ? a - b : b - a; +} +//! @endcond + +/** @brief Absolute difference + +Returns \f$ |a - b| \f$ converted to corresponding unsigned type. +Example: +@code{.cpp} +v_int32x4 a, b; // {1, 2, 3, 4} and {4, 3, 2, 1} +v_uint32x4 c = v_absdiff(a, b); // result is {3, 1, 1, 3} +@endcode +For 8-, 16-, 32-bit integer source types. */ +template +inline v_reg::abs_type, n> v_absdiff(const v_reg<_Tp, n>& a, const v_reg<_Tp, n> & b) +{ + typedef typename V_TypeTraits<_Tp>::abs_type rtype; + v_reg c; + const rtype mask = std::numeric_limits<_Tp>::is_signed ? 
(1 << (sizeof(rtype)*8 - 1)) : 0;
+    for( int i = 0; i < n; i++ )
+    {
+        rtype ua = a.s[i] ^ mask;
+        rtype ub = b.s[i] ^ mask;
+        c.s[i] = _absdiff(ua, ub);
+    }
+    return c;
+}
+
+/** @overload
+
+For 32-bit floating point values */
+inline v_float32x4 v_absdiff(const v_float32x4& a, const v_float32x4& b)
+{
+    v_float32x4 c;
+    for( int i = 0; i < c.nlanes; i++ )
+        c.s[i] = _absdiff(a.s[i], b.s[i]);
+    return c;
+}
+
+/** @overload
+
+For 64-bit floating point values */
+inline v_float64x2 v_absdiff(const v_float64x2& a, const v_float64x2& b)
+{
+    v_float64x2 c;
+    for( int i = 0; i < c.nlanes; i++ )
+        c.s[i] = _absdiff(a.s[i], b.s[i]);
+    return c;
+}
+
+/** @brief Inverse square root
+
+Returns \f$ 1/\sqrt{a} \f$
+For floating point types only. */
 template<typename _Tp, int n>
 inline v_reg<_Tp, n> v_invsqrt(const v_reg<_Tp, n>& a)
 {
@@ -272,6 +715,10 @@ inline v_reg<_Tp, n> v_invsqrt(const v_reg<_Tp, n>& a)
     return c;
 }
 
+/** @brief Magnitude
+
+Returns \f$ \sqrt{a^2 + b^2} \f$
+For floating point types only. */
 template<typename _Tp, int n>
 inline v_reg<_Tp, n> v_magnitude(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b)
 {
@@ -281,7 +728,10 @@ inline v_reg<_Tp, n> v_magnitude(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b)
     return c;
 }
 
+/** @brief Square of the magnitude
+
+Returns \f$ a^2 + b^2 \f$
+For floating point types only. */
 template<typename _Tp, int n>
 inline v_reg<_Tp, n> v_sqr_magnitude(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b)
 {
@@ -291,6 +741,10 @@
     return c;
 }
 
+/** @brief Multiply and add
+
+Returns \f$ a*b + c \f$
+For floating point types only. */
 template<typename _Tp, int n>
 inline v_reg<_Tp, n> v_muladd(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b,
                               const v_reg<_Tp, n>& c)
@@ -301,6 +755,18 @@
     return d;
 }
 
+/** @brief Dot product of elements
+
+Multiply values in two registers and sum adjacent result pairs.
+Scheme:
+@code
+  {A1 A2 ...} // 16-bit
+x {B1 B2 ...} // 16-bit
+-------------
+{A1B1+A2B2 ...} // 32-bit
+@endcode
+Implemented only for 16-bit signed source type (v_int16x8).
+*/
 template<typename _Tp, int n> inline v_reg<typename V_TypeTraits<_Tp>::w_type, n/2>
     v_dotprod(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b)
 {
@@ -311,6 +777,25 @@
     return c;
 }
 
+/** @brief Multiply and expand
+
+Multiply values from two registers and store the results in two registers with the wider pack type.
+Scheme:
+@code
+  {A B C D} // 32-bit
+x {E F G H} // 32-bit
+---------------
+{AE BF}         // 64-bit
+        {CG DH} // 64-bit
+@endcode
+Example:
+@code{.cpp}
+v_uint32x4 a, b; // {1,2,3,4} and {2,2,2,2}
+v_uint64x2 c, d; // results
+v_mul_expand(a, b, c, d); // c, d = {2,4}, {6, 8}
+@endcode
+Implemented only for 16- and unsigned 32-bit source types (v_int16x8, v_uint16x8, v_uint32x4).
+*/
 template<typename _Tp, int n> inline void v_mul_expand(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b,
                                                        v_reg<typename V_TypeTraits<_Tp>::w_type, n/2>& c,
                                                        v_reg<typename V_TypeTraits<_Tp>::w_type, n/2>& d)
 {
@@ -318,11 +803,12 @@
     typedef typename V_TypeTraits<_Tp>::w_type w_type;
     for( int i = 0; i < (n/2); i++ )
     {
-        c.s[i] = (w_type)a.s[i]*b.s[i]*2;
+        c.s[i] = (w_type)a.s[i]*b.s[i];
         d.s[i] = (w_type)a.s[i+(n/2)]*b.s[i+(n/2)];
     }
 }
 
+//! @cond IGNORED
 template<typename _Tp, int n> inline void v_hsum(const v_reg<_Tp, n>& a,
                                                  v_reg<typename V_TypeTraits<_Tp>::w_type, n/2>& c)
 {
@@ -332,7 +818,10 @@
     {
         c.s[i] = (w_type)a.s[i*2] + a.s[i*2+1];
     }
 }
+//! @endcond
 
+//! @brief Helper macro
+//! @ingroup hal_intrin_impl
 #define OPENCV_HAL_IMPL_SHIFT_OP(shift_op) \
 template<typename _Tp, int n> inline v_reg<_Tp, n> operator shift_op(const v_reg<_Tp, n>& a, int imm) \
 { \
     v_reg<_Tp, n> c; \
     for( int i = 0; i < n; i++ ) \
         c.s[i] = (_Tp)(a.s[i] shift_op imm); \
     return c; \
 }
 
+/** @brief Bitwise shift left
+
+For 16-, 32- and 64-bit integer values. */
 OPENCV_HAL_IMPL_SHIFT_OP(<<)
 
+/** @brief Bitwise shift right
+
+For 16-, 32- and 64-bit integer values. */
 OPENCV_HAL_IMPL_SHIFT_OP(>>)
 
+/** @brief Sum packed values
+
+Scheme:
+@code
+{A1 A2 A3 ...} => sum{A1,A2,A3,...}
+@endcode
+For 32-bit integer and 32-bit floating point types. */
 template<typename _Tp, int n> inline typename V_TypeTraits<_Tp>::sum_type v_reduce_sum(const v_reg<_Tp, n>& a)
 {
     typename V_TypeTraits<_Tp>::sum_type c = a.s[0];
@@ -353,6 +856,15 @@ template<typename _Tp, int n> inline typename V_TypeTraits<_Tp>::sum_type v_redu
     return c;
 }
 
+/** @brief Get negative values mask
+
+The returned value is a bit mask with bits set to 1 at positions corresponding to negative packed
+values.
+Example:
+@code{.cpp}
+v_int32x4 r; // set to {-1, -1, 1, 1}
+int mask = v_signmask(r); // mask = 3 <== 00000000 00000000 00000000 00000011
+@endcode
+For all types except 64-bit. */
 template<typename _Tp, int n> inline int v_signmask(const v_reg<_Tp, n>& a)
 {
     int mask = 0;
@@ -361,6 +873,10 @@ template<typename _Tp, int n> inline int v_signmask(const v_reg<_Tp, n>& a)
     return mask;
 }
 
+/** @brief Check if all packed values are less than zero
+
+Unsigned values will be cast to signed: `uchar 254 => char -2`.
+For all types except 64-bit. */
 template<typename _Tp, int n> inline bool v_check_all(const v_reg<_Tp, n>& a)
 {
     for( int i = 0; i < n; i++ )
@@ -369,6 +885,10 @@ template<typename _Tp, int n> inline bool v_check_all(const v_reg<_Tp, n>& a)
     return true;
 }
 
+/** @brief Check if any of the packed values is less than zero
+
+Unsigned values will be cast to signed: `uchar 254 => char -2`.
+For all types except 64-bit. */
 template<typename _Tp, int n> inline bool v_check_any(const v_reg<_Tp, n>& a)
 {
     for( int i = 0; i < n; i++ )
@@ -377,15 +897,36 @@ template<typename _Tp, int n> inline bool v_check_any(const v_reg<_Tp, n>& a)
     return false;
 }
 
+/** @brief Bitwise select
+
+The return value is built by combining values _a_ and _b_: if the i-th bit in _mask_ is 1,
+the i-th bit of _a_ is selected, otherwise the i-th bit of _b_. */
 template<typename _Tp, int n> inline v_reg<_Tp, n> v_select(const v_reg<_Tp, n>& mask,
                                                             const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b)
 {
+    typedef V_TypeTraits<_Tp> Traits;
+    typedef typename Traits::int_type int_type;
     v_reg<_Tp, n> c;
     for( int i = 0; i < n; i++ )
-        c.s[i] = V_TypeTraits<_Tp>::reinterpret_int(mask.s[i]) < 0 ? b.s[i] : a.s[i];
+    {
+        int_type m = Traits::reinterpret_int(mask.s[i]);
+        c.s[i] = Traits::reinterpret_from_int((Traits::reinterpret_int(a.s[i]) & m)
+                                            | (Traits::reinterpret_int(b.s[i]) & ~m));
+    }
     return c;
 }
 
+/** @brief Expand values to the wider pack type
+
+Copy contents of the register to two registers with 2x wider pack type.
+Scheme:
+@code
+ int32x4     int64x2 int64x2
+{A B C D} ==> {A B} , {C D}
+@endcode */
 template<typename _Tp, int n> inline void v_expand(const v_reg<_Tp, n>& a,
                             v_reg<typename V_TypeTraits<_Tp>::w_type, n/2>& b0,
                             v_reg<typename V_TypeTraits<_Tp>::w_type, n/2>& b1)
 {
@@ -397,6 +938,7 @@
     }
 }
 
+//! @cond IGNORED
 template<typename _Tp, int n> inline v_reg<typename V_TypeTraits<_Tp>::int_type, n>
     v_reinterpret_as_int(const v_reg<_Tp, n>& a)
 {
@@ -414,7 +956,19 @@ template<typename _Tp, int n> inline v_reg<typename V_TypeTraits<_Tp>::uint_type
         c.s[i] = V_TypeTraits<_Tp>::reinterpret_uint(a.s[i]);
     return c;
 }
+//! @endcond
 
+/** @brief Interleave two vectors
+
+Scheme:
+@code
+  {A1 A2 A3 A4}
+  {B1 B2 B3 B4}
+---------------
+  {A1 B1 A2 B2} and {A3 B3 A4 B4}
+@endcode
+For all types except 64-bit.
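+Example (hypothetical values):
+@code{.cpp}
+v_int32x4 a(1, 2, 3, 4), b(10, 20, 30, 40), lo, hi;
+v_zip(a, b, lo, hi); // lo = {1, 10, 2, 20}, hi = {3, 30, 4, 40}
+@endcode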
+*/
 template<typename _Tp, int n> inline void v_zip( const v_reg<_Tp, n>& a0, const v_reg<_Tp, n>& a1,
                                                  v_reg<_Tp, n>& b0, v_reg<_Tp, n>& b1 )
 {
@@ -431,50 +985,102 @@ template<typename _Tp, int n> inline void v_zip( const
     }
 }
 
-template<typename _Tp, int n> inline v_reg<_Tp, n> v_load(const _Tp* ptr)
+/** @brief Load register contents from memory
+
+@param ptr pointer to memory block with data
+@return register object
+
+@note The returned type is deduced from the passed pointer type, for example
+uchar ==> cv::v_uint8x16, int ==> cv::v_int32x4, etc.
+ */
+template<typename _Tp>
+inline v_reg<_Tp, V_SIMD128Traits<_Tp>::nlanes> v_load(const _Tp* ptr)
 {
-    return v_reg<_Tp, n>(ptr);
+    return v_reg<_Tp, V_SIMD128Traits<_Tp>::nlanes>(ptr);
 }
 
-template<typename _Tp, int n> inline v_reg<_Tp, n> v_load_aligned(const _Tp* ptr)
+/** @brief Load register contents from memory (aligned)
+
+Similar to cv::v_load, but the source memory block must be aligned (to a 16-byte boundary)
+ */
+template<typename _Tp>
+inline v_reg<_Tp, V_SIMD128Traits<_Tp>::nlanes> v_load_aligned(const _Tp* ptr)
 {
-    return v_reg<_Tp, n>(ptr);
+    return v_reg<_Tp, V_SIMD128Traits<_Tp>::nlanes>(ptr);
 }
 
-template<typename _Tp, int n> inline void v_load_halves(const _Tp* loptr, const _Tp* hiptr)
+/** @brief Load register contents from two memory blocks
+
+@param loptr memory block containing data for first half (0..n/2)
+@param hiptr memory block containing data for second half (n/2..n)
+
+@code{.cpp}
+int lo[2] = { 1, 2 }, hi[2] = { 3, 4 };
+v_int32x4 r = v_load_halves(lo, hi);
+@endcode
+ */
+template<typename _Tp>
+inline v_reg<_Tp, V_SIMD128Traits<_Tp>::nlanes> v_load_halves(const _Tp* loptr, const _Tp* hiptr)
 {
-    v_reg<_Tp, n> c;
-    for( int i = 0; i < n/2; i++ )
+    v_reg<_Tp, V_SIMD128Traits<_Tp>::nlanes> c;
+    for( int i = 0; i < c.nlanes/2; i++ )
     {
         c.s[i] = loptr[i];
-        c.s[i+n/2] = hiptr[i];
+        c.s[i+c.nlanes/2] = hiptr[i];
     }
     return c;
 }
 
-template<typename _Tp, int n> inline v_reg<typename V_TypeTraits<_Tp>::w_type, n> v_load_expand(const _Tp* ptr)
+/** @brief Load register contents from memory with double expand
+
+Same as cv::v_load, but the result pack type will be twice as wide as the memory type.
+
+@code{.cpp}
+short buf[4] = {1, 2, 3, 4}; // type is int16
+v_int32x4 r = v_load_expand(buf); // r = {1, 2, 3, 4} - type is int32
+@endcode
+For 8-, 16-, 32-bit integer source types. */
+template<typename _Tp>
+inline v_reg<typename V_TypeTraits<_Tp>::w_type, V_SIMD128Traits<_Tp>::nlanes / 2>
+v_load_expand(const _Tp* ptr)
 {
     typedef typename V_TypeTraits<_Tp>::w_type w_type;
-    v_reg<w_type, n> c;
-    for( int i = 0; i < n; i++ )
+    v_reg<w_type, V_SIMD128Traits<w_type>::nlanes> c;
+    for( int i = 0; i < c.nlanes; i++ )
     {
         c.s[i] = ptr[i];
     }
     return c;
 }
 
-template<typename _Tp, int n> inline v_reg<typename V_TypeTraits<typename V_TypeTraits<_Tp>::w_type>::w_type, n> v_load_expand_q(const _Tp* ptr)
+/** @brief Load register contents from memory with quad expand
+
+Same as cv::v_load_expand, but the result type is 4 times wider than the source.
+@code{.cpp}
+char buf[4] = {1, 2, 3, 4}; // type is int8
+v_int32x4 r = v_load_expand_q(buf); // r = {1, 2, 3, 4} - type is int32
+@endcode
+For 8-bit integer source types. */
+template<typename _Tp>
+inline v_reg<typename V_TypeTraits<_Tp>::q_type, V_SIMD128Traits<_Tp>::nlanes / 4>
+v_load_expand_q(const _Tp* ptr)
 {
-    typedef typename V_TypeTraits<typename V_TypeTraits<_Tp>::w_type>::w_type w_type;
-    v_reg<w_type, n> c;
-    for( int i = 0; i < n; i++ )
+    typedef typename V_TypeTraits<_Tp>::q_type q_type;
+    v_reg<q_type, V_SIMD128Traits<q_type>::nlanes> c;
+    for( int i = 0; i < c.nlanes; i++ )
    {
         c.s[i] = ptr[i];
     }
     return c;
 }
 
+/** @brief Load and deinterleave (3 channels)
+
+Load data from memory, deinterleave it and store to 3 registers.
+Scheme:
+@code
+{A1 B1 C1 A2 B2 C2 ...} ==> {A1 A2 ...}, {B1 B2 ...}, {C1 C2 ...}
+@endcode
+For all types except 64-bit.
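+Example (a hypothetical packed BGR buffer):
+@code{.cpp}
+uchar bgr[16*3]; // 16 packed BGR pixels
+v_uint8x16 b, g, r;
+v_load_deinterleave(bgr, b, g, r); // b = {B1 B2 ...}, g = {G1 G2 ...}, r = {R1 R2 ...}
+@endcode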
+*/
 template<typename _Tp, int n> inline void v_load_deinterleave(const _Tp* ptr, v_reg<_Tp, n>& a,
                                                               v_reg<_Tp, n>& b, v_reg<_Tp, n>& c)
 {
@@ -487,6 +1093,14 @@ template<typename _Tp, int n> inline void v_load_deinterleave(const _Tp* ptr, v_
     }
 }
 
+/** @brief Load and deinterleave (4 channels)
+
+Load data from memory, deinterleave it and store to 4 registers.
+Scheme:
+@code
+{A1 B1 C1 D1 A2 B2 C2 D2 ...} ==> {A1 A2 ...}, {B1 B2 ...}, {C1 C2 ...}, {D1 D2 ...}
+@endcode
+For all types except 64-bit. */
 template<typename _Tp, int n>
 inline void v_load_deinterleave(const _Tp* ptr, v_reg<_Tp, n>& a,
                                 v_reg<_Tp, n>& b, v_reg<_Tp, n>& c,
@@ -502,6 +1116,14 @@ inline void v_load_deinterleave(const _Tp* ptr, v_reg<_Tp, n>& a,
     }
 }
 
+/** @brief Interleave and store (3 channels)
+
+Interleave and store data from 3 registers to memory.
+Scheme:
+@code
+{A1 A2 ...}, {B1 B2 ...}, {C1 C2 ...} ==> {A1 B1 C1 A2 B2 C2 ...}
+@endcode
+For all types except 64-bit. */
 template<typename _Tp, int n>
 inline void v_store_interleave( _Tp* ptr, const v_reg<_Tp, n>& a,
                                 const v_reg<_Tp, n>& b, const v_reg<_Tp, n>& c)
@@ -515,6 +1137,14 @@
     }
 }
 
+/** @brief Interleave and store (4 channels)
+
+Interleave and store data from 4 registers to memory.
+Scheme:
+@code
+{A1 A2 ...}, {B1 B2 ...}, {C1 C2 ...}, {D1 D2 ...} ==> {A1 B1 C1 D1 A2 B2 C2 D2 ...}
+@endcode
+For all types except 64-bit. */
 template<typename _Tp, int n> inline void v_store_interleave( _Tp* ptr, const v_reg<_Tp, n>& a,
                                                               const v_reg<_Tp, n>& b, const v_reg<_Tp, n>& c,
                                                               const v_reg<_Tp, n>& d)
@@ -529,6 +1159,14 @@
     }
 }
 
+/** @brief Store data to memory
+
+Store register contents to memory.
+Scheme:
+@code
+  REG {A B C D} ==> MEM {A B C D}
+@endcode
+Pointer can be unaligned. */
 template<typename _Tp, int n>
 inline void v_store(_Tp* ptr, const v_reg<_Tp, n>& a)
 {
@@ -536,6 +1174,13 @@
     ptr[i] = a.s[i];
 }
 
+/** @brief Store data to memory (lower half)
+
+Store the lower half of the register contents to memory.
+Scheme:
+@code
+  REG {A B C D} ==> MEM {A B}
+@endcode */
 template<typename _Tp, int n>
 inline void v_store_low(_Tp* ptr, const v_reg<_Tp, n>& a)
 {
@@ -543,6 +1188,13 @@
     ptr[i] = a.s[i];
 }
 
+/** @brief Store data to memory (higher half)
+
+Store the higher half of the register contents to memory.
+Scheme:
+@code
+  REG {A B C D} ==> MEM {C D}
+@endcode */
 template<typename _Tp, int n>
 inline void v_store_high(_Tp* ptr, const v_reg<_Tp, n>& a)
 {
@@ -550,6 +1202,14 @@
     ptr[i] = a.s[i+(n/2)];
 }
 
+/** @brief Store data to memory (aligned)
+
+Store register contents to memory.
+Scheme:
+@code
+  REG {A B C D} ==> MEM {A B C D}
+@endcode
+Pointer __should__ be aligned by 16-byte boundary. */
 template<typename _Tp, int n>
 inline void v_store_aligned(_Tp* ptr, const v_reg<_Tp, n>& a)
 {
@@ -557,6 +1217,16 @@
     ptr[i] = a.s[i];
 }
 
+/** @brief Combine vector from first elements of two vectors
+
+Scheme:
+@code
+  {A1 A2 A3 A4}
+  {B1 B2 B3 B4}
+---------------
+  {A1 A2 B1 B2}
+@endcode
+For all types except 64-bit. */
 template<typename _Tp, int n>
 inline v_reg<_Tp, n> v_combine_low(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b)
 {
@@ -569,6 +1239,16 @@
     return c;
 }
 
+/** @brief Combine vector from last elements of two vectors
+
+Scheme:
+@code
+  {A1 A2 A3 A4}
+  {B1 B2 B3 B4}
+---------------
+  {A3 A4 B3 B4}
+@endcode
+For all types except 64-bit.
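+Example (hypothetical values):
+@code{.cpp}
+v_int32x4 a(1, 2, 3, 4), b(5, 6, 7, 8);
+v_int32x4 c = v_combine_high(a, b); // {3, 4, 7, 8}
+@endcode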
*/ template inline v_reg<_Tp, n> v_combine_high(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) { @@ -581,6 +1261,12 @@ inline v_reg<_Tp, n> v_combine_high(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& return c; } +/** @brief Combine two vectors from lower and higher parts of two other vectors + +@code{.cpp} +low = cv::v_combine_low(a, b); +high = cv::v_combine_high(a, b); +@endcode */ template inline void v_recombine(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b, v_reg<_Tp, n>& low, v_reg<_Tp, n>& high) @@ -594,18 +1280,41 @@ inline void v_recombine(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b, } } +/** @brief Vector extract + +Scheme: +@code + {A1 A2 A3 A4} + {B1 B2 B3 B4} +======================== +shift = 1 {A2 A3 A4 B1} +shift = 2 {A3 A4 B1 B2} +shift = 3 {A4 B1 B2 B3} +@endcode +Restriction: 0 <= shift < nlanes + +Usage: +@code +v_int32x4 a, b, c; +c = v_extract<2>(a, b); +@endcode +For integer types only. */ template inline v_reg<_Tp, n> v_extract(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) { v_reg<_Tp, n> r; + const int shift = n - s; int i = 0; - for (; i < s; ++i) - r.s[i] = a.s[i+n-s]; + for (; i < shift; ++i) + r.s[i] = a.s[i+s]; for (; i < n; ++i) - r.s[i] = b.s[i-s]; + r.s[i] = b.s[i-shift]; return r; } +/** @brief Round + +Rounds each value. Input type is float vector ==> output type is int vector.*/ template inline v_reg v_round(const v_reg& a) { v_reg c; @@ -614,6 +1323,9 @@ template inline v_reg v_round(const v_reg& a) return c; } +/** @brief Floor + +Floor each value. Input type is float vector ==> output type is int vector.*/ template inline v_reg v_floor(const v_reg& a) { v_reg c; @@ -622,6 +1334,9 @@ template inline v_reg v_floor(const v_reg& a) return c; } +/** @brief Ceil + +Ceil each value. Input type is float vector ==> output type is int vector.*/ template inline v_reg v_ceil(const v_reg& a) { v_reg c; @@ -630,6 +1345,9 @@ template inline v_reg v_ceil(const v_reg& a) return c; } +/** @brief Trunc + +Truncate each value. Input type is float vector ==> output type is int vector.*/ template inline v_reg v_trunc(const v_reg& a) { v_reg c; @@ -638,6 +1356,7 @@ template inline v_reg v_trunc(const v_reg& a) return c; } +/** @overload */ template inline v_reg v_round(const v_reg& a) { v_reg c; @@ -649,6 +1368,7 @@ template inline v_reg v_round(const v_reg& a) return c; } +/** @overload */ template inline v_reg v_floor(const v_reg& a) { v_reg c; @@ -660,6 +1380,7 @@ template inline v_reg v_floor(const v_reg& a) return c; } +/** @overload */ template inline v_reg v_ceil(const v_reg& a) { v_reg c; @@ -671,6 +1392,7 @@ template inline v_reg v_ceil(const v_reg& a) return c; } +/** @overload */ template inline v_reg v_trunc(const v_reg& a) { v_reg c; @@ -682,6 +1404,9 @@ template inline v_reg v_trunc(const v_reg& a) return c; } +/** @brief Convert to float + +Supported input type is cv::v_int32x4. */ template inline v_reg v_cvt_f32(const v_reg& a) { v_reg c; @@ -690,6 +1415,9 @@ template inline v_reg v_cvt_f32(const v_reg& a) return c; } +/** @brief Convert to double + +Supported input type is cv::v_int32x4. */ template inline v_reg v_cvt_f64(const v_reg& a) { v_reg c; @@ -698,6 +1426,9 @@ template inline v_reg v_cvt_f64(const v_reg& a) return c; } +/** @brief Convert to double + +Supported input type is cv::v_float32x4. 
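+Example (hypothetical values; only the lower half of the input is converted):
+@code{.cpp}
+v_float32x4 a(1.5f, 2.5f, 3.5f, 4.5f);
+v_float64x2 lo = v_cvt_f64(a); // {1.5, 2.5}
+@endcode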
*/ template inline v_reg v_cvt_f64(const v_reg& a) { v_reg c; @@ -706,6 +1437,21 @@ template inline v_reg v_cvt_f64(const v_reg& a) return c; } +/** @brief Transpose 4x4 matrix + +Scheme: +@code +a0 {A1 A2 A3 A4} +a1 {B1 B2 B3 B4} +a2 {C1 C2 C3 C4} +a3 {D1 D2 D3 D4} +=============== +b0 {A1 B1 C1 D1} +b1 {A2 B2 C2 D2} +b2 {A3 B3 C3 D3} +b3 {A4 B4 C4 D4} +@endcode +*/ template inline void v_transpose4x4( v_reg<_Tp, 4>& a0, const v_reg<_Tp, 4>& a1, const v_reg<_Tp, 4>& a2, const v_reg<_Tp, 4>& a3, @@ -718,41 +1464,105 @@ inline void v_transpose4x4( v_reg<_Tp, 4>& a0, const v_reg<_Tp, 4>& a1, b3 = v_reg<_Tp, 4>(a0.s[3], a1.s[3], a2.s[3], a3.s[3]); } -typedef v_reg v_uint8x16; -typedef v_reg v_int8x16; -typedef v_reg v_uint16x8; -typedef v_reg v_int16x8; -typedef v_reg v_uint32x4; -typedef v_reg v_int32x4; -typedef v_reg v_float32x4; -typedef v_reg v_float32x8; -typedef v_reg v_float64x2; -typedef v_reg v_uint64x2; -typedef v_reg v_int64x2; +//! @brief Helper macro +//! @ingroup hal_intrin_impl +#define OPENCV_HAL_IMPL_C_INIT_ZERO(_Tpvec, _Tp, suffix) \ +inline _Tpvec v_setzero_##suffix() { return _Tpvec::zero(); } -#define OPENCV_HAL_IMPL_C_INIT(_Tpvec, _Tp, suffix) \ -inline _Tpvec v_setzero_##suffix() { return _Tpvec::zero(); } \ -inline _Tpvec v_setall_##suffix(_Tp val) { return _Tpvec::all(val); } \ +//! @name Init with zero +//! @{ +//! @brief Create new vector with zero elements +OPENCV_HAL_IMPL_C_INIT_ZERO(v_uint8x16, uchar, u8) +OPENCV_HAL_IMPL_C_INIT_ZERO(v_int8x16, schar, s8) +OPENCV_HAL_IMPL_C_INIT_ZERO(v_uint16x8, ushort, u16) +OPENCV_HAL_IMPL_C_INIT_ZERO(v_int16x8, short, s16) +OPENCV_HAL_IMPL_C_INIT_ZERO(v_uint32x4, unsigned, u32) +OPENCV_HAL_IMPL_C_INIT_ZERO(v_int32x4, int, s32) +OPENCV_HAL_IMPL_C_INIT_ZERO(v_float32x4, float, f32) +OPENCV_HAL_IMPL_C_INIT_ZERO(v_float64x2, double, f64) +OPENCV_HAL_IMPL_C_INIT_ZERO(v_uint64x2, uint64, u64) +OPENCV_HAL_IMPL_C_INIT_ZERO(v_int64x2, int64, s64) +//! @} + +//! @brief Helper macro +//! @ingroup hal_intrin_impl +#define OPENCV_HAL_IMPL_C_INIT_VAL(_Tpvec, _Tp, suffix) \ +inline _Tpvec v_setall_##suffix(_Tp val) { return _Tpvec::all(val); } + +//! @name Init with value +//! @{ +//! @brief Create new vector with elements set to a specific value +OPENCV_HAL_IMPL_C_INIT_VAL(v_uint8x16, uchar, u8) +OPENCV_HAL_IMPL_C_INIT_VAL(v_int8x16, schar, s8) +OPENCV_HAL_IMPL_C_INIT_VAL(v_uint16x8, ushort, u16) +OPENCV_HAL_IMPL_C_INIT_VAL(v_int16x8, short, s16) +OPENCV_HAL_IMPL_C_INIT_VAL(v_uint32x4, unsigned, u32) +OPENCV_HAL_IMPL_C_INIT_VAL(v_int32x4, int, s32) +OPENCV_HAL_IMPL_C_INIT_VAL(v_float32x4, float, f32) +OPENCV_HAL_IMPL_C_INIT_VAL(v_float64x2, double, f64) +OPENCV_HAL_IMPL_C_INIT_VAL(v_uint64x2, uint64, u64) +OPENCV_HAL_IMPL_C_INIT_VAL(v_int64x2, int64, s64) +//! @} + +//! @brief Helper macro +//! 
@ingroup hal_intrin_impl +#define OPENCV_HAL_IMPL_C_REINTERPRET(_Tpvec, _Tp, suffix) \ template inline _Tpvec \ v_reinterpret_as_##suffix(const v_reg<_Tp0, n0>& a) \ -{ return a.template reinterpret_as<_Tp, _Tpvec::nlanes>(a); } +{ return a.template reinterpret_as<_Tp, _Tpvec::nlanes>(); } -OPENCV_HAL_IMPL_C_INIT(v_uint8x16, uchar, u8) -OPENCV_HAL_IMPL_C_INIT(v_int8x16, schar, s8) -OPENCV_HAL_IMPL_C_INIT(v_uint16x8, ushort, u16) -OPENCV_HAL_IMPL_C_INIT(v_int16x8, short, s16) -OPENCV_HAL_IMPL_C_INIT(v_uint32x4, unsigned, u32) -OPENCV_HAL_IMPL_C_INIT(v_int32x4, int, s32) -OPENCV_HAL_IMPL_C_INIT(v_float32x4, float, f32) -OPENCV_HAL_IMPL_C_INIT(v_float64x2, double, f64) -OPENCV_HAL_IMPL_C_INIT(v_uint64x2, uint64, u64) -OPENCV_HAL_IMPL_C_INIT(v_uint64x2, int64, s64) +//! @name Reinterpret +//! @{ +//! @brief Convert vector to different type without modifying underlying data. +OPENCV_HAL_IMPL_C_REINTERPRET(v_uint8x16, uchar, u8) +OPENCV_HAL_IMPL_C_REINTERPRET(v_int8x16, schar, s8) +OPENCV_HAL_IMPL_C_REINTERPRET(v_uint16x8, ushort, u16) +OPENCV_HAL_IMPL_C_REINTERPRET(v_int16x8, short, s16) +OPENCV_HAL_IMPL_C_REINTERPRET(v_uint32x4, unsigned, u32) +OPENCV_HAL_IMPL_C_REINTERPRET(v_int32x4, int, s32) +OPENCV_HAL_IMPL_C_REINTERPRET(v_float32x4, float, f32) +OPENCV_HAL_IMPL_C_REINTERPRET(v_float64x2, double, f64) +OPENCV_HAL_IMPL_C_REINTERPRET(v_uint64x2, uint64, u64) +OPENCV_HAL_IMPL_C_REINTERPRET(v_int64x2, int64, s64) +//! @} -#define OPENCV_HAL_IMPL_C_SHIFT(_Tpvec, _Tp) \ +//! @brief Helper macro +//! @ingroup hal_intrin_impl +#define OPENCV_HAL_IMPL_C_SHIFTL(_Tpvec, _Tp) \ template inline _Tpvec v_shl(const _Tpvec& a) \ -{ return a << n; } \ +{ return a << n; } + +//! @name Left shift +//! @{ +//! @brief Shift left +OPENCV_HAL_IMPL_C_SHIFTL(v_uint16x8, ushort) +OPENCV_HAL_IMPL_C_SHIFTL(v_int16x8, short) +OPENCV_HAL_IMPL_C_SHIFTL(v_uint32x4, unsigned) +OPENCV_HAL_IMPL_C_SHIFTL(v_int32x4, int) +OPENCV_HAL_IMPL_C_SHIFTL(v_uint64x2, uint64) +OPENCV_HAL_IMPL_C_SHIFTL(v_int64x2, int64) +//! @} + +//! @brief Helper macro +//! @ingroup hal_intrin_impl +#define OPENCV_HAL_IMPL_C_SHIFTR(_Tpvec, _Tp) \ template inline _Tpvec v_shr(const _Tpvec& a) \ -{ return a >> n; } \ +{ return a >> n; } + +//! @name Right shift +//! @{ +//! @brief Shift right +OPENCV_HAL_IMPL_C_SHIFTR(v_uint16x8, ushort) +OPENCV_HAL_IMPL_C_SHIFTR(v_int16x8, short) +OPENCV_HAL_IMPL_C_SHIFTR(v_uint32x4, unsigned) +OPENCV_HAL_IMPL_C_SHIFTR(v_int32x4, int) +OPENCV_HAL_IMPL_C_SHIFTR(v_uint64x2, uint64) +OPENCV_HAL_IMPL_C_SHIFTR(v_int64x2, int64) +//! @} + +//! @brief Helper macro +//! @ingroup hal_intrin_impl +#define OPENCV_HAL_IMPL_C_RSHIFTR(_Tpvec, _Tp) \ template inline _Tpvec v_rshr(const _Tpvec& a) \ { \ _Tpvec c; \ @@ -761,15 +1571,20 @@ template inline _Tpvec v_rshr(const _Tpvec& a) \ return c; \ } -OPENCV_HAL_IMPL_C_SHIFT(v_uint16x8, ushort) -OPENCV_HAL_IMPL_C_SHIFT(v_int16x8, short) -OPENCV_HAL_IMPL_C_SHIFT(v_uint32x4, unsigned) -OPENCV_HAL_IMPL_C_SHIFT(v_int32x4, int) -OPENCV_HAL_IMPL_C_SHIFT(v_uint64x2, uint64) -OPENCV_HAL_IMPL_C_SHIFT(v_int64x2, int64) +//! @name Rounding shift +//! @{ +//! @brief Rounding shift right +OPENCV_HAL_IMPL_C_RSHIFTR(v_uint16x8, ushort) +OPENCV_HAL_IMPL_C_RSHIFTR(v_int16x8, short) +OPENCV_HAL_IMPL_C_RSHIFTR(v_uint32x4, unsigned) +OPENCV_HAL_IMPL_C_RSHIFTR(v_int32x4, int) +OPENCV_HAL_IMPL_C_RSHIFTR(v_uint64x2, uint64) +OPENCV_HAL_IMPL_C_RSHIFTR(v_int64x2, int64) +//! @} - -#define OPENCV_HAL_IMPL_C_PACK(_Tpvec, _Tp, _Tpnvec, _Tpn, pack_suffix) \ +//! @brief Helper macro +//! 
@ingroup hal_intrin_impl +#define OPENCV_HAL_IMPL_C_PACK(_Tpvec, _Tpnvec, _Tpn, pack_suffix) \ inline _Tpnvec v_##pack_suffix(const _Tpvec& a, const _Tpvec& b) \ { \ _Tpnvec c; \ @@ -779,7 +1594,30 @@ inline _Tpnvec v_##pack_suffix(const _Tpvec& a, const _Tpvec& b) \ c.s[i+_Tpvec::nlanes] = saturate_cast<_Tpn>(b.s[i]); \ } \ return c; \ -} \ +} + +//! @name Pack +//! @{ +//! @brief Pack values from two vectors to one +//! +//! Return vector type have twice more elements than input vector types. Variant with _u_ suffix also +//! converts to corresponding unsigned type. +//! +//! - pack: for 16-, 32- and 64-bit integer input types +//! - pack_u: for 16- and 32-bit signed integer input types +OPENCV_HAL_IMPL_C_PACK(v_uint16x8, v_uint8x16, uchar, pack) +OPENCV_HAL_IMPL_C_PACK(v_int16x8, v_int8x16, schar, pack) +OPENCV_HAL_IMPL_C_PACK(v_uint32x4, v_uint16x8, ushort, pack) +OPENCV_HAL_IMPL_C_PACK(v_int32x4, v_int16x8, short, pack) +OPENCV_HAL_IMPL_C_PACK(v_uint64x2, v_uint32x4, unsigned, pack) +OPENCV_HAL_IMPL_C_PACK(v_int64x2, v_int32x4, int, pack) +OPENCV_HAL_IMPL_C_PACK(v_int16x8, v_uint8x16, uchar, pack_u) +OPENCV_HAL_IMPL_C_PACK(v_int32x4, v_uint16x8, ushort, pack_u) +//! @} + +//! @brief Helper macro +//! @ingroup hal_intrin_impl +#define OPENCV_HAL_IMPL_C_RSHR_PACK(_Tpvec, _Tp, _Tpnvec, _Tpn, pack_suffix) \ template inline _Tpnvec v_rshr_##pack_suffix(const _Tpvec& a, const _Tpvec& b) \ { \ _Tpnvec c; \ @@ -789,27 +1627,98 @@ template inline _Tpnvec v_rshr_##pack_suffix(const _Tpvec& a, const _Tpve c.s[i+_Tpvec::nlanes] = saturate_cast<_Tpn>((b.s[i] + ((_Tp)1 << (n - 1))) >> n); \ } \ return c; \ -} \ +} + +//! @name Pack with rounding shift +//! @{ +//! @brief Pack values from two vectors to one with rounding shift +//! +//! Values from the input vectors will be shifted right by _n_ bits with rounding, converted to narrower +//! type and returned in the result vector. Variant with _u_ suffix converts to unsigned type. +//! +//! - pack: for 16-, 32- and 64-bit integer input types +//! - pack_u: for 16- and 32-bit signed integer input types +OPENCV_HAL_IMPL_C_RSHR_PACK(v_uint16x8, ushort, v_uint8x16, uchar, pack) +OPENCV_HAL_IMPL_C_RSHR_PACK(v_int16x8, short, v_int8x16, schar, pack) +OPENCV_HAL_IMPL_C_RSHR_PACK(v_uint32x4, unsigned, v_uint16x8, ushort, pack) +OPENCV_HAL_IMPL_C_RSHR_PACK(v_int32x4, int, v_int16x8, short, pack) +OPENCV_HAL_IMPL_C_RSHR_PACK(v_uint64x2, uint64, v_uint32x4, unsigned, pack) +OPENCV_HAL_IMPL_C_RSHR_PACK(v_int64x2, int64, v_int32x4, int, pack) +OPENCV_HAL_IMPL_C_RSHR_PACK(v_int16x8, short, v_uint8x16, uchar, pack_u) +OPENCV_HAL_IMPL_C_RSHR_PACK(v_int32x4, int, v_uint16x8, ushort, pack_u) +//! @} + +//! @brief Helper macro +//! @ingroup hal_intrin_impl +#define OPENCV_HAL_IMPL_C_PACK_STORE(_Tpvec, _Tp, _Tpnvec, _Tpn, pack_suffix) \ inline void v_##pack_suffix##_store(_Tpn* ptr, const _Tpvec& a) \ { \ for( int i = 0; i < _Tpvec::nlanes; i++ ) \ ptr[i] = saturate_cast<_Tpn>(a.s[i]); \ -} \ +} + +//! @name Pack and store +//! @{ +//! @brief Store values from the input vector into memory with pack +//! +//! Values will be stored into memory with saturating conversion to narrower type. +//! Variant with _u_ suffix converts to corresponding unsigned type. +//! +//! - pack: for 16-, 32- and 64-bit integer input types +//! 
- pack_u: for 16- and 32-bit signed integer input types +OPENCV_HAL_IMPL_C_PACK_STORE(v_uint16x8, ushort, v_uint8x16, uchar, pack) +OPENCV_HAL_IMPL_C_PACK_STORE(v_int16x8, short, v_int8x16, schar, pack) +OPENCV_HAL_IMPL_C_PACK_STORE(v_uint32x4, unsigned, v_uint16x8, ushort, pack) +OPENCV_HAL_IMPL_C_PACK_STORE(v_int32x4, int, v_int16x8, short, pack) +OPENCV_HAL_IMPL_C_PACK_STORE(v_uint64x2, uint64, v_uint32x4, unsigned, pack) +OPENCV_HAL_IMPL_C_PACK_STORE(v_int64x2, int64, v_int32x4, int, pack) +OPENCV_HAL_IMPL_C_PACK_STORE(v_int16x8, short, v_uint8x16, uchar, pack_u) +OPENCV_HAL_IMPL_C_PACK_STORE(v_int32x4, int, v_uint16x8, ushort, pack_u) +//! @} + +//! @brief Helper macro +//! @ingroup hal_intrin_impl +#define OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(_Tpvec, _Tp, _Tpnvec, _Tpn, pack_suffix) \ template inline void v_rshr_##pack_suffix##_store(_Tpn* ptr, const _Tpvec& a) \ { \ for( int i = 0; i < _Tpvec::nlanes; i++ ) \ ptr[i] = saturate_cast<_Tpn>((a.s[i] + ((_Tp)1 << (n - 1))) >> n); \ } -OPENCV_HAL_IMPL_C_PACK(v_uint16x8, ushort, v_uint8x16, uchar, pack) -OPENCV_HAL_IMPL_C_PACK(v_int16x8, short, v_int8x16, schar, pack) -OPENCV_HAL_IMPL_C_PACK(v_int16x8, short, v_uint8x16, uchar, pack_u) -OPENCV_HAL_IMPL_C_PACK(v_uint32x4, unsigned, v_uint16x8, ushort, pack) -OPENCV_HAL_IMPL_C_PACK(v_int32x4, int, v_int16x8, short, pack) -OPENCV_HAL_IMPL_C_PACK(v_int32x4, int, v_uint16x8, ushort, pack_u) -OPENCV_HAL_IMPL_C_PACK(v_uint64x2, uint64, v_uint32x4, unsigned, pack) -OPENCV_HAL_IMPL_C_PACK(v_int64x2, int64, v_int32x4, int, pack) +//! @name Pack and store with rounding shift +//! @{ +//! @brief Store values from the input vector into memory with pack +//! +//! Values will be shifted _n_ bits right with rounding, converted to narrower type and stored into +//! memory. Variant with _u_ suffix converts to unsigned type. +//! +//! - pack: for 16-, 32- and 64-bit integer input types +//! - pack_u: for 16- and 32-bit signed integer input types +OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(v_uint16x8, ushort, v_uint8x16, uchar, pack) +OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(v_int16x8, short, v_int8x16, schar, pack) +OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(v_uint32x4, unsigned, v_uint16x8, ushort, pack) +OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(v_int32x4, int, v_int16x8, short, pack) +OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(v_uint64x2, uint64, v_uint32x4, unsigned, pack) +OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(v_int64x2, int64, v_int32x4, int, pack) +OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(v_int16x8, short, v_uint8x16, uchar, pack_u) +OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(v_int32x4, int, v_uint16x8, ushort, pack_u) +//! @} +/** @brief Matrix multiplication + +Scheme: +@code +{A0 A1 A2 A3} |V0| +{B0 B1 B2 B3} |V1| +{C0 C1 C2 C3} |V2| +{D0 D1 D2 D3} x |V3| +==================== +{R0 R1 R2 R3}, where: +R0 = A0V0 + A1V1 + A2V2 + A3V3, +R1 = B0V0 + B1V1 + B2V2 + B3V3 +... +@endcode +*/ inline v_float32x4 v_matmul(const v_float32x4& v, const v_float32x4& m0, const v_float32x4& m1, const v_float32x4& m2, const v_float32x4& m3) @@ -820,6 +1729,8 @@ inline v_float32x4 v_matmul(const v_float32x4& v, const v_float32x4& m0, v.s[0]*m0.s[3] + v.s[1]*m1.s[3] + v.s[2]*m2.s[3] + v.s[3]*m3.s[3]); } +//! @} + } #endif diff --git a/modules/hal/include/opencv2/hal/intrin_neon.hpp b/modules/hal/include/opencv2/hal/intrin_neon.hpp index e326696d6..d53971f96 100644 --- a/modules/hal/include/opencv2/hal/intrin_neon.hpp +++ b/modules/hal/include/opencv2/hal/intrin_neon.hpp @@ -48,6 +48,8 @@ namespace cv { +//! 
@cond IGNORED + #define CV_SIMD128 1 struct v_uint8x16 @@ -278,14 +280,15 @@ void v_rshr_##pack##_store(_Tp* ptr, const _Tpwvec& a) \ } OPENCV_HAL_IMPL_NEON_PACK(v_uint8x16, uchar, uint8x8_t, u8, v_uint16x8, u16, pack, n) -OPENCV_HAL_IMPL_NEON_PACK(v_uint8x16, uchar, uint8x8_t, u8, v_int16x8, s16, pack_u, un) OPENCV_HAL_IMPL_NEON_PACK(v_int8x16, schar, int8x8_t, s8, v_int16x8, s16, pack, n) OPENCV_HAL_IMPL_NEON_PACK(v_uint16x8, ushort, uint16x4_t, u16, v_uint32x4, u32, pack, n) -OPENCV_HAL_IMPL_NEON_PACK(v_uint16x8, ushort, uint16x4_t, u16, v_int32x4, s32, pack_u, un) OPENCV_HAL_IMPL_NEON_PACK(v_int16x8, short, int16x4_t, s16, v_int32x4, s32, pack, n) OPENCV_HAL_IMPL_NEON_PACK(v_uint32x4, unsigned, uint32x2_t, u32, v_uint64x2, u64, pack, n) OPENCV_HAL_IMPL_NEON_PACK(v_int32x4, int, int32x2_t, s32, v_int64x2, s64, pack, n) +OPENCV_HAL_IMPL_NEON_PACK(v_uint8x16, uchar, uint8x8_t, u8, v_int16x8, s16, pack_u, un) +OPENCV_HAL_IMPL_NEON_PACK(v_uint16x8, ushort, uint16x4_t, u16, v_int32x4, s32, pack_u, un) + inline v_float32x4 v_matmul(const v_float32x4& v, const v_float32x4& m0, const v_float32x4& m1, const v_float32x4& m2, const v_float32x4& m3) @@ -374,7 +377,7 @@ inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b) { int32x4_t c = vmull_s16(vget_low_s16(a.val), vget_low_s16(b.val)); int32x4_t d = vmull_s16(vget_high_s16(a.val), vget_high_s16(b.val)); - int32x4x2_t cd = vtrnq_s32(c, d); + int32x4x2_t cd = vuzpq_s32(c, d); return v_int32x4(vaddq_s32(cd.val[0], cd.val[1])); } @@ -497,6 +500,16 @@ OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint16x8, v_absdiff, vabdq_u16) OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint32x4, v_absdiff, vabdq_u32) OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_float32x4, v_absdiff, vabdq_f32) +#define OPENCV_HAL_IMPL_NEON_BIN_FUNC2(_Tpvec, _Tpvec2, cast, func, intrin) \ +inline _Tpvec2 func(const _Tpvec& a, const _Tpvec& b) \ +{ \ + return _Tpvec2(cast(intrin(a.val, b.val))); \ +} + +OPENCV_HAL_IMPL_NEON_BIN_FUNC2(v_int8x16, v_uint8x16, vreinterpretq_u8_s8, v_absdiff, vabdq_s8) +OPENCV_HAL_IMPL_NEON_BIN_FUNC2(v_int16x8, v_uint16x8, vreinterpretq_u16_s16, v_absdiff, vabdq_s16) +OPENCV_HAL_IMPL_NEON_BIN_FUNC2(v_int32x4, v_uint32x4, vreinterpretq_u32_s32, v_absdiff, vabdq_s32) + inline v_float32x4 v_magnitude(const v_float32x4& a, const v_float32x4& b) { v_float32x4 x(vmlaq_f32(vmulq_f32(a.val, a.val), b.val, b.val)); @@ -641,13 +654,13 @@ inline bool v_check_all(const v_float32x4& a) { return v_check_all(v_reinterpret_as_u32(a)); } inline bool v_check_any(const v_int8x16& a) -{ return v_check_all(v_reinterpret_as_u8(a)); } +{ return v_check_any(v_reinterpret_as_u8(a)); } inline bool v_check_any(const v_int16x8& a) -{ return v_check_all(v_reinterpret_as_u16(a)); } +{ return v_check_any(v_reinterpret_as_u16(a)); } inline bool v_check_any(const v_int32x4& a) -{ return v_check_all(v_reinterpret_as_u32(a)); } +{ return v_check_any(v_reinterpret_as_u32(a)); } inline bool v_check_any(const v_float32x4& a) -{ return v_check_all(v_reinterpret_as_u32(a)); } +{ return v_check_any(v_reinterpret_as_u32(a)); } #define OPENCV_HAL_IMPL_NEON_SELECT(_Tpvec, suffix, usuffix) \ inline _Tpvec v_select(const _Tpvec& mask, const _Tpvec& a, const _Tpvec& b) \ @@ -678,6 +691,8 @@ OPENCV_HAL_IMPL_NEON_EXPAND(v_uint8x16, v_uint16x8, uchar, u8) OPENCV_HAL_IMPL_NEON_EXPAND(v_int8x16, v_int16x8, schar, s8) OPENCV_HAL_IMPL_NEON_EXPAND(v_uint16x8, v_uint32x4, ushort, u16) OPENCV_HAL_IMPL_NEON_EXPAND(v_int16x8, v_int32x4, short, s16) +OPENCV_HAL_IMPL_NEON_EXPAND(v_uint32x4, v_uint64x2, uint, u32) 
+OPENCV_HAL_IMPL_NEON_EXPAND(v_int32x4, v_int64x2, int, s32) inline v_uint32x4 v_load_expand_q(const uchar* ptr) { @@ -840,6 +855,8 @@ inline v_float32x4 v_cvt_f32(const v_int32x4& a) return v_float32x4(vcvtq_f32_s32(a.val)); } +//! @endcond + } #endif diff --git a/modules/hal/include/opencv2/hal/intrin_sse.hpp b/modules/hal/include/opencv2/hal/intrin_sse.hpp index 0c30f7d5b..e237ccd93 100644 --- a/modules/hal/include/opencv2/hal/intrin_sse.hpp +++ b/modules/hal/include/opencv2/hal/intrin_sse.hpp @@ -51,6 +51,8 @@ namespace cv { +//! @cond IGNORED + struct v_uint8x16 { typedef uchar lane_type; @@ -296,6 +298,11 @@ OPENCV_HAL_IMPL_SSE_INIT_FROM_FLT(v_int32x4, s32) OPENCV_HAL_IMPL_SSE_INIT_FROM_FLT(v_uint64x2, u64) OPENCV_HAL_IMPL_SSE_INIT_FROM_FLT(v_int64x2, s64) +inline v_float32x4 v_reinterpret_as_f32(const v_float32x4& a) {return a; } +inline v_float64x2 v_reinterpret_as_f64(const v_float64x2& a) {return a; } +inline v_float32x4 v_reinterpret_as_f32(const v_float64x2& a) {return v_float32x4(_mm_castpd_ps(a.val)); } +inline v_float64x2 v_reinterpret_as_f64(const v_float32x4& a) {return v_float64x2(_mm_castps_pd(a.val)); } + //////////////// PACK /////////////// inline v_uint8x16 v_pack(const v_uint16x8& a, const v_uint16x8& b) { @@ -430,6 +437,17 @@ inline void v_pack_u_store(ushort* ptr, const v_int32x4& a) _mm_storel_epi64((__m128i*)ptr, r); } +template inline +v_uint16x8 v_rshr_pack_u(const v_int32x4& a, const v_int32x4& b) +{ + __m128i delta = _mm_set1_epi32(1 << (n-1)), delta32 = _mm_set1_epi32(32768); + __m128i a1 = _mm_sub_epi32(_mm_srai_epi32(_mm_add_epi32(a.val, delta), n), delta32); + __m128i a2 = _mm_sub_epi16(_mm_packs_epi32(a1, a1), _mm_set1_epi16(-32768)); + __m128i b1 = _mm_sub_epi32(_mm_srai_epi32(_mm_add_epi32(b.val, delta), n), delta32); + __m128i b2 = _mm_sub_epi16(_mm_packs_epi32(b1, b1), _mm_set1_epi16(-32768)); + return v_uint16x8(_mm_unpacklo_epi64(a2, b2)); +} + template inline void v_rshr_pack_u_store(ushort* ptr, const v_int32x4& a) { @@ -460,7 +478,7 @@ void v_rshr_pack_store(short* ptr, const v_int32x4& a) { __m128i delta = _mm_set1_epi32(1 << (n-1)); __m128i a1 = _mm_srai_epi32(_mm_add_epi32(a.val, delta), n); - _mm_storel_epi64((__m128i*)ptr, a1); + _mm_storel_epi64((__m128i*)ptr, _mm_packs_epi32(a1, a1)); } @@ -469,7 +487,7 @@ inline v_uint32x4 v_pack(const v_uint64x2& a, const v_uint64x2& b) { __m128i v0 = _mm_unpacklo_epi32(a.val, b.val); // a0 a1 0 0 __m128i v1 = _mm_unpackhi_epi32(a.val, b.val); // b0 b1 0 0 - return v_uint32x4(_mm_unpacklo_epi64(v0, v1)); + return v_uint32x4(_mm_unpacklo_epi32(v0, v1)); } inline void v_pack_store(unsigned* ptr, const v_uint64x2& a) @@ -483,7 +501,7 @@ inline v_int32x4 v_pack(const v_int64x2& a, const v_int64x2& b) { __m128i v0 = _mm_unpacklo_epi32(a.val, b.val); // a0 a1 0 0 __m128i v1 = _mm_unpackhi_epi32(a.val, b.val); // b0 b1 0 0 - return v_int32x4(_mm_unpacklo_epi64(v0, v1)); + return v_int32x4(_mm_unpacklo_epi32(v0, v1)); } inline void v_pack_store(int* ptr, const v_int64x2& a) @@ -501,7 +519,7 @@ v_uint32x4 v_rshr_pack(const v_uint64x2& a, const v_uint64x2& b) __m128i b1 = _mm_srli_epi64(_mm_add_epi64(b.val, delta2.val), n); __m128i v0 = _mm_unpacklo_epi32(a1, b1); // a0 a1 0 0 __m128i v1 = _mm_unpackhi_epi32(a1, b1); // b0 b1 0 0 - return v_uint32x4(_mm_unpacklo_epi64(v0, v1)); + return v_uint32x4(_mm_unpacklo_epi32(v0, v1)); } template inline @@ -534,7 +552,7 @@ v_int32x4 v_rshr_pack(const v_int64x2& a, const v_int64x2& b) __m128i b1 = v_srai_epi64(_mm_add_epi64(b.val, delta2.val), n); __m128i v0 = 
_mm_unpacklo_epi32(a1, b1); // a0 a1 0 0 __m128i v1 = _mm_unpackhi_epi32(a1, b1); // b0 b1 0 0 - return v_int32x4(_mm_unpacklo_epi64(v0, v1)); + return v_int32x4(_mm_unpacklo_epi32(v0, v1)); } template inline @@ -630,8 +648,8 @@ inline void v_mul_expand(const v_int16x8& a, const v_int16x8& b, { __m128i v0 = _mm_mullo_epi16(a.val, b.val); __m128i v1 = _mm_mulhi_epi16(a.val, b.val); - c.val = _mm_unpacklo_epi32(v0, v1); - d.val = _mm_unpackhi_epi32(v0, v1); + c.val = _mm_unpacklo_epi16(v0, v1); + d.val = _mm_unpackhi_epi16(v0, v1); } inline void v_mul_expand(const v_uint16x8& a, const v_uint16x8& b, @@ -639,8 +657,8 @@ inline void v_mul_expand(const v_uint16x8& a, const v_uint16x8& b, { __m128i v0 = _mm_mullo_epi16(a.val, b.val); __m128i v1 = _mm_mulhi_epu16(a.val, b.val); - c.val = _mm_unpacklo_epi32(v0, v1); - d.val = _mm_unpackhi_epi32(v0, v1); + c.val = _mm_unpacklo_epi16(v0, v1); + d.val = _mm_unpackhi_epi16(v0, v1); } inline void v_mul_expand(const v_uint32x4& a, const v_uint32x4& b, @@ -869,6 +887,18 @@ inline _Tpuvec v_absdiff(const _Tpsvec& a, const _Tpsvec& b) \ OPENCV_HAL_IMPL_SSE_ABSDIFF_8_16(v_uint8x16, v_int8x16, 8, (int)0x80808080) OPENCV_HAL_IMPL_SSE_ABSDIFF_8_16(v_uint16x8, v_int16x8, 16, (int)0x80008000) +inline v_uint32x4 v_absdiff(const v_uint32x4& a, const v_uint32x4& b) +{ + return v_max(a, b) - v_min(a, b); +} + +inline v_uint32x4 v_absdiff(const v_int32x4& a, const v_int32x4& b) +{ + __m128i d = _mm_sub_epi32(a.val, b.val); + __m128i m = _mm_cmpgt_epi32(b.val, a.val); + return v_uint32x4(_mm_sub_epi32(_mm_xor_si128(d, m), m)); +} + #define OPENCV_HAL_IMPL_SSE_MISC_FLT_OP(_Tpvec, _Tp, _Tpreg, suffix, absmask_vec) \ inline _Tpvec v_absdiff(const _Tpvec& a, const _Tpvec& b) \ { \ @@ -1047,8 +1077,8 @@ OPENCV_HAL_IMPL_SSE_SELECT(v_uint16x8, si128) OPENCV_HAL_IMPL_SSE_SELECT(v_int16x8, si128) OPENCV_HAL_IMPL_SSE_SELECT(v_uint32x4, si128) OPENCV_HAL_IMPL_SSE_SELECT(v_int32x4, si128) -OPENCV_HAL_IMPL_SSE_SELECT(v_uint64x2, si128) -OPENCV_HAL_IMPL_SSE_SELECT(v_int64x2, si128) +// OPENCV_HAL_IMPL_SSE_SELECT(v_uint64x2, si128) +// OPENCV_HAL_IMPL_SSE_SELECT(v_int64x2, si128) OPENCV_HAL_IMPL_SSE_SELECT(v_float32x4, ps) OPENCV_HAL_IMPL_SSE_SELECT(v_float64x2, pd) @@ -1257,7 +1287,7 @@ inline void v_load_deinterleave(const uchar* ptr, v_uint8x16& a, v_uint8x16& b, __m128i v0 = _mm_unpacklo_epi8(u0, u2); // a0 a8 b0 b8 ... __m128i v1 = _mm_unpackhi_epi8(u0, u2); // a2 a10 b2 b10 ... __m128i v2 = _mm_unpacklo_epi8(u1, u3); // a4 a12 b4 b12 ... - __m128i v3 = _mm_unpackhi_epi8(u1, u3); // a6 a14 b4 b14 ... + __m128i v3 = _mm_unpackhi_epi8(u1, u3); // a6 a14 b6 b14 ... u0 = _mm_unpacklo_epi8(v0, v2); // a0 a4 a8 a12 ... u1 = _mm_unpacklo_epi8(v1, v3); // a2 a6 a10 a14 ... @@ -1266,13 +1296,13 @@ inline void v_load_deinterleave(const uchar* ptr, v_uint8x16& a, v_uint8x16& b, v0 = _mm_unpacklo_epi8(u0, u1); // a0 a2 a4 a6 ... v1 = _mm_unpacklo_epi8(u2, u3); // a1 a3 a5 a7 ... - v2 = _mm_unpackhi_epi8(u0, u1); // b0 b2 b4 b6 ... - v3 = _mm_unpackhi_epi8(u2, u3); // b1 b3 b5 b7 ... + v2 = _mm_unpackhi_epi8(u0, u1); // c0 c2 c4 c6 ... + v3 = _mm_unpackhi_epi8(u2, u3); // c1 c3 c5 c7 ... 
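// The recombination below is the substance of this hunk: previously b.val and
// d.val were both assigned _mm_unpacklo_epi8(v2, v3), so two of the four output
// channels came back identical and _mm_unpackhi_epi8(v2, v3) was never used.
// With the fix each channel takes a distinct half:
//   a = lo(v0,v1)   b = hi(v0,v1)   c = lo(v2,v3)   d = hi(v2,v3)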
a.val = _mm_unpacklo_epi8(v0, v1); - b.val = _mm_unpacklo_epi8(v2, v3); - c.val = _mm_unpackhi_epi8(v0, v1); - d.val = _mm_unpacklo_epi8(v2, v3); + b.val = _mm_unpackhi_epi8(v0, v1); + c.val = _mm_unpacklo_epi8(v2, v3); + d.val = _mm_unpackhi_epi8(v2, v3); } inline void v_load_deinterleave(const ushort* ptr, v_uint16x8& a, v_uint16x8& b, v_uint16x8& c) @@ -1560,6 +1590,8 @@ inline v_float64x2 v_cvt_f64(const v_float32x4& a) return v_float64x2(_mm_cvtps_pd(a.val)); } +//! @endcond + } #endif diff --git a/modules/hal/test/test_intrin.cpp b/modules/hal/test/test_intrin.cpp new file mode 100644 index 000000000..ae043dd20 --- /dev/null +++ b/modules/hal/test/test_intrin.cpp @@ -0,0 +1,864 @@ +#include "test_intrin_utils.hpp" +#include + +using namespace cv; + +template struct TheTest +{ + typedef typename R::lane_type LaneType; + + TheTest & test_loadstore() + { + AlignedData data; + AlignedData out; + + // check if addresses are aligned and unaligned respectively + EXPECT_EQ((size_t)0, (size_t)&data.a.d % 16); + EXPECT_NE((size_t)0, (size_t)&data.u.d % 16); + EXPECT_EQ((size_t)0, (size_t)&out.a.d % 16); + EXPECT_NE((size_t)0, (size_t)&out.u.d % 16); + + // check some initialization methods + R r1 = data.a; + R r2 = v_load(data.u.d); + R r3 = v_load_aligned(data.a.d); + R r4(r2); + EXPECT_EQ(data.a[0], r1.get0()); + EXPECT_EQ(data.u[0], r2.get0()); + EXPECT_EQ(data.a[0], r3.get0()); + EXPECT_EQ(data.u[0], r4.get0()); + + // check some store methods + out.u.clear(); + out.a.clear(); + v_store(out.u.d, r1); + v_store_aligned(out.a.d, r2); + EXPECT_EQ(data.a, out.a); + EXPECT_EQ(data.u, out.u); + + // check more store methods + Data d, res(0); + R r5 = d; + v_store_high(res.mid(), r5); + v_store_low(res.d, r5); + EXPECT_EQ(d, res); + + // check halves load correctness + res.clear(); + R r6 = v_load_halves(d.d, d.mid()); + v_store(res.d, r6); + EXPECT_EQ(d, res); + + // zero, all + Data resZ = RegTrait::zero(); + Data resV = RegTrait::all(8); + for (int i = 0; i < R::nlanes; ++i) + { + EXPECT_EQ((LaneType)0, resZ[i]); + EXPECT_EQ((LaneType)8, resV[i]); + } + + // reinterpret_as + v_uint8x16 vu8 = v_reinterpret_as_u8(r1); out.a.clear(); v_store((uchar*)out.a.d, vu8); EXPECT_EQ(data.a, out.a); + v_int8x16 vs8 = v_reinterpret_as_s8(r1); out.a.clear(); v_store((schar*)out.a.d, vs8); EXPECT_EQ(data.a, out.a); + v_uint16x8 vu16 = v_reinterpret_as_u16(r1); out.a.clear(); v_store((ushort*)out.a.d, vu16); EXPECT_EQ(data.a, out.a); + v_int16x8 vs16 = v_reinterpret_as_s16(r1); out.a.clear(); v_store((short*)out.a.d, vs16); EXPECT_EQ(data.a, out.a); + v_uint32x4 vu32 = v_reinterpret_as_u32(r1); out.a.clear(); v_store((unsigned*)out.a.d, vu32); EXPECT_EQ(data.a, out.a); + v_int32x4 vs32 = v_reinterpret_as_s32(r1); out.a.clear(); v_store((int*)out.a.d, vs32); EXPECT_EQ(data.a, out.a); + v_uint64x2 vu64 = v_reinterpret_as_u64(r1); out.a.clear(); v_store((uint64*)out.a.d, vu64); EXPECT_EQ(data.a, out.a); + v_int64x2 vs64 = v_reinterpret_as_s64(r1); out.a.clear(); v_store((int64*)out.a.d, vs64); EXPECT_EQ(data.a, out.a); + v_float32x4 vf32 = v_reinterpret_as_f32(r1); out.a.clear(); v_store((float*)out.a.d, vf32); EXPECT_EQ(data.a, out.a); +#if CV_SIMD128_64F + v_float64x2 vf64 = v_reinterpret_as_f64(r1); out.a.clear(); v_store((double*)out.a.d, vf64); EXPECT_EQ(data.a, out.a); +#endif + + return *this; + } + + TheTest & test_interleave() + { + Data data1, data2, data3, data4; + data2 += 20; + data3 += 40; + data4 += 60; + + + R a = data1, b = data2, c = data3; + R d = data1, e = data2, f = data3, g = data4; + + 
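// A quick sketch of the layout being tested: v_store_interleave writes the
// channels element-wise, e.g. a0 b0 c0 a1 b1 c1 ... for three channels, and
// v_load_deinterleave splits such a buffer back into one register per channel,
// so the round trip below must reproduce data1..data4 exactly.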
LaneType buf3[R::nlanes * 3]; + LaneType buf4[R::nlanes * 4]; + + v_store_interleave(buf3, a, b, c); + v_store_interleave(buf4, d, e, f, g); + + Data z(0); + a = b = c = d = e = f = g = z; + + v_load_deinterleave(buf3, a, b, c); + v_load_deinterleave(buf4, d, e, f, g); + + for (int i = 0; i < R::nlanes; ++i) + { + EXPECT_EQ(data1, Data(a)); + EXPECT_EQ(data2, Data(b)); + EXPECT_EQ(data3, Data(c)); + + EXPECT_EQ(data1, Data(d)); + EXPECT_EQ(data2, Data(e)); + EXPECT_EQ(data3, Data(f)); + EXPECT_EQ(data4, Data(g)); + } + + return *this; + } + + // v_expand and v_load_expand + TheTest & test_expand() + { + typedef typename RegTrait::w_reg Rx2; + Data dataA; + R a = dataA; + + Data resB = v_load_expand(dataA.d); + + Rx2 c, d; + v_expand(a, c, d); + + Data resC = c, resD = d; + const int n = Rx2::nlanes; + for (int i = 0; i < n; ++i) + { + EXPECT_EQ(dataA[i], resB[i]); + EXPECT_EQ(dataA[i], resC[i]); + EXPECT_EQ(dataA[i + n], resD[i]); + } + + return *this; + } + + TheTest & test_expand_q() + { + typedef typename RegTrait::q_reg Rx4; + Data data; + Data out = v_load_expand_q(data.d); + const int n = Rx4::nlanes; + for (int i = 0; i < n; ++i) + EXPECT_EQ(data[i], out[i]); + + return *this; + } + + TheTest & test_addsub() + { + Data dataA, dataB; + dataB.reverse(); + R a = dataA, b = dataB; + + Data resC = a + b, resD = a - b; + for (int i = 0; i < R::nlanes; ++i) + { + EXPECT_EQ(saturate_cast(dataA[i] + dataB[i]), resC[i]); + EXPECT_EQ(saturate_cast(dataA[i] - dataB[i]), resD[i]); + } + + return *this; + } + + TheTest & test_addsub_wrap() + { + Data dataA, dataB; + dataB.reverse(); + R a = dataA, b = dataB; + + Data resC = v_add_wrap(a, b), + resD = v_sub_wrap(a, b); + for (int i = 0; i < R::nlanes; ++i) + { + EXPECT_EQ((LaneType)(dataA[i] + dataB[i]), resC[i]); + EXPECT_EQ((LaneType)(dataA[i] - dataB[i]), resD[i]); + } + return *this; + } + + TheTest & test_mul() + { + Data dataA, dataB; + dataB.reverse(); + R a = dataA, b = dataB; + + Data resC = a * b; + for (int i = 0; i < R::nlanes; ++i) + { + EXPECT_EQ(dataA[i] * dataB[i], resC[i]); + } + + return *this; + } + + TheTest & test_div() + { + Data dataA, dataB; + dataB.reverse(); + R a = dataA, b = dataB; + + Data resC = a / b; + for (int i = 0; i < R::nlanes; ++i) + { + EXPECT_EQ(dataA[i] / dataB[i], resC[i]); + } + + return *this; + } + + TheTest & test_mul_expand() + { + typedef typename RegTrait::w_reg Rx2; + Data dataA, dataB(2); + R a = dataA, b = dataB; + Rx2 c, d; + + v_mul_expand(a, b, c, d); + + Data resC = c, resD = d; + const int n = R::nlanes / 2; + for (int i = 0; i < n; ++i) + { + EXPECT_EQ((typename Rx2::lane_type)dataA[i] * dataB[i], resC[i]); + EXPECT_EQ((typename Rx2::lane_type)dataA[i + n] * dataB[i + n], resD[i]); + } + + return *this; + } + + template + TheTest & test_shift() + { + Data dataA; + R a = dataA; + + Data resB = a << s, resC = v_shl(a), resD = a >> s, resE = v_shr(a); + for (int i = 0; i < R::nlanes; ++i) + { + EXPECT_EQ(dataA[i] << s, resB[i]); + EXPECT_EQ(dataA[i] << s, resC[i]); + EXPECT_EQ(dataA[i] >> s, resD[i]); + EXPECT_EQ(dataA[i] >> s, resE[i]); + } + return *this; + } + + TheTest & test_cmp() + { + Data dataA, dataB; + dataB.reverse(); + dataB += 1; + R a = dataA, b = dataB; + + Data resC = (a == b); + Data resD = (a != b); + Data resE = (a > b); + Data resF = (a >= b); + Data resG = (a < b); + Data resH = (a <= b); + + for (int i = 0; i < R::nlanes; ++i) + { + EXPECT_EQ(dataA[i] == dataB[i], resC[i] != 0); + EXPECT_EQ(dataA[i] != dataB[i], resD[i] != 0); + EXPECT_EQ(dataA[i] > dataB[i], resE[i] != 
0); + EXPECT_EQ(dataA[i] >= dataB[i], resF[i] != 0); + EXPECT_EQ(dataA[i] < dataB[i], resG[i] != 0); + EXPECT_EQ(dataA[i] <= dataB[i], resH[i] != 0); + } + return *this; + } + + TheTest & test_dot_prod() + { + typedef typename RegTrait::w_reg Rx2; + Data dataA, dataB(2); + R a = dataA, b = dataB; + + Data res = v_dotprod(a, b); + + const int n = R::nlanes / 2; + for (int i = 0; i < n; ++i) + { + EXPECT_EQ(dataA[i*2] * dataB[i*2] + dataA[i*2 + 1] * dataB[i*2 + 1], res[i]); + } + return *this; + } + + TheTest & test_logic() + { + Data dataA, dataB(2); + R a = dataA, b = dataB; + + Data resC = a & b, resD = a | b, resE = a ^ b, resF = ~a; + for (int i = 0; i < R::nlanes; ++i) + { + EXPECT_EQ(dataA[i] & dataB[i], resC[i]); + EXPECT_EQ(dataA[i] | dataB[i], resD[i]); + EXPECT_EQ(dataA[i] ^ dataB[i], resE[i]); + EXPECT_EQ((LaneType)~dataA[i], resF[i]); + } + + return *this; + } + + TheTest & test_sqrt_abs() + { + Data dataA, dataD; + dataD *= -1.0; + R a = dataA, d = dataD; + + Data resB = v_sqrt(a), resC = v_invsqrt(a), resE = v_abs(d); + for (int i = 0; i < R::nlanes; ++i) + { + EXPECT_FLOAT_EQ((float)std::sqrt(dataA[i]), (float)resB[i]); + EXPECT_FLOAT_EQ((float)1/std::sqrt(dataA[i]), (float)resC[i]); + EXPECT_FLOAT_EQ((float)abs(dataA[i]), (float)resE[i]); + } + + return *this; + } + + TheTest & test_min_max() + { + Data dataA, dataB; + dataB.reverse(); + R a = dataA, b = dataB; + + Data resC = v_min(a, b), resD = v_max(a, b); + for (int i = 0; i < R::nlanes; ++i) + { + EXPECT_EQ(std::min(dataA[i], dataB[i]), resC[i]); + EXPECT_EQ(std::max(dataA[i], dataB[i]), resD[i]); + } + + return *this; + } + + TheTest & test_absdiff() + { + typedef typename RegTrait::u_reg Ru; + typedef typename Ru::lane_type u_type; + Data dataA(std::numeric_limits::max()), + dataB(std::numeric_limits::min()); + dataA[0] = (LaneType)-1; + dataB[0] = 1; + dataA[1] = 2; + dataB[1] = (LaneType)-2; + R a = dataA, b = dataB; + Data resC = v_absdiff(a, b); + const u_type mask = std::numeric_limits::is_signed ? (u_type)(1 << (sizeof(u_type)*8 - 1)) : 0; + for (int i = 0; i < Ru::nlanes; ++i) + { + u_type uA = dataA[i] ^ mask; + u_type uB = dataB[i] ^ mask; + EXPECT_EQ(uA > uB ? uA - uB : uB - uA, resC[i]); + } + return *this; + } + + TheTest & test_float_absdiff() + { + Data dataA(std::numeric_limits::max()), + dataB(std::numeric_limits::min()); + dataA[0] = -1; + dataB[0] = 1; + dataA[1] = 2; + dataB[1] = -2; + R a = dataA, b = dataB; + Data resC = v_absdiff(a, b); + for (int i = 0; i < R::nlanes; ++i) + { + EXPECT_EQ(dataA[i] > dataB[i] ? 
dataA[i] - dataB[i] : dataB[i] - dataA[i], resC[i]); + } + return *this; + } + + TheTest & test_reduce() + { + Data dataA; + R a = dataA; + EXPECT_EQ((LaneType)1, v_reduce_min(a)); + EXPECT_EQ((LaneType)R::nlanes, v_reduce_max(a)); + EXPECT_EQ((LaneType)(1 + R::nlanes)*2, v_reduce_sum(a)); + return *this; + } + + TheTest & test_mask() + { + Data dataA, dataB, dataC, dataD(1), dataE(2); + dataA[1] *= (LaneType)-1; + dataC *= (LaneType)-1; + R a = dataA, b = dataB, c = dataC, d = dataD, e = dataE; + + int m = v_signmask(a); + EXPECT_EQ(2, m); + + EXPECT_EQ(false, v_check_all(a)); + EXPECT_EQ(false, v_check_all(b)); + EXPECT_EQ(true, v_check_all(c)); + + EXPECT_EQ(true, v_check_any(a)); + EXPECT_EQ(false, v_check_any(b)); + EXPECT_EQ(true, v_check_any(c)); + + typedef V_TypeTraits Traits; + typedef typename Traits::int_type int_type; + + R f = v_select(b, d, e); + Data resF = f; + for (int i = 0; i < R::nlanes; ++i) + { + int_type m2 = Traits::reinterpret_int(dataB[i]); + EXPECT_EQ((Traits::reinterpret_int(dataD[i]) & m2) + | (Traits::reinterpret_int(dataE[i]) & ~m2), + Traits::reinterpret_int(resF[i])); + } + + return *this; + } + + template + TheTest & test_pack() + { + typedef typename RegTrait::w_reg Rx2; + typedef typename Rx2::lane_type w_type; + Data dataA, dataB; + dataA += std::numeric_limits::is_signed ? -10 : 10; + dataB *= 10; + Rx2 a = dataA, b = dataB; + + Data resC = v_pack(a, b); + Data resD = v_rshr_pack(a, b); + + Data resE(0); + v_pack_store(resE.d, b); + + Data resF(0); + v_rshr_pack_store(resF.d, b); + + const int n = Rx2::nlanes; + const w_type add = (w_type)1 << (s - 1); + for (int i = 0; i < n; ++i) + { + EXPECT_EQ(saturate_cast(dataA[i]), resC[i]); + EXPECT_EQ(saturate_cast(dataB[i]), resC[i + n]); + EXPECT_EQ(saturate_cast((dataA[i] + add) >> s), resD[i]); + EXPECT_EQ(saturate_cast((dataB[i] + add) >> s), resD[i + n]); + EXPECT_EQ(saturate_cast(dataB[i]), resE[i]); + EXPECT_EQ((LaneType)0, resE[i + n]); + EXPECT_EQ(saturate_cast((dataB[i] + add) >> s), resF[i]); + EXPECT_EQ((LaneType)0, resF[i + n]); + } + return *this; + } + + template + TheTest & test_pack_u() + { + typedef typename RegTrait::w_reg Rx2; + typedef typename RegTrait::int_reg Ri2; + typedef typename Ri2::lane_type w_type; + + Data dataA, dataB; + dataA += -10; + dataB *= 10; + Ri2 a = dataA, b = dataB; + + Data resC = v_pack_u(a, b); + Data resD = v_rshr_pack_u(a, b); + + Data resE(0); + v_pack_u_store(resE.d, b); + + Data resF(0); + v_rshr_pack_u_store(resF.d, b); + + const int n = Ri2::nlanes; + const w_type add = (w_type)1 << (s - 1); + for (int i = 0; i < n; ++i) + { + EXPECT_EQ(saturate_cast(dataA[i]), resC[i]); + EXPECT_EQ(saturate_cast(dataB[i]), resC[i + n]); + EXPECT_EQ(saturate_cast((dataA[i] + add) >> s), resD[i]); + EXPECT_EQ(saturate_cast((dataB[i] + add) >> s), resD[i + n]); + EXPECT_EQ(saturate_cast(dataB[i]), resE[i]); + EXPECT_EQ((LaneType)0, resE[i + n]); + EXPECT_EQ(saturate_cast((dataB[i] + add) >> s), resF[i]); + EXPECT_EQ((LaneType)0, resF[i + n]); + } + return *this; + } + + TheTest & test_unpack() + { + Data dataA, dataB; + dataB *= 10; + R a = dataA, b = dataB; + + R c, d, e, f, lo, hi; + v_zip(a, b, c, d); + v_recombine(a, b, e, f); + lo = v_combine_low(a, b); + hi = v_combine_high(a, b); + + Data resC = c, resD = d, resE = e, resF = f, resLo = lo, resHi = hi; + + const int n = R::nlanes/2; + for (int i = 0; i < n; ++i) + { + EXPECT_EQ(dataA[i], resC[i*2]); + EXPECT_EQ(dataB[i], resC[i*2+1]); + EXPECT_EQ(dataA[i+n], resD[i*2]); + EXPECT_EQ(dataB[i+n], resD[i*2+1]); + + 
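// Expected layouts for the remaining outputs, with n = nlanes/2:
// v_recombine gives e = (low half of a, low half of b) and
// f = (high half of a, high half of b); v_combine_low and v_combine_high
// produce the same e and f, while v_zip above interleaves lanes pairwise.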
EXPECT_EQ(dataA[i], resE[i]); + EXPECT_EQ(dataB[i], resE[i+n]); + EXPECT_EQ(dataA[i+n], resF[i]); + EXPECT_EQ(dataB[i+n], resF[i+n]); + + EXPECT_EQ(dataA[i], resLo[i]); + EXPECT_EQ(dataB[i], resLo[i+n]); + EXPECT_EQ(dataA[i+n], resHi[i]); + EXPECT_EQ(dataB[i+n], resHi[i+n]); + } + + return *this; + } + + template + TheTest & test_extract() + { + Data dataA, dataB; + dataB *= 10; + R a = dataA, b = dataB; + + Data resC = v_extract(a, b); + + for (int i = 0; i < R::nlanes; ++i) + { + if (i + s >= R::nlanes) + EXPECT_EQ(dataB[i - R::nlanes + s], resC[i]); + else + EXPECT_EQ(dataA[i + s], resC[i]); + } + + return *this; + } + + TheTest & test_float_math() + { + typedef typename RegTrait::int_reg Ri; + Data data1, data2, data3; + data1 *= 1.1; + data2 += 10; + R a1 = data1, a2 = data2, a3 = data3; + + Data resB = v_round(a1), + resC = v_trunc(a1), + resD = v_floor(a1), + resE = v_ceil(a1); + + Data resF = v_magnitude(a1, a2), + resG = v_sqr_magnitude(a1, a2), + resH = v_muladd(a1, a2, a3); + + for (int i = 0; i < R::nlanes; ++i) + { + EXPECT_EQ(cvRound(data1[i]), resB[i]); + EXPECT_EQ((typename Ri::lane_type)data1[i], resC[i]); + EXPECT_EQ(cvFloor(data1[i]), resD[i]); + EXPECT_EQ(cvCeil(data1[i]), resE[i]); + + EXPECT_DOUBLE_EQ(std::sqrt(data1[i]*data1[i] + data2[i]*data2[i]), resF[i]); + EXPECT_DOUBLE_EQ(data1[i]*data1[i] + data2[i]*data2[i], resG[i]); + EXPECT_DOUBLE_EQ(data1[i]*data2[i] + data3[i], resH[i]); + } + + return *this; + } + + TheTest & test_float_cvt32() + { + typedef v_float32x4 Rt; + Data dataA; + dataA *= 1.1; + R a = dataA; + Rt b = v_cvt_f32(a); + Data resB = b; + int n = std::min(Rt::nlanes, R::nlanes); + for (int i = 0; i < n; ++i) + { + EXPECT_EQ((typename Rt::lane_type)dataA[i], resB[i]); + } + return *this; + } + + TheTest & test_float_cvt64() + { +#if CV_SIMD128_64F + typedef v_float64x2 Rt; + Data dataA; + dataA *= 1.1; + R a = dataA; + Rt b = v_cvt_f64(a); + Data resB = b; + int n = std::min(Rt::nlanes, R::nlanes); + for (int i = 0; i < n; ++i) + { + EXPECT_EQ((typename Rt::lane_type)dataA[i], resB[i]); + } +#endif + return *this; + } + + TheTest & test_matmul() + { + Data dataV, dataA, dataB, dataC, dataD; + dataB.reverse(); + dataC += 2; + dataD *= 0.3; + R v = dataV, a = dataA, b = dataB, c = dataC, d = dataD; + + Data res = v_matmul(v, a, b, c, d); + for (int i = 0; i < R::nlanes; ++i) + { + LaneType val = dataV[0] * dataA[i] + + dataV[1] * dataB[i] + + dataV[2] * dataC[i] + + dataV[3] * dataD[i]; + EXPECT_DOUBLE_EQ(val, res[i]); + } + return *this; + } + + TheTest & test_transpose() + { + Data dataA, dataB, dataC, dataD; + dataB *= 5; + dataC *= 10; + dataD *= 15; + R a = dataA, b = dataB, c = dataC, d = dataD; + R e, f, g, h; + v_transpose4x4(a, b, c, d, + e, f, g, h); + + Data res[4] = {e, f, g, h}; + for (int i = 0; i < R::nlanes; ++i) + { + EXPECT_EQ(dataA[i], res[i][0]); + EXPECT_EQ(dataB[i], res[i][1]); + EXPECT_EQ(dataC[i], res[i][2]); + EXPECT_EQ(dataD[i], res[i][3]); + } + return *this; + } + +}; + + +//============= 8-bit integer ===================================================================== + +TEST(hal_intrin, uint8x16) { + TheTest() + .test_loadstore() + .test_interleave() + .test_expand() + .test_expand_q() + .test_addsub() + .test_addsub_wrap() + .test_cmp() + .test_logic() + .test_min_max() + .test_absdiff() + .test_mask() + .test_pack<1>().test_pack<2>().test_pack<3>().test_pack<8>() + .test_pack_u<1>().test_pack_u<2>().test_pack_u<3>().test_pack_u<8>() + .test_unpack() + 
.test_extract<0>().test_extract<1>().test_extract<8>().test_extract<15>() + ; +} + +TEST(hal_intrin, int8x16) { + TheTest() + .test_loadstore() + .test_interleave() + .test_expand() + .test_expand_q() + .test_addsub() + .test_addsub_wrap() + .test_cmp() + .test_logic() + .test_min_max() + .test_absdiff() + .test_mask() + .test_pack<1>().test_pack<2>().test_pack<3>().test_pack<8>() + .test_unpack() + .test_extract<0>().test_extract<1>().test_extract<8>().test_extract<15>() + ; +} + +//============= 16-bit integer ===================================================================== + +TEST(hal_intrin, uint16x8) { + TheTest() + .test_loadstore() + .test_interleave() + .test_expand() + .test_addsub() + .test_addsub_wrap() + .test_mul() + .test_mul_expand() + .test_cmp() + .test_shift<1>() + .test_shift<8>() + .test_logic() + .test_min_max() + .test_absdiff() + .test_mask() + .test_pack<1>().test_pack<2>().test_pack<7>().test_pack<16>() + .test_pack_u<1>().test_pack_u<2>().test_pack_u<7>().test_pack_u<16>() + .test_unpack() + .test_extract<0>().test_extract<1>().test_extract<4>().test_extract<7>() + ; +} + +TEST(hal_intrin, int16x8) { + TheTest() + .test_loadstore() + .test_interleave() + .test_expand() + .test_addsub() + .test_addsub_wrap() + .test_mul() + .test_mul_expand() + .test_cmp() + .test_shift<1>() + .test_shift<8>() + .test_dot_prod() + .test_logic() + .test_min_max() + .test_absdiff() + .test_mask() + .test_pack<1>().test_pack<2>().test_pack<7>().test_pack<16>() + .test_unpack() + .test_extract<0>().test_extract<1>().test_extract<4>().test_extract<7>() + ; +} + +//============= 32-bit integer ===================================================================== + +TEST(hal_intrin, uint32x4) { + TheTest() + .test_loadstore() + .test_interleave() + .test_expand() + .test_addsub() + .test_mul() + .test_mul_expand() + .test_cmp() + .test_shift<1>() + .test_shift<8>() + .test_logic() + .test_min_max() + .test_absdiff() + .test_reduce() + .test_mask() + .test_pack<1>().test_pack<2>().test_pack<15>().test_pack<32>() + .test_unpack() + .test_extract<0>().test_extract<1>().test_extract<2>().test_extract<3>() + .test_transpose() + ; +} + +TEST(hal_intrin, int32x4) { + TheTest() + .test_loadstore() + .test_interleave() + .test_expand() + .test_addsub() + .test_mul() + .test_cmp() + .test_shift<1>().test_shift<8>() + .test_logic() + .test_min_max() + .test_absdiff() + .test_reduce() + .test_mask() + .test_pack<1>().test_pack<2>().test_pack<15>().test_pack<32>() + .test_unpack() + .test_extract<0>().test_extract<1>().test_extract<2>().test_extract<3>() + .test_float_cvt32() + .test_float_cvt64() + .test_transpose() + ; +} + +//============= 64-bit integer ===================================================================== + +TEST(hal_intrin, uint64x2) { + TheTest() + .test_loadstore() + .test_addsub() + .test_shift<1>().test_shift<8>() + .test_logic() + .test_extract<0>().test_extract<1>() + ; +} + +TEST(hal_intrin, int64x2) { + TheTest() + .test_loadstore() + .test_addsub() + .test_shift<1>().test_shift<8>() + .test_logic() + .test_extract<0>().test_extract<1>() + ; +} + +//============= Floating point ===================================================================== + +TEST(hal_intrin, float32x4) { + TheTest() + .test_loadstore() + .test_interleave() + .test_addsub() + .test_mul() + .test_div() + .test_cmp() + .test_sqrt_abs() + .test_min_max() + .test_float_absdiff() + .test_reduce() + .test_mask() + .test_unpack() + .test_float_math() + .test_float_cvt64() + .test_matmul() + 
.test_transpose() + ; +} + +#if CV_SIMD128_64F +TEST(hal_intrin, float64x2) { + TheTest() + .test_loadstore() + .test_addsub() + .test_mul() + .test_div() + .test_cmp() + .test_sqrt_abs() + .test_min_max() + .test_float_absdiff() + .test_mask() + .test_unpack() + .test_float_math() + .test_float_cvt32() + ; +} +#endif diff --git a/modules/hal/test/test_intrin_utils.hpp b/modules/hal/test/test_intrin_utils.hpp new file mode 100644 index 000000000..47473ae46 --- /dev/null +++ b/modules/hal/test/test_intrin_utils.hpp @@ -0,0 +1,234 @@ +#ifndef _TEST_UTILS_HPP_ +#define _TEST_UTILS_HPP_ + +#include "opencv2/hal/intrin.hpp" +#include "opencv2/ts.hpp" +#include +#include + +template struct Data; +template struct initializer; + +template <> struct initializer<16> +{ + template static R init(const Data & d) + { + return R(d[0], d[1], d[2], d[3], d[4], d[5], d[6], d[7], d[8], d[9], d[10], d[11], d[12], d[13], d[14], d[15]); + } +}; + +template <> struct initializer<8> +{ + template static R init(const Data & d) + { + return R(d[0], d[1], d[2], d[3], d[4], d[5], d[6], d[7]); + } +}; + +template <> struct initializer<4> +{ + template static R init(const Data & d) + { + return R(d[0], d[1], d[2], d[3]); + } +}; + +template <> struct initializer<2> +{ + template static R init(const Data & d) + { + return R(d[0], d[1]); + } +}; + +//================================================================================================== + +template struct Data +{ + typedef typename R::lane_type LaneType; + Data() + { + for (int i = 0; i < R::nlanes; ++i) + d[i] = (LaneType)(i + 1); + } + Data(LaneType val) + { + fill(val); + } + Data(const R & r) + { + *this = r; + } + operator R () + { + return initializer().init(*this); + } + Data & operator=(const R & r) + { + v_store(d, r); + return *this; + } + template Data & operator*=(T m) + { + for (int i = 0; i < R::nlanes; ++i) + d[i] *= (LaneType)m; + return *this; + } + template Data & operator+=(T m) + { + for (int i = 0; i < R::nlanes; ++i) + d[i] += (LaneType)m; + return *this; + } + void fill(LaneType val) + { + for (int i = 0; i < R::nlanes; ++i) + d[i] = val; + } + void reverse() + { + for (int i = 0; i < R::nlanes / 2; ++i) + std::swap(d[i], d[R::nlanes - i - 1]); + } + const LaneType & operator[](int i) const + { + CV_Assert(i >= 0 && i < R::nlanes); + return d[i]; + } + LaneType & operator[](int i) + { + CV_Assert(i >= 0 && i < R::nlanes); + return d[i]; + } + const LaneType * mid() const + { + return d + R::nlanes / 2; + } + LaneType * mid() + { + return d + R::nlanes / 2; + } + bool operator==(const Data & other) const + { + for (int i = 0; i < R::nlanes; ++i) + if (d[i] != other.d[i]) + return false; + return true; + } + void clear() + { + fill(0); + } + bool isZero() const + { + return isValue(0); + } + bool isValue(uchar val) const + { + for (int i = 0; i < R::nlanes; ++i) + if (d[i] != val) + return false; + return true; + } + + LaneType d[R::nlanes]; +}; + +template struct AlignedData +{ + Data CV_DECL_ALIGNED(16) a; // aligned + char dummy; + Data u; // unaligned +}; + +template std::ostream & operator<<(std::ostream & out, const Data & d) +{ + out << "{ "; + for (int i = 0; i < R::nlanes; ++i) + { + // out << std::hex << +V_TypeTraits::reinterpret_int(d.d[i]); + out << +d.d[i]; + if (i + 1 < R::nlanes) + out << ", "; + } + out << " }"; + return out; +} + +//================================================================================================== + +template struct RegTrait; + +template <> struct RegTrait { + typedef cv::v_uint16x8 w_reg; 
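// RegTrait maps a register type to the related types the tests rely on:
// w_reg is the double-width register (u8 -> u16), q_reg the quad-width one
// (u8 -> u32), u_reg the unsigned counterpart (the type itself for floats)
// and int_reg the same-width signed integer type; zero()/all() simply wrap
// the matching v_setzero_* / v_setall_* initializers.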
+ typedef cv::v_uint32x4 q_reg; + typedef cv::v_uint8x16 u_reg; + static cv::v_uint8x16 zero() { return cv::v_setzero_u8(); } + static cv::v_uint8x16 all(uchar val) { return cv::v_setall_u8(val); } +}; +template <> struct RegTrait { + typedef cv::v_int16x8 w_reg; + typedef cv::v_int32x4 q_reg; + typedef cv::v_uint8x16 u_reg; + static cv::v_int8x16 zero() { return cv::v_setzero_s8(); } + static cv::v_int8x16 all(schar val) { return cv::v_setall_s8(val); } +}; + +template <> struct RegTrait { + typedef cv::v_uint32x4 w_reg; + typedef cv::v_int16x8 int_reg; + typedef cv::v_uint16x8 u_reg; + static cv::v_uint16x8 zero() { return cv::v_setzero_u16(); } + static cv::v_uint16x8 all(ushort val) { return cv::v_setall_u16(val); } +}; + +template <> struct RegTrait { + typedef cv::v_int32x4 w_reg; + typedef cv::v_uint16x8 u_reg; + static cv::v_int16x8 zero() { return cv::v_setzero_s16(); } + static cv::v_int16x8 all(short val) { return cv::v_setall_s16(val); } +}; + +template <> struct RegTrait { + typedef cv::v_uint64x2 w_reg; + typedef cv::v_int32x4 int_reg; + typedef cv::v_uint32x4 u_reg; + static cv::v_uint32x4 zero() { return cv::v_setzero_u32(); } + static cv::v_uint32x4 all(unsigned val) { return cv::v_setall_u32(val); } +}; + +template <> struct RegTrait { + typedef cv::v_int64x2 w_reg; + typedef cv::v_uint32x4 u_reg; + static cv::v_int32x4 zero() { return cv::v_setzero_s32(); } + static cv::v_int32x4 all(int val) { return cv::v_setall_s32(val); } +}; + +template <> struct RegTrait { + static cv::v_uint64x2 zero() { return cv::v_setzero_u64(); } + static cv::v_uint64x2 all(uint64 val) { return cv::v_setall_u64(val); } +}; + +template <> struct RegTrait { + static cv::v_int64x2 zero() { return cv::v_setzero_s64(); } + static cv::v_int64x2 all(int64 val) { return cv::v_setall_s64(val); } +}; + +template <> struct RegTrait { + typedef cv::v_int32x4 int_reg; + typedef cv::v_float32x4 u_reg; + static cv::v_float32x4 zero() { return cv::v_setzero_f32(); } + static cv::v_float32x4 all(float val) { return cv::v_setall_f32(val); } +}; + +#if CV_SIMD128_64F +template <> struct RegTrait { + typedef cv::v_int32x4 int_reg; + typedef cv::v_float64x2 u_reg; + static cv::v_float64x2 zero() { return cv::v_setzero_f64(); } + static cv::v_float64x2 all(double val) { return cv::v_setall_f64(val); } +}; + +#endif + +#endif diff --git a/modules/hal/test/test_main.cpp b/modules/hal/test/test_main.cpp new file mode 100644 index 000000000..d337a5ba7 --- /dev/null +++ b/modules/hal/test/test_main.cpp @@ -0,0 +1,3 @@ +#include "opencv2/ts.hpp" + +CV_TEST_MAIN("cv") diff --git a/modules/hal/test/test_precomp.hpp b/modules/hal/test/test_precomp.hpp new file mode 100644 index 000000000..387b7ba2b --- /dev/null +++ b/modules/hal/test/test_precomp.hpp @@ -0,0 +1,11 @@ +#ifndef __OPENCV_HAL_TEST_PRECOMP_HPP__ +#define __OPENCV_HAL_TEST_PRECOMP_HPP__ + +#include +#include +#include "opencv2/ts.hpp" +#include "opencv2/hal.hpp" +#include "opencv2/hal/defs.h" +#include "opencv2/hal/intrin.hpp" + +#endif From 603864dba282fe5d497c4a1b39ec5aac2cc4aebb Mon Sep 17 00:00:00 2001 From: Maksim Shabunin Date: Mon, 14 Sep 2015 12:15:56 +0300 Subject: [PATCH 2/2] Warning fix --- modules/hal/test/test_intrin.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/hal/test/test_intrin.cpp b/modules/hal/test/test_intrin.cpp index ae043dd20..681fe3fd0 100644 --- a/modules/hal/test/test_intrin.cpp +++ b/modules/hal/test/test_intrin.cpp @@ -317,7 +317,7 @@ template struct TheTest for (int i = 0; i < R::nlanes; ++i) { 
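// A note on the one-line fix below: for the v_float64x2 instantiation the old
// expression (float)1/std::sqrt(dataA[i]) evaluated in double and was only
// narrowed to float inside EXPECT_FLOAT_EQ, which is presumably the implicit
// conversion the compiler warned about; moving the cast onto the sqrt result
// keeps the whole expression in single precision.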
EXPECT_FLOAT_EQ((float)std::sqrt(dataA[i]), (float)resB[i]); - EXPECT_FLOAT_EQ((float)1/std::sqrt(dataA[i]), (float)resC[i]); + EXPECT_FLOAT_EQ(1/(float)std::sqrt(dataA[i]), (float)resC[i]); EXPECT_FLOAT_EQ((float)abs(dataA[i]), (float)resE[i]); }
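
For readers coming to this patch cold, here is a minimal sketch of how code
downstream of the HAL is expected to use the intrinsics exercised above. Only
v_load, v_absdiff, v_store and v_uint8x16::nlanes from this patch are assumed;
the function name and the scalar tail loop are illustrative, not part of the
patch. When neither SSE2 nor NEON is enabled (CV_SIMD128 == 0), the same source
compiles against the C++ fallback implementation of the intrinsics.

    #include <cstdlib>
    #include "opencv2/hal/intrin.hpp"

    // Per-element absolute difference of two uchar arrays, 16 lanes at a time.
    static void absdiff8u(const uchar* a, const uchar* b, uchar* dst, int len)
    {
        int i = 0;
        for (; i <= len - cv::v_uint8x16::nlanes; i += cv::v_uint8x16::nlanes)
        {
            cv::v_uint8x16 va = cv::v_load(a + i); // unaligned 128-bit load
            cv::v_uint8x16 vb = cv::v_load(b + i);
            cv::v_store(dst + i, cv::v_absdiff(va, vb));
        }
        for (; i < len; ++i) // scalar tail for the remaining elements
            dst[i] = (uchar)std::abs(a[i] - b[i]);
    }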