Merge remote-tracking branch 'origin/master' into merge-2.4

This commit is contained in:
Roman Donchenko
2014-08-01 15:01:41 +04:00
450 changed files with 11579 additions and 13407 deletions

View File

@@ -2981,7 +2981,7 @@ The class provides the following features for all derived classes:
* so called "virtual constructor". That is, each Algorithm derivative is registered at program start and you can get the list of registered algorithms and create instance of a particular algorithm by its name (see ``Algorithm::create``). If you plan to add your own algorithms, it is good practice to add a unique prefix to your algorithms to distinguish them from other algorithms.
* setting/retrieving algorithm parameters by name. If you used video capturing functionality from OpenCV highgui module, you are probably familar with ``cvSetCaptureProperty()``, ``cvGetCaptureProperty()``, ``VideoCapture::set()`` and ``VideoCapture::get()``. ``Algorithm`` provides similar method where instead of integer id's you specify the parameter names as text strings. See ``Algorithm::set`` and ``Algorithm::get`` for details.
* setting/retrieving algorithm parameters by name. If you used video capturing functionality from OpenCV videoio module, you are probably familar with ``cvSetCaptureProperty()``, ``cvGetCaptureProperty()``, ``VideoCapture::set()`` and ``VideoCapture::get()``. ``Algorithm`` provides similar method where instead of integer id's you specify the parameter names as text strings. See ``Algorithm::set`` and ``Algorithm::get`` for details.
* reading and writing parameters from/to XML or YAML files. Every Algorithm derivative can store all its parameters and then read them back. There is no need to re-implement it each time.

View File

@@ -361,6 +361,37 @@ The function ``line`` draws the line segment between ``pt1`` and ``pt2`` points
Antialiased lines are drawn using Gaussian filtering.
arrowedLine
----------------
Draws a arrow segment pointing from the first point to the second one.
.. ocv:function:: void arrowedLine(InputOutputArray img, Point pt1, Point pt2, const Scalar& color, int thickness=1, int lineType=8, int shift=0, double tipLength=0.1)
:param img: Image.
:param pt1: The point the arrow starts from.
:param pt2: The point the arrow points to.
:param color: Line color.
:param thickness: Line thickness.
:param lineType: Type of the line:
* **8** (or omitted) - 8-connected line.
* **4** - 4-connected line.
* **CV_AA** - antialiased line.
:param shift: Number of fractional bits in the point coordinates.
:param tipLength: The length of the arrow tip in relation to the arrow length
The function ``arrowedLine`` draws an arrow between ``pt1`` and ``pt2`` points in the image. See also :ocv:func:`line`.
LineIterator
------------
.. ocv:class:: LineIterator

View File

@@ -14,7 +14,8 @@ OpenCV has a modular structure, which means that the package includes several sh
* **calib3d** - basic multiple-view geometry algorithms, single and stereo camera calibration, object pose estimation, stereo correspondence algorithms, and elements of 3D reconstruction.
* **features2d** - salient feature detectors, descriptors, and descriptor matchers.
* **objdetect** - detection of objects and instances of the predefined classes (for example, faces, eyes, mugs, people, cars, and so on).
* **highgui** - an easy-to-use interface to video capturing, image and video codecs, as well as simple UI capabilities.
* **highgui** - an easy-to-use interface to simple UI capabilities.
* **videoio** - an easy-to-use interface to video capturing and video codecs.
* **gpu** - GPU-accelerated algorithms from different OpenCV modules.
* ... some other helper modules, such as FLANN and Google test wrappers, Python bindings, and others.

View File

@@ -510,6 +510,10 @@ CV_EXPORTS_W void randShuffle(InputOutputArray dst, double iterFactor = 1., RNG*
CV_EXPORTS_W void line(InputOutputArray img, Point pt1, Point pt2, const Scalar& color,
int thickness = 1, int lineType = LINE_8, int shift = 0);
//! draws an arrow from pt1 to pt2 in the image
CV_EXPORTS_W void arrowedLine(InputOutputArray img, Point pt1, Point pt2, const Scalar& color,
int thickness=1, int line_type=8, int shift=0, double tipLength=0.1);
//! draws the rectangle outline or a solid rectangle with the opposite corners pt1 and pt2 in the image
CV_EXPORTS_W void rectangle(InputOutputArray img, Point pt1, Point pt2,
const Scalar& color, int thickness = 1,

View File

@@ -244,6 +244,7 @@ typedef signed char schar;
/* fundamental constants */
#define CV_PI 3.1415926535897932384626433832795
#define CV_2PI 6.283185307179586476925286766559
#define CV_LOG2 0.69314718055994530941723212145818
/****************************************************************************************\

View File

@@ -360,7 +360,7 @@ struct CV_EXPORTS UMatData
{
enum { COPY_ON_MAP=1, HOST_COPY_OBSOLETE=2,
DEVICE_COPY_OBSOLETE=4, TEMP_UMAT=8, TEMP_COPIED_UMAT=24,
USER_ALLOCATED=32 };
USER_ALLOCATED=32, DEVICE_MEM_MAPPED=64};
UMatData(const MatAllocator* allocator);
~UMatData();
@@ -370,11 +370,13 @@ struct CV_EXPORTS UMatData
bool hostCopyObsolete() const;
bool deviceCopyObsolete() const;
bool deviceMemMapped() const;
bool copyOnMap() const;
bool tempUMat() const;
bool tempCopiedUMat() const;
void markHostCopyObsolete(bool flag);
void markDeviceCopyObsolete(bool flag);
void markDeviceMemMapped(bool flag);
const MatAllocator* prevAllocator;
const MatAllocator* currAllocator;

View File

@@ -3351,10 +3351,19 @@ size_t UMat::total() const
inline bool UMatData::hostCopyObsolete() const { return (flags & HOST_COPY_OBSOLETE) != 0; }
inline bool UMatData::deviceCopyObsolete() const { return (flags & DEVICE_COPY_OBSOLETE) != 0; }
inline bool UMatData::deviceMemMapped() const { return (flags & DEVICE_MEM_MAPPED) != 0; }
inline bool UMatData::copyOnMap() const { return (flags & COPY_ON_MAP) != 0; }
inline bool UMatData::tempUMat() const { return (flags & TEMP_UMAT) != 0; }
inline bool UMatData::tempCopiedUMat() const { return (flags & TEMP_COPIED_UMAT) == TEMP_COPIED_UMAT; }
inline void UMatData::markDeviceMemMapped(bool flag)
{
if(flag)
flags |= DEVICE_MEM_MAPPED;
else
flags &= ~DEVICE_MEM_MAPPED;
}
inline void UMatData::markHostCopyObsolete(bool flag)
{
if(flag)

View File

@@ -54,23 +54,42 @@ namespace ocl {
///////////// dft ////////////////////////
typedef tuple<Size, int> DftParams;
enum OCL_FFT_TYPE
{
R2R = 0,
C2R = 1,
R2C = 2,
C2C = 3
};
typedef tuple<OCL_FFT_TYPE, Size, int> DftParams;
typedef TestBaseWithParam<DftParams> DftFixture;
OCL_PERF_TEST_P(DftFixture, Dft, ::testing::Combine(Values(OCL_SIZE_1, OCL_SIZE_2, OCL_SIZE_3),
Values((int)DFT_ROWS, (int)DFT_SCALE, (int)DFT_INVERSE,
(int)DFT_INVERSE | DFT_SCALE, (int)DFT_ROWS | DFT_INVERSE)))
OCL_PERF_TEST_P(DftFixture, Dft, ::testing::Combine(Values(C2C, R2R, C2R, R2C),
Values(OCL_SIZE_1, OCL_SIZE_2, OCL_SIZE_3, Size(512, 512), Size(1024, 1024), Size(2048, 2048)),
Values((int) 0, (int)DFT_ROWS, (int)DFT_SCALE, (int)DFT_INVERSE,
(int)DFT_INVERSE | DFT_SCALE, (int)DFT_ROWS | DFT_INVERSE)))
{
const DftParams params = GetParam();
const Size srcSize = get<0>(params);
const int flags = get<1>(params);
const int dft_type = get<0>(params);
const Size srcSize = get<1>(params);
int flags = get<2>(params);
UMat src(srcSize, CV_32FC2), dst(srcSize, CV_32FC2);
int in_cn, out_cn;
switch (dft_type)
{
case R2R: flags |= cv::DFT_REAL_OUTPUT; in_cn = 1; out_cn = 1; break;
case C2R: flags |= cv::DFT_REAL_OUTPUT; in_cn = 2; out_cn = 2; break;
case R2C: flags |= cv::DFT_COMPLEX_OUTPUT; in_cn = 1; out_cn = 2; break;
case C2C: flags |= cv::DFT_COMPLEX_OUTPUT; in_cn = 2; out_cn = 2; break;
}
UMat src(srcSize, CV_MAKE_TYPE(CV_32F, in_cn)), dst(srcSize, CV_MAKE_TYPE(CV_32F, out_cn));
declare.in(src, WARMUP_RNG).out(dst);
OCL_TEST_CYCLE() cv::dft(src, dst, flags | DFT_COMPLEX_OUTPUT);
OCL_TEST_CYCLE() cv::dft(src, dst, flags);
SANITY_CHECK(dst, 1e-3);
SANITY_CHECK(dst, 1e-5, ERROR_RELATIVE);
}
///////////// MulSpectrums ////////////////////////

View File

@@ -139,6 +139,7 @@ OCL_PERF_TEST_P(CopyToFixture, CopyToWithMaskUninit,
dst.release();
startTimer();
src.copyTo(dst, mask);
cv::ocl::finish();
stopTimer();
}

View File

@@ -54,21 +54,23 @@ namespace cv
struct NOP {};
#if CV_SSE2
#if CV_SSE2 || CV_NEON
#define FUNCTOR_TEMPLATE(name) \
template<typename T> struct name {}
FUNCTOR_TEMPLATE(VLoadStore128);
#if CV_SSE2
FUNCTOR_TEMPLATE(VLoadStore64);
FUNCTOR_TEMPLATE(VLoadStore128Aligned);
#endif
#endif
template<typename T, class Op, class VOp>
void vBinOp(const T* src1, size_t step1, const T* src2, size_t step2, T* dst, size_t step, Size sz)
{
#if CV_SSE2
#if CV_SSE2 || CV_NEON
VOp vop;
#endif
Op op;
@@ -79,9 +81,11 @@ void vBinOp(const T* src1, size_t step1, const T* src2, size_t step2, T* dst, si
{
int x = 0;
#if CV_NEON || CV_SSE2
#if CV_SSE2
if( USE_SSE2 )
{
#endif
for( ; x <= sz.width - 32/(int)sizeof(T); x += 32/sizeof(T) )
{
typename VLoadStore128<T>::reg_type r0 = VLoadStore128<T>::load(src1 + x );
@@ -91,8 +95,10 @@ void vBinOp(const T* src1, size_t step1, const T* src2, size_t step2, T* dst, si
VLoadStore128<T>::store(dst + x , r0);
VLoadStore128<T>::store(dst + x + 16/sizeof(T), r1);
}
#if CV_SSE2
}
#endif
#endif
#if CV_SSE2
if( USE_SSE2 )
{
@@ -125,7 +131,7 @@ template<typename T, class Op, class Op32>
void vBinOp32(const T* src1, size_t step1, const T* src2, size_t step2,
T* dst, size_t step, Size sz)
{
#if CV_SSE2
#if CV_SSE2 || CV_NEON
Op32 op32;
#endif
Op op;
@@ -153,9 +159,11 @@ void vBinOp32(const T* src1, size_t step1, const T* src2, size_t step2,
}
}
#endif
#if CV_NEON || CV_SSE2
#if CV_SSE2
if( USE_SSE2 )
{
#endif
for( ; x <= sz.width - 8; x += 8 )
{
typename VLoadStore128<T>::reg_type r0 = VLoadStore128<T>::load(src1 + x );
@@ -165,8 +173,10 @@ void vBinOp32(const T* src1, size_t step1, const T* src2, size_t step2,
VLoadStore128<T>::store(dst + x , r0);
VLoadStore128<T>::store(dst + x + 4, r1);
}
#if CV_SSE2
}
#endif
#endif
#if CV_ENABLE_UNROLLED
for( ; x <= sz.width - 4; x += 4 )
{
@@ -383,7 +393,98 @@ FUNCTOR_TEMPLATE(VNot);
FUNCTOR_CLOSURE_1arg(VNot, uchar, return _mm_xor_si128(_mm_set1_epi32(-1), a));
#endif
#if CV_SSE2
#if CV_NEON
#define FUNCTOR_LOADSTORE(name, template_arg, register_type, load_body, store_body)\
template <> \
struct name<template_arg>{ \
typedef register_type reg_type; \
static reg_type load(const template_arg * p) { return load_body (p);}; \
static void store(template_arg * p, reg_type v) { store_body (p, v);}; \
}
#define FUNCTOR_CLOSURE_2arg(name, template_arg, body)\
template<> \
struct name<template_arg> \
{ \
VLoadStore128<template_arg>::reg_type operator()( \
VLoadStore128<template_arg>::reg_type a, \
VLoadStore128<template_arg>::reg_type b) const \
{ \
return body; \
}; \
}
#define FUNCTOR_CLOSURE_1arg(name, template_arg, body)\
template<> \
struct name<template_arg> \
{ \
VLoadStore128<template_arg>::reg_type operator()( \
VLoadStore128<template_arg>::reg_type a, \
VLoadStore128<template_arg>::reg_type ) const \
{ \
return body; \
}; \
}
FUNCTOR_LOADSTORE(VLoadStore128, uchar, uint8x16_t, vld1q_u8 , vst1q_u8 );
FUNCTOR_LOADSTORE(VLoadStore128, schar, int8x16_t, vld1q_s8 , vst1q_s8 );
FUNCTOR_LOADSTORE(VLoadStore128, ushort, uint16x8_t, vld1q_u16, vst1q_u16);
FUNCTOR_LOADSTORE(VLoadStore128, short, int16x8_t, vld1q_s16, vst1q_s16);
FUNCTOR_LOADSTORE(VLoadStore128, int, int32x4_t, vld1q_s32, vst1q_s32);
FUNCTOR_LOADSTORE(VLoadStore128, float, float32x4_t, vld1q_f32, vst1q_f32);
FUNCTOR_TEMPLATE(VAdd);
FUNCTOR_CLOSURE_2arg(VAdd, uchar, vqaddq_u8 (a, b));
FUNCTOR_CLOSURE_2arg(VAdd, schar, vqaddq_s8 (a, b));
FUNCTOR_CLOSURE_2arg(VAdd, ushort, vqaddq_u16(a, b));
FUNCTOR_CLOSURE_2arg(VAdd, short, vqaddq_s16(a, b));
FUNCTOR_CLOSURE_2arg(VAdd, int, vaddq_s32 (a, b));
FUNCTOR_CLOSURE_2arg(VAdd, float, vaddq_f32 (a, b));
FUNCTOR_TEMPLATE(VSub);
FUNCTOR_CLOSURE_2arg(VSub, uchar, vqsubq_u8 (a, b));
FUNCTOR_CLOSURE_2arg(VSub, schar, vqsubq_s8 (a, b));
FUNCTOR_CLOSURE_2arg(VSub, ushort, vqsubq_u16(a, b));
FUNCTOR_CLOSURE_2arg(VSub, short, vqsubq_s16(a, b));
FUNCTOR_CLOSURE_2arg(VSub, int, vsubq_s32 (a, b));
FUNCTOR_CLOSURE_2arg(VSub, float, vsubq_f32 (a, b));
FUNCTOR_TEMPLATE(VMin);
FUNCTOR_CLOSURE_2arg(VMin, uchar, vminq_u8 (a, b));
FUNCTOR_CLOSURE_2arg(VMin, schar, vminq_s8 (a, b));
FUNCTOR_CLOSURE_2arg(VMin, ushort, vminq_u16(a, b));
FUNCTOR_CLOSURE_2arg(VMin, short, vminq_s16(a, b));
FUNCTOR_CLOSURE_2arg(VMin, int, vminq_s32(a, b));
FUNCTOR_CLOSURE_2arg(VMin, float, vminq_f32(a, b));
FUNCTOR_TEMPLATE(VMax);
FUNCTOR_CLOSURE_2arg(VMax, uchar, vmaxq_u8 (a, b));
FUNCTOR_CLOSURE_2arg(VMax, schar, vmaxq_s8 (a, b));
FUNCTOR_CLOSURE_2arg(VMax, ushort, vmaxq_u16(a, b));
FUNCTOR_CLOSURE_2arg(VMax, short, vmaxq_s16(a, b));
FUNCTOR_CLOSURE_2arg(VMax, int, vmaxq_s32(a, b));
FUNCTOR_CLOSURE_2arg(VMax, float, vmaxq_f32(a, b));
FUNCTOR_TEMPLATE(VAbsDiff);
FUNCTOR_CLOSURE_2arg(VAbsDiff, uchar, vabdq_u8 (a, b));
FUNCTOR_CLOSURE_2arg(VAbsDiff, schar, vqabsq_s8 (vqsubq_s8(a, b)));
FUNCTOR_CLOSURE_2arg(VAbsDiff, ushort, vabdq_u16 (a, b));
FUNCTOR_CLOSURE_2arg(VAbsDiff, short, vqabsq_s16(vqsubq_s16(a, b)));
FUNCTOR_CLOSURE_2arg(VAbsDiff, int, vabdq_s32 (a, b));
FUNCTOR_CLOSURE_2arg(VAbsDiff, float, vabdq_f32 (a, b));
FUNCTOR_TEMPLATE(VAnd);
FUNCTOR_CLOSURE_2arg(VAnd, uchar, vandq_u8(a, b));
FUNCTOR_TEMPLATE(VOr);
FUNCTOR_CLOSURE_2arg(VOr , uchar, vorrq_u8(a, b));
FUNCTOR_TEMPLATE(VXor);
FUNCTOR_CLOSURE_2arg(VXor, uchar, veorq_u8(a, b));
FUNCTOR_TEMPLATE(VNot);
FUNCTOR_CLOSURE_1arg(VNot, uchar, vmvnq_u8(a ));
#endif
#if CV_SSE2 || CV_NEON
#define IF_SIMD(op) op
#else
#define IF_SIMD(op) NOP
@@ -1390,6 +1491,9 @@ static bool ocl_arithm_op(InputArray _src1, InputArray _src2, OutputArray _dst,
if (!doubleSupport && (depth2 == CV_64F || depth1 == CV_64F))
return false;
if( (oclop == OCL_OP_MUL_SCALE || oclop == OCL_OP_DIV_SCALE) && (depth1 >= CV_32F || depth2 >= CV_32F || ddepth >= CV_32F) )
return false;
int kercn = haveMask || haveScalar ? cn : ocl::predictOptimalVectorWidth(_src1, _src2, _dst);
int scalarcn = kercn == 3 ? 4 : kercn, rowsPerWI = d.isIntel() ? 4 : 1;
@@ -2980,8 +3084,187 @@ void cv::compare(InputArray _src1, InputArray _src2, OutputArray _dst, int op)
namespace cv
{
template<typename T> static void
inRange_(const T* src1, size_t step1, const T* src2, size_t step2,
template <typename T>
struct InRange_SSE
{
int operator () (const T *, const T *, const T *, uchar *, int) const
{
return 0;
}
};
#if CV_SSE2
template <>
struct InRange_SSE<uchar>
{
int operator () (const uchar * src1, const uchar * src2, const uchar * src3,
uchar * dst, int len) const
{
int x = 0;
if (USE_SSE2)
{
__m128i v_full = _mm_set1_epi8(-1), v_128 = _mm_set1_epi8(-128);
for ( ; x <= len - 16; x += 16 )
{
__m128i v_src = _mm_add_epi8(_mm_loadu_si128((const __m128i *)(src1 + x)), v_128);
__m128i v_mask1 = _mm_cmpgt_epi8(_mm_add_epi8(_mm_loadu_si128((const __m128i *)(src2 + x)), v_128), v_src);
__m128i v_mask2 = _mm_cmpgt_epi8(v_src, _mm_add_epi8(_mm_loadu_si128((const __m128i *)(src3 + x)), v_128));
_mm_storeu_si128((__m128i *)(dst + x), _mm_andnot_si128(_mm_or_si128(v_mask1, v_mask2), v_full));
}
}
return x;
}
};
template <>
struct InRange_SSE<schar>
{
int operator () (const schar * src1, const schar * src2, const schar * src3,
uchar * dst, int len) const
{
int x = 0;
if (USE_SSE2)
{
__m128i v_full = _mm_set1_epi8(-1);
for ( ; x <= len - 16; x += 16 )
{
__m128i v_src = _mm_loadu_si128((const __m128i *)(src1 + x));
__m128i v_mask1 = _mm_cmpgt_epi8(_mm_loadu_si128((const __m128i *)(src2 + x)), v_src);
__m128i v_mask2 = _mm_cmpgt_epi8(v_src, _mm_loadu_si128((const __m128i *)(src3 + x)));
_mm_storeu_si128((__m128i *)(dst + x), _mm_andnot_si128(_mm_or_si128(v_mask1, v_mask2), v_full));
}
}
return x;
}
};
template <>
struct InRange_SSE<ushort>
{
int operator () (const ushort * src1, const ushort * src2, const ushort * src3,
uchar * dst, int len) const
{
int x = 0;
if (USE_SSE2)
{
__m128i v_zero = _mm_setzero_si128(), v_full = _mm_set1_epi16(-1), v_32768 = _mm_set1_epi16(-32768);
for ( ; x <= len - 8; x += 8 )
{
__m128i v_src = _mm_add_epi16(_mm_loadu_si128((const __m128i *)(src1 + x)), v_32768);
__m128i v_mask1 = _mm_cmpgt_epi16(_mm_add_epi16(_mm_loadu_si128((const __m128i *)(src2 + x)), v_32768), v_src);
__m128i v_mask2 = _mm_cmpgt_epi16(v_src, _mm_add_epi16(_mm_loadu_si128((const __m128i *)(src3 + x)), v_32768));
__m128i v_res = _mm_andnot_si128(_mm_or_si128(v_mask1, v_mask2), v_full);
_mm_storel_epi64((__m128i *)(dst + x), _mm_packus_epi16(_mm_srli_epi16(v_res, 8), v_zero));
}
}
return x;
}
};
template <>
struct InRange_SSE<short>
{
int operator () (const short * src1, const short * src2, const short * src3,
uchar * dst, int len) const
{
int x = 0;
if (USE_SSE2)
{
__m128i v_zero = _mm_setzero_si128(), v_full = _mm_set1_epi16(-1);
for ( ; x <= len - 8; x += 8 )
{
__m128i v_src = _mm_loadu_si128((const __m128i *)(src1 + x));
__m128i v_mask1 = _mm_cmpgt_epi16(_mm_loadu_si128((const __m128i *)(src2 + x)), v_src);
__m128i v_mask2 = _mm_cmpgt_epi16(v_src, _mm_loadu_si128((const __m128i *)(src3 + x)));
__m128i v_res = _mm_andnot_si128(_mm_or_si128(v_mask1, v_mask2), v_full);
_mm_storel_epi64((__m128i *)(dst + x), _mm_packus_epi16(_mm_srli_epi16(v_res, 8), v_zero));
}
}
return x;
}
};
template <>
struct InRange_SSE<int>
{
int operator () (const int * src1, const int * src2, const int * src3,
uchar * dst, int len) const
{
int x = 0;
if (USE_SSE2)
{
__m128i v_zero = _mm_setzero_si128(), v_full = _mm_set1_epi32(-1);
for ( ; x <= len - 8; x += 8 )
{
__m128i v_src = _mm_loadu_si128((const __m128i *)(src1 + x));
__m128i v_res1 = _mm_or_si128(_mm_cmpgt_epi32(_mm_loadu_si128((const __m128i *)(src2 + x)), v_src),
_mm_cmpgt_epi32(v_src, _mm_loadu_si128((const __m128i *)(src3 + x))));
v_src = _mm_loadu_si128((const __m128i *)(src1 + x + 4));
__m128i v_res2 = _mm_or_si128(_mm_cmpgt_epi32(_mm_loadu_si128((const __m128i *)(src2 + x + 4)), v_src),
_mm_cmpgt_epi32(v_src, _mm_loadu_si128((const __m128i *)(src3 + x + 4))));
__m128i v_res = _mm_packs_epi32(_mm_srli_epi32(_mm_andnot_si128(v_res1, v_full), 16),
_mm_srli_epi32(_mm_andnot_si128(v_res2, v_full), 16));
_mm_storel_epi64((__m128i *)(dst + x), _mm_packus_epi16(v_res, v_zero));
}
}
return x;
}
};
template <>
struct InRange_SSE<float>
{
int operator () (const float * src1, const float * src2, const float * src3,
uchar * dst, int len) const
{
int x = 0;
if (USE_SSE2)
{
__m128i v_zero = _mm_setzero_si128();
for ( ; x <= len - 8; x += 8 )
{
__m128 v_src = _mm_loadu_ps(src1 + x);
__m128 v_res1 = _mm_and_ps(_mm_cmple_ps(_mm_loadu_ps(src2 + x), v_src),
_mm_cmple_ps(v_src, _mm_loadu_ps(src3 + x)));
v_src = _mm_loadu_ps(src1 + x + 4);
__m128 v_res2 = _mm_and_ps(_mm_cmple_ps(_mm_loadu_ps(src2 + x + 4), v_src),
_mm_cmple_ps(v_src, _mm_loadu_ps(src3 + x + 4)));
__m128i v_res1i = _mm_cvtps_epi32(v_res1), v_res2i = _mm_cvtps_epi32(v_res2);
__m128i v_res = _mm_packs_epi32(_mm_srli_epi32(v_res1i, 16), _mm_srli_epi32(v_res2i, 16));
_mm_storel_epi64((__m128i *)(dst + x), _mm_packus_epi16(v_res, v_zero));
}
}
return x;
}
};
#endif
template <typename T>
static void inRange_(const T* src1, size_t step1, const T* src2, size_t step2,
const T* src3, size_t step3, uchar* dst, size_t step,
Size size)
{
@@ -2989,9 +3272,11 @@ inRange_(const T* src1, size_t step1, const T* src2, size_t step2,
step2 /= sizeof(src2[0]);
step3 /= sizeof(src3[0]);
InRange_SSE<T> vop;
for( ; size.height--; src1 += step1, src2 += step2, src3 += step3, dst += step )
{
int x = 0;
int x = vop(src1, src2, src3, dst, size.width);
#if CV_ENABLE_UNROLLED
for( ; x <= size.width - 4; x += 4 )
{

View File

@@ -1541,7 +1541,7 @@ static bool ocl_convertScaleAbs( InputArray _src, OutputArray _dst, double alpha
kercn = ocl::predictOptimalVectorWidth(_src, _dst), rowsPerWI = d.isIntel() ? 4 : 1;
bool doubleSupport = d.doubleFPConfig() > 0;
if (!doubleSupport && depth == CV_64F)
if (depth == CV_32F || depth == CV_64F)
return false;
char cvt[2][50];
@@ -1729,22 +1729,18 @@ static bool ocl_LUT(InputArray _src, InputArray _lut, OutputArray _dst)
UMat src = _src.getUMat(), lut = _lut.getUMat();
_dst.create(src.size(), CV_MAKETYPE(ddepth, dcn));
UMat dst = _dst.getUMat();
bool bAligned = (1 == lcn) && (0 == (src.offset % 4)) && (0 == ((dcn * src.cols) % 4));
// dst.cols == src.cols by params of dst.create
int kercn = lcn == 1 ? std::min(4, ocl::predictOptimalVectorWidth(_dst)) : dcn;
ocl::Kernel k("LUT", ocl::core::lut_oclsrc,
format("-D dcn=%d -D lcn=%d -D srcT=%s -D dstT=%s", bAligned ? 4 : dcn, lcn,
ocl::typeToStr(src.depth()), ocl::memopTypeToStr(ddepth)
));
format("-D dcn=%d -D lcn=%d -D srcT=%s -D dstT=%s", kercn, lcn,
ocl::typeToStr(src.depth()), ocl::memopTypeToStr(ddepth)));
if (k.empty())
return false;
int cols = bAligned ? dcn * dst.cols / 4 : dst.cols;
k.args(ocl::KernelArg::ReadOnlyNoSize(src), ocl::KernelArg::ReadOnlyNoSize(lut),
ocl::KernelArg::WriteOnlyNoSize(dst), dst.rows, cols);
ocl::KernelArg::WriteOnly(dst, dcn, kercn));
size_t globalSize[2] = { cols, (dst.rows + 3) / 4 };
size_t globalSize[2] = { dst.cols * dcn / kercn, (dst.rows + 3) / 4 };
return k.run(2, globalSize, NULL, false);
}

View File

@@ -432,7 +432,7 @@ Mat& Mat::setTo(InputArray _value, InputArray _mask)
IppStatus status = (IppStatus)-1;
IppiSize roisize = { cols, rows };
int mstep = (int)mask.step, dstep = (int)step;
int mstep = (int)mask.step[0], dstep = (int)step[0];
if (isContinuous() && mask.isContinuous())
{
@@ -616,7 +616,7 @@ static bool ocl_flip(InputArray _src, OutputArray _dst, int flipCode )
{
CV_Assert(flipCode >= -1 && flipCode <= 1);
int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type),
flipType, kercn = std::min(ocl::predictOptimalVectorWidth(_src, _dst), 4);;
flipType, kercn = std::min(ocl::predictOptimalVectorWidth(_src, _dst), 4);
if (cn > 4)
return false;
@@ -631,7 +631,7 @@ static bool ocl_flip(InputArray _src, OutputArray _dst, int flipCode )
ocl::Device dev = ocl::Device::getDefault();
int pxPerWIy = (dev.isIntel() && (dev.type() & ocl::Device::TYPE_GPU)) ? 4 : 1;
kercn = std::max(kercn, cn);
kercn = (cn!=3 || flipType == FLIP_ROWS) ? std::max(kercn, cn) : cn;
ocl::Kernel k(kernelName, ocl::core::flip_oclsrc,
format( "-D T=%s -D T1=%s -D cn=%d -D PIX_PER_WI_Y=%d -D kercn=%d",
@@ -762,7 +762,7 @@ void flip( InputArray _src, OutputArray _dst, int flip_mode )
flipHoriz( dst.data, dst.step, dst.data, dst.step, dst.size(), esz );
}
#ifdef HAVE_OPENCL
/*#ifdef HAVE_OPENCL
static bool ocl_repeat(InputArray _src, int ny, int nx, OutputArray _dst)
{
@@ -790,7 +790,7 @@ static bool ocl_repeat(InputArray _src, int ny, int nx, OutputArray _dst)
return k.run(2, globalsize, NULL, false);
}
#endif
#endif*/
void repeat(InputArray _src, int ny, int nx, OutputArray _dst)
{
@@ -800,8 +800,8 @@ void repeat(InputArray _src, int ny, int nx, OutputArray _dst)
Size ssize = _src.size();
_dst.create(ssize.height*ny, ssize.width*nx, _src.type());
CV_OCL_RUN(_dst.isUMat(),
ocl_repeat(_src, ny, nx, _dst))
/*CV_OCL_RUN(_dst.isUMat(),
ocl_repeat(_src, ny, nx, _dst))*/
Mat src = _src.getMat(), dst = _dst.getMat();
Size dsize = dst.size();

View File

@@ -207,7 +207,6 @@ namespace
MemoryStack* MemoryPool::getFreeMemStack()
{
AutoLock lock(mtx_);
if (!initialized_)
initilizeImpl();
@@ -256,22 +255,31 @@ namespace
namespace
{
Mutex mtx_;
bool memory_pool_manager_initialized;
class MemoryPoolManager
{
public:
MemoryPoolManager();
~MemoryPoolManager();
void Init();
MemoryPool* getPool(int deviceId);
private:
std::vector<MemoryPool> pools_;
};
} manager;
//MemoryPoolManager ;
MemoryPoolManager::MemoryPoolManager()
{
int deviceCount = getCudaEnabledDeviceCount();
}
void MemoryPoolManager::Init()
{
int deviceCount = getCudaEnabledDeviceCount();
if (deviceCount > 0)
pools_.resize(deviceCount);
}
@@ -280,7 +288,7 @@ namespace
{
for (size_t i = 0; i < pools_.size(); ++i)
{
cudaSetDevice(i);
cudaSetDevice(static_cast<int>(i));
pools_[i].release();
}
}
@@ -293,7 +301,14 @@ namespace
MemoryPool* memPool(int deviceId)
{
static MemoryPoolManager manager;
{
AutoLock lock(mtx_);
if (!memory_pool_manager_initialized)
{
memory_pool_manager_initialized = true;
manager.Init();
}
}
return manager.getPool(deviceId);
}
}
@@ -311,8 +326,10 @@ cv::cuda::StackAllocator::StackAllocator(cudaStream_t stream) : stream_(stream),
if (enableMemoryPool)
{
const int deviceId = getDevice();
memStack_ = memPool(deviceId)->getFreeMemStack();
{
AutoLock lock(mtx_);
memStack_ = memPool(deviceId)->getFreeMemStack();
}
DeviceInfo devInfo(deviceId);
alignment_ = devInfo.textureAlignment();
}

View File

@@ -190,10 +190,22 @@ void cv::cuda::Stream::enqueueHostCallback(StreamCallback callback, void* userDa
#endif
}
namespace
{
bool default_stream_is_initialized;
Mutex mtx;
Ptr<Stream> default_stream;
}
Stream& cv::cuda::Stream::Null()
{
static Stream s(Ptr<Impl>(new Impl(0)));
return s;
AutoLock lock(mtx);
if (!default_stream_is_initialized)
{
default_stream = Ptr<Stream>(new Stream(Ptr<Impl>(new Impl(0))));
default_stream_is_initialized = true;
}
return *default_stream;
}
cv::cuda::Stream::operator bool_type() const

View File

@@ -1584,6 +1584,24 @@ void line( InputOutputArray _img, Point pt1, Point pt2, const Scalar& color,
ThickLine( img, pt1, pt2, buf, thickness, line_type, 3, shift );
}
void arrowedLine(InputOutputArray img, Point pt1, Point pt2, const Scalar& color,
int thickness, int line_type, int shift, double tipLength)
{
const double tipSize = norm(pt1-pt2)*tipLength; // Factor to normalize the size of the tip depending on the length of the arrow
line(img, pt1, pt2, color, thickness, line_type, shift);
const double angle = atan2( (double) pt1.y - pt2.y, (double) pt1.x - pt2.x );
Point p(cvRound(pt2.x + tipSize * cos(angle + CV_PI / 4)),
cvRound(pt2.y + tipSize * sin(angle + CV_PI / 4)));
line(img, p, pt2, color, thickness, line_type, shift);
p.x = cvRound(pt2.x + tipSize * cos(angle - CV_PI / 4));
p.y = cvRound(pt2.y + tipSize * sin(angle - CV_PI / 4));
line(img, p, pt2, color, thickness, line_type, shift);
}
void rectangle( InputOutputArray _img, Point pt1, Point pt2,
const Scalar& color, int thickness,
int lineType, int shift )

View File

@@ -43,6 +43,7 @@
#include "opencv2/core/opencl/runtime/opencl_clamdfft.hpp"
#include "opencv2/core/opencl/runtime/opencl_core.hpp"
#include "opencl_kernels.hpp"
#include <map>
namespace cv
{
@@ -1781,6 +1782,375 @@ static bool ippi_DFT_R_32F(const Mat& src, Mat& dst, bool inv, int norm_flag)
#endif
}
#ifdef HAVE_OPENCL
namespace cv
{
enum FftType
{
R2R = 0, // real to CCS in case forward transform, CCS to real otherwise
C2R = 1, // complex to real in case inverse transform
R2C = 2, // real to complex in case forward transform
C2C = 3 // complex to complex
};
struct OCL_FftPlan
{
private:
UMat twiddles;
String buildOptions;
int thread_count;
bool status;
int dft_size;
public:
OCL_FftPlan(int _size): dft_size(_size), status(true)
{
int min_radix;
std::vector<int> radixes, blocks;
ocl_getRadixes(dft_size, radixes, blocks, min_radix);
thread_count = dft_size / min_radix;
if (thread_count > (int) ocl::Device::getDefault().maxWorkGroupSize())
{
status = false;
return;
}
// generate string with radix calls
String radix_processing;
int n = 1, twiddle_size = 0;
for (size_t i=0; i<radixes.size(); i++)
{
int radix = radixes[i], block = blocks[i];
if (block > 1)
radix_processing += format("fft_radix%d_B%d(smem,twiddles+%d,ind,%d,%d);", radix, block, twiddle_size, n, dft_size/radix);
else
radix_processing += format("fft_radix%d(smem,twiddles+%d,ind,%d,%d);", radix, twiddle_size, n, dft_size/radix);
twiddle_size += (radix-1)*n;
n *= radix;
}
Mat tw(1, twiddle_size, CV_32FC2);
float* ptr = tw.ptr<float>();
int ptr_index = 0;
n = 1;
for (size_t i=0; i<radixes.size(); i++)
{
int radix = radixes[i];
n *= radix;
for (int j=1; j<radix; j++)
{
double theta = -CV_2PI*j/n;
for (int k=0; k<(n/radix); k++)
{
ptr[ptr_index++] = (float) cos(k*theta);
ptr[ptr_index++] = (float) sin(k*theta);
}
}
}
twiddles = tw.getUMat(ACCESS_READ);
buildOptions = format("-D LOCAL_SIZE=%d -D kercn=%d -D RADIX_PROCESS=%s",
dft_size, min_radix, radix_processing.c_str());
}
bool enqueueTransform(InputArray _src, OutputArray _dst, int num_dfts, int flags, int fftType, bool rows = true) const
{
if (!status)
return false;
UMat src = _src.getUMat();
UMat dst = _dst.getUMat();
size_t globalsize[2];
size_t localsize[2];
String kernel_name;
bool is1d = (flags & DFT_ROWS) != 0 || num_dfts == 1;
bool inv = (flags & DFT_INVERSE) != 0;
String options = buildOptions;
if (rows)
{
globalsize[0] = thread_count; globalsize[1] = src.rows;
localsize[0] = thread_count; localsize[1] = 1;
kernel_name = !inv ? "fft_multi_radix_rows" : "ifft_multi_radix_rows";
if ((is1d || inv) && (flags & DFT_SCALE))
options += " -D DFT_SCALE";
}
else
{
globalsize[0] = num_dfts; globalsize[1] = thread_count;
localsize[0] = 1; localsize[1] = thread_count;
kernel_name = !inv ? "fft_multi_radix_cols" : "ifft_multi_radix_cols";
if (flags & DFT_SCALE)
options += " -D DFT_SCALE";
}
options += src.channels() == 1 ? " -D REAL_INPUT" : " -D COMPLEX_INPUT";
options += dst.channels() == 1 ? " -D REAL_OUTPUT" : " -D COMPLEX_OUTPUT";
options += is1d ? " -D IS_1D" : "";
if (!inv)
{
if ((is1d && src.channels() == 1) || (rows && (fftType == R2R)))
options += " -D NO_CONJUGATE";
}
else
{
if (rows && (fftType == C2R || fftType == R2R))
options += " -D NO_CONJUGATE";
if (dst.cols % 2 == 0)
options += " -D EVEN";
}
ocl::Kernel k(kernel_name.c_str(), ocl::core::fft_oclsrc, options);
if (k.empty())
return false;
k.args(ocl::KernelArg::ReadOnly(src), ocl::KernelArg::WriteOnly(dst), ocl::KernelArg::PtrReadOnly(twiddles), thread_count, num_dfts);
return k.run(2, globalsize, localsize, false);
}
private:
static void ocl_getRadixes(int cols, std::vector<int>& radixes, std::vector<int>& blocks, int& min_radix)
{
int factors[34];
int nf = DFTFactorize(cols, factors);
int n = 1;
int factor_index = 0;
min_radix = INT_MAX;
// 2^n transforms
if ((factors[factor_index] & 1) == 0)
{
for( ; n < factors[factor_index];)
{
int radix = 2, block = 1;
if (8*n <= factors[0])
radix = 8;
else if (4*n <= factors[0])
{
radix = 4;
if (cols % 12 == 0)
block = 3;
else if (cols % 8 == 0)
block = 2;
}
else
{
if (cols % 10 == 0)
block = 5;
else if (cols % 8 == 0)
block = 4;
else if (cols % 6 == 0)
block = 3;
else if (cols % 4 == 0)
block = 2;
}
radixes.push_back(radix);
blocks.push_back(block);
min_radix = min(min_radix, block*radix);
n *= radix;
}
factor_index++;
}
// all the other transforms
for( ; factor_index < nf; factor_index++)
{
int radix = factors[factor_index], block = 1;
if (radix == 3)
{
if (cols % 12 == 0)
block = 4;
else if (cols % 9 == 0)
block = 3;
else if (cols % 6 == 0)
block = 2;
}
else if (radix == 5)
{
if (cols % 10 == 0)
block = 2;
}
radixes.push_back(radix);
blocks.push_back(block);
min_radix = min(min_radix, block*radix);
}
}
};
class OCL_FftPlanCache
{
public:
static OCL_FftPlanCache & getInstance()
{
static OCL_FftPlanCache planCache;
return planCache;
}
Ptr<OCL_FftPlan> getFftPlan(int dft_size)
{
std::map<int, Ptr<OCL_FftPlan> >::iterator f = planStorage.find(dft_size);
if (f != planStorage.end())
{
return f->second;
}
else
{
Ptr<OCL_FftPlan> newPlan = Ptr<OCL_FftPlan>(new OCL_FftPlan(dft_size));
planStorage[dft_size] = newPlan;
return newPlan;
}
}
~OCL_FftPlanCache()
{
planStorage.clear();
}
protected:
OCL_FftPlanCache() :
planStorage()
{
}
std::map<int, Ptr<OCL_FftPlan> > planStorage;
};
static bool ocl_dft_rows(InputArray _src, OutputArray _dst, int nonzero_rows, int flags, int fftType)
{
Ptr<OCL_FftPlan> plan = OCL_FftPlanCache::getInstance().getFftPlan(_src.cols());
return plan->enqueueTransform(_src, _dst, nonzero_rows, flags, fftType, true);
}
static bool ocl_dft_cols(InputArray _src, OutputArray _dst, int nonzero_cols, int flags, int fftType)
{
Ptr<OCL_FftPlan> plan = OCL_FftPlanCache::getInstance().getFftPlan(_src.rows());
return plan->enqueueTransform(_src, _dst, nonzero_cols, flags, fftType, false);
}
static bool ocl_dft(InputArray _src, OutputArray _dst, int flags, int nonzero_rows)
{
int type = _src.type(), cn = CV_MAT_CN(type);
Size ssize = _src.size();
if ( !(type == CV_32FC1 || type == CV_32FC2) )
return false;
// if is not a multiplication of prime numbers { 2, 3, 5 }
if (ssize.area() != getOptimalDFTSize(ssize.area()))
return false;
UMat src = _src.getUMat();
int complex_input = cn == 2 ? 1 : 0;
int complex_output = (flags & DFT_COMPLEX_OUTPUT) != 0;
int real_input = cn == 1 ? 1 : 0;
int real_output = (flags & DFT_REAL_OUTPUT) != 0;
bool inv = (flags & DFT_INVERSE) != 0 ? 1 : 0;
if( nonzero_rows <= 0 || nonzero_rows > _src.rows() )
nonzero_rows = _src.rows();
bool is1d = (flags & DFT_ROWS) != 0 || nonzero_rows == 1;
// if output format is not specified
if (complex_output + real_output == 0)
{
if (real_input)
real_output = 1;
else
complex_output = 1;
}
FftType fftType = (FftType)(complex_input << 0 | complex_output << 1);
// Forward Complex to CCS not supported
if (fftType == C2R && !inv)
fftType = C2C;
// Inverse CCS to Complex not supported
if (fftType == R2C && inv)
fftType = R2R;
UMat output;
if (fftType == C2C || fftType == R2C)
{
// complex output
_dst.create(src.size(), CV_32FC2);
output = _dst.getUMat();
}
else
{
// real output
if (is1d)
{
_dst.create(src.size(), CV_32FC1);
output = _dst.getUMat();
}
else
{
_dst.create(src.size(), CV_32FC1);
output.create(src.size(), CV_32FC2);
}
}
if (!inv)
{
if (!ocl_dft_rows(src, output, nonzero_rows, flags, fftType))
return false;
if (!is1d)
{
int nonzero_cols = fftType == R2R ? output.cols/2 + 1 : output.cols;
if (!ocl_dft_cols(output, _dst, nonzero_cols, flags, fftType))
return false;
}
}
else
{
if (fftType == C2C)
{
// complex output
if (!ocl_dft_rows(src, output, nonzero_rows, flags, fftType))
return false;
if (!is1d)
{
if (!ocl_dft_cols(output, output, output.cols, flags, fftType))
return false;
}
}
else
{
if (is1d)
{
if (!ocl_dft_rows(src, output, nonzero_rows, flags, fftType))
return false;
}
else
{
int nonzero_cols = src.cols/2 + 1;
if (!ocl_dft_cols(src, output, nonzero_cols, flags, fftType))
return false;
if (!ocl_dft_rows(output, _dst, nonzero_rows, flags, fftType))
return false;
}
}
}
return true;
}
} // namespace cv;
#endif
#ifdef HAVE_CLAMDFFT
namespace cv {
@@ -1791,14 +2161,6 @@ namespace cv {
CV_Assert(s == CLFFT_SUCCESS); \
}
enum FftType
{
R2R = 0, // real to real
C2R = 1, // opencl HERMITIAN_INTERLEAVED to real
R2C = 2, // real to opencl HERMITIAN_INTERLEAVED
C2C = 3 // complex to complex
};
class PlanCache
{
struct FftPlan
@@ -1923,7 +2285,7 @@ public:
}
// no baked plan is found, so let's create a new one
FftPlan * newPlan = new FftPlan(dft_size, src_step, dst_step, doubleFP, inplace, flags, fftType);
Ptr<FftPlan> newPlan = Ptr<FftPlan>(new FftPlan(dft_size, src_step, dst_step, doubleFP, inplace, flags, fftType));
planStorage.push_back(newPlan);
return newPlan->plHandle;
@@ -1931,8 +2293,6 @@ public:
~PlanCache()
{
for (std::vector<FftPlan *>::iterator i = planStorage.begin(), end = planStorage.end(); i != end; ++i)
delete (*i);
planStorage.clear();
}
@@ -1942,7 +2302,7 @@ protected:
{
}
std::vector<FftPlan *> planStorage;
std::vector<Ptr<FftPlan> > planStorage;
};
extern "C" {
@@ -1960,7 +2320,7 @@ static void CL_CALLBACK oclCleanupCallback(cl_event e, cl_int, void *p)
}
static bool ocl_dft(InputArray _src, OutputArray _dst, int flags)
static bool ocl_dft_amdfft(InputArray _src, OutputArray _dst, int flags)
{
int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
Size ssize = _src.size();
@@ -2019,7 +2379,6 @@ static bool ocl_dft(InputArray _src, OutputArray _dst, int flags)
tmpBuffer.addref();
clSetEventCallback(e, CL_COMPLETE, oclCleanupCallback, tmpBuffer.u);
return true;
}
@@ -2034,7 +2393,12 @@ void cv::dft( InputArray _src0, OutputArray _dst, int flags, int nonzero_rows )
#ifdef HAVE_CLAMDFFT
CV_OCL_RUN(ocl::haveAmdFft() && ocl::Device::getDefault().type() != ocl::Device::TYPE_CPU &&
_dst.isUMat() && _src0.dims() <= 2 && nonzero_rows == 0,
ocl_dft(_src0, _dst, flags))
ocl_dft_amdfft(_src0, _dst, flags))
#endif
#ifdef HAVE_OPENCL
CV_OCL_RUN(_dst.isUMat() && _src0.dims() <= 2,
ocl_dft(_src0, _dst, flags, nonzero_rows))
#endif
static DFTFunc dft_tbl[6] =
@@ -2046,10 +2410,8 @@ void cv::dft( InputArray _src0, OutputArray _dst, int flags, int nonzero_rows )
(DFTFunc)RealDFT_64f,
(DFTFunc)CCSIDFT_64f
};
AutoBuffer<uchar> buf;
void *spec = 0;
Mat src0 = _src0.getMat(), src = src0;
int prev_len = 0, stage = 0;
bool inv = (flags & DFT_INVERSE) != 0;
@@ -2080,32 +2442,32 @@ void cv::dft( InputArray _src0, OutputArray _dst, int flags, int nonzero_rows )
{
if ((flags & DFT_ROWS) == 0)
{
if (!real_transform)
if (src.channels() == 2 && !(inv && (flags & DFT_REAL_OUTPUT)))
{
if (ippi_DFT_C_32F(src,dst, inv, ipp_norm_flag))
if (ippi_DFT_C_32F(src, dst, inv, ipp_norm_flag))
return;
setIppErrorStatus();
}
else if (inv || !(flags & DFT_COMPLEX_OUTPUT))
if (src.channels() == 1 && (inv || !(flags & DFT_COMPLEX_OUTPUT)))
{
if (ippi_DFT_R_32F(src,dst, inv, ipp_norm_flag))
if (ippi_DFT_R_32F(src, dst, inv, ipp_norm_flag))
return;
setIppErrorStatus();
}
}
else
{
if (!real_transform)
if (src.channels() == 2 && !(inv && (flags & DFT_REAL_OUTPUT)))
{
ippiDFT_C_Func ippiFunc = inv ? (ippiDFT_C_Func)ippiDFTInv_CToC_32fc_C1R : (ippiDFT_C_Func)ippiDFTFwd_CToC_32fc_C1R;
if (Dft_C_IPPLoop(src,dst, IPPDFT_C_Functor(ippiFunc),ipp_norm_flag))
if (Dft_C_IPPLoop(src, dst, IPPDFT_C_Functor(ippiFunc),ipp_norm_flag))
return;
setIppErrorStatus();
}
else if (inv || !(flags & DFT_COMPLEX_OUTPUT))
if (src.channels() == 1 && (inv || !(flags & DFT_COMPLEX_OUTPUT)))
{
ippiDFT_R_Func ippiFunc = inv ? (ippiDFT_R_Func)ippiDFTInv_PackToR_32f_C1R : (ippiDFT_R_Func)ippiDFTFwd_RToPack_32f_C1R;
if (Dft_R_IPPLoop(src,dst, IPPDFT_R_Functor(ippiFunc),ipp_norm_flag))
if (Dft_R_IPPLoop(src, dst, IPPDFT_R_Functor(ippiFunc),ipp_norm_flag))
return;
setIppErrorStatus();
}

View File

@@ -348,7 +348,18 @@ static void InvSqrt_32f(const float* src, float* dst, int len)
static void InvSqrt_64f(const double* src, double* dst, int len)
{
for( int i = 0; i < len; i++ )
int i = 0;
#if CV_SSE2
if (USE_SSE2)
{
__m128d v_1 = _mm_set1_pd(1.0);
for ( ; i <= len - 2; i += 2)
_mm_storeu_pd(dst + i, _mm_div_pd(v_1, _mm_sqrt_pd(_mm_loadu_pd(src + i))));
}
#endif
for( ; i < len; i++ )
dst[i] = 1/std::sqrt(src[i]);
}
@@ -2543,12 +2554,33 @@ void patchNaNs( InputOutputArray _a, double _val )
NAryMatIterator it(arrays, (uchar**)ptrs);
size_t len = it.size*a.channels();
Cv32suf val;
val.f = (float)_val;
float fval = (float)_val;
val.f = fval;
#if CV_SSE2
__m128i v_mask1 = _mm_set1_epi32(0x7fffffff), v_mask2 = _mm_set1_epi32(0x7f800000);
__m128i v_val = _mm_set1_epi32(val.i);
#endif
for( size_t i = 0; i < it.nplanes; i++, ++it )
{
int* tptr = ptrs[0];
for( size_t j = 0; j < len; j++ )
size_t j = 0;
#if CV_SSE2
if (USE_SSE2)
{
for ( ; j < len; j += 4)
{
__m128i v_src = _mm_loadu_si128((__m128i const *)(tptr + j));
__m128i v_cmp_mask = _mm_cmplt_epi32(v_mask2, _mm_and_si128(v_src, v_mask1));
__m128i v_res = _mm_or_si128(_mm_andnot_si128(v_cmp_mask, v_src), _mm_and_si128(v_cmp_mask, v_val));
_mm_storeu_si128((__m128i *)(tptr + j), v_res);
}
}
#endif
for( ; j < len; j++ )
if( (tptr[j] & 0x7fffffff) > 0x7f800000 )
tptr[j] = val.i;
}

View File

@@ -2758,21 +2758,30 @@ namespace cv {
static bool ocl_setIdentity( InputOutputArray _m, const Scalar& s )
{
int type = _m.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type),
sctype = CV_MAKE_TYPE(depth, cn == 3 ? 4 : cn),
int type = _m.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type), kercn = cn;
if (cn == 1)
{
kercn = std::min(ocl::predictOptimalVectorWidth(_m), 4);
if (kercn != 4)
kercn = 1;
}
int sctype = CV_MAKE_TYPE(depth, cn == 3 ? 4 : cn),
rowsPerWI = ocl::Device::getDefault().isIntel() ? 4 : 1;
ocl::Kernel k("setIdentity", ocl::core::set_identity_oclsrc,
format("-D T=%s -D T1=%s -D cn=%d -D ST=%s", ocl::memopTypeToStr(type),
ocl::memopTypeToStr(depth), cn, ocl::memopTypeToStr(sctype)));
format("-D T=%s -D T1=%s -D cn=%d -D ST=%s -D kercn=%d -D rowsPerWI=%d",
ocl::memopTypeToStr(CV_MAKE_TYPE(depth, kercn)),
ocl::memopTypeToStr(depth), cn,
ocl::memopTypeToStr(sctype),
kercn, rowsPerWI));
if (k.empty())
return false;
UMat m = _m.getUMat();
k.args(ocl::KernelArg::WriteOnly(m), ocl::KernelArg::Constant(Mat(1, 1, sctype, s)),
rowsPerWI);
k.args(ocl::KernelArg::WriteOnly(m, cn, kercn),
ocl::KernelArg::Constant(Mat(1, 1, sctype, s)));
size_t globalsize[2] = { m.cols, (m.rows + rowsPerWI - 1) / rowsPerWI };
size_t globalsize[2] = { m.cols * cn / kercn, (m.rows + rowsPerWI - 1) / rowsPerWI };
return k.run(2, globalsize, NULL, false);
}
@@ -3327,7 +3336,7 @@ static inline void reduceSumC_8u16u16s32f_64f(const cv::Mat& srcmat, cv::Mat& ds
stype == CV_32FC3 ? (ippiSumHint)ippiSum_32f_C3R :
stype == CV_32FC4 ? (ippiSumHint)ippiSum_32f_C4R : 0;
func =
sdepth == CV_8U ? (cv::ReduceFunc)cv::reduceC_<uchar, double, cv::OpAdd<double> > :
sdepth == CV_8U ? (cv::ReduceFunc)cv::reduceC_<uchar, double, cv::OpAdd<double> > :
sdepth == CV_16U ? (cv::ReduceFunc)cv::reduceC_<ushort, double, cv::OpAdd<double> > :
sdepth == CV_16S ? (cv::ReduceFunc)cv::reduceC_<short, double, cv::OpAdd<double> > :
sdepth == CV_32F ? (cv::ReduceFunc)cv::reduceC_<float, double, cv::OpAdd<double> > : 0;
@@ -3441,12 +3450,18 @@ static bool ocl_reduce(InputArray _src, OutputArray _dst,
const int min_opt_cols = 128, buf_cols = 32;
int sdepth = CV_MAT_DEPTH(stype), cn = CV_MAT_CN(stype),
ddepth = CV_MAT_DEPTH(dtype), ddepth0 = ddepth;
bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0,
useOptimized = 1 == dim && _src.cols() > min_opt_cols;
const ocl::Device &defDev = ocl::Device::getDefault();
bool doubleSupport = defDev.doubleFPConfig() > 0;
size_t wgs = defDev.maxWorkGroupSize();
bool useOptimized = 1 == dim && _src.cols() > min_opt_cols && (wgs >= buf_cols);
if (!doubleSupport && (sdepth == CV_64F || ddepth == CV_64F))
return false;
if ((op == CV_REDUCE_SUM && sdepth == CV_32F) || op == CV_REDUCE_MIN || op == CV_REDUCE_MAX)
return false;
if (op == CV_REDUCE_AVG)
{
if (sdepth < CV_32S && ddepth < CV_32S)
@@ -3455,78 +3470,80 @@ static bool ocl_reduce(InputArray _src, OutputArray _dst,
const char * const ops[4] = { "OCL_CV_REDUCE_SUM", "OCL_CV_REDUCE_AVG",
"OCL_CV_REDUCE_MAX", "OCL_CV_REDUCE_MIN" };
char cvt[2][40];
int wdepth = std::max(ddepth, CV_32F);
cv::String build_opt = format("-D %s -D dim=%d -D cn=%d -D ddepth=%d"
" -D srcT=%s -D dstT=%s -D dstT0=%s -D convertToWT=%s"
" -D convertToDT=%s -D convertToDT0=%s%s",
ops[op], dim, cn, ddepth, ocl::typeToStr(useOptimized ? ddepth : sdepth),
ocl::typeToStr(ddepth), ocl::typeToStr(ddepth0),
ocl::convertTypeStr(ddepth, wdepth, 1, cvt[0]),
ocl::convertTypeStr(sdepth, ddepth, 1, cvt[0]),
ocl::convertTypeStr(wdepth, ddepth0, 1, cvt[1]),
doubleSupport ? " -D DOUBLE_SUPPORT" : "");
if (useOptimized)
{
cv::String build_opt_pre = format("-D OP_REDUCE_PRE -D BUF_COLS=%d -D %s -D dim=1"
" -D cn=%d -D ddepth=%d -D srcT=%s -D dstT=%s -D convertToDT=%s%s",
buf_cols, ops[op], cn, ddepth, ocl::typeToStr(sdepth), ocl::typeToStr(ddepth),
ocl::convertTypeStr(sdepth, ddepth, 1, cvt[0]),
doubleSupport ? " -D DOUBLE_SUPPORT" : "");
ocl::Kernel kpre("reduce_horz_pre", ocl::core::reduce2_oclsrc, build_opt_pre);
if (kpre.empty())
size_t tileHeight = (size_t)(wgs / buf_cols);
if (defDev.isIntel())
{
static const size_t maxItemInGroupCount = 16;
tileHeight = min(tileHeight, defDev.localMemSize() / buf_cols / CV_ELEM_SIZE(CV_MAKETYPE(wdepth, cn)) / maxItemInGroupCount);
}
char cvt[3][40];
cv::String build_opt = format("-D OP_REDUCE_PRE -D BUF_COLS=%d -D TILE_HEIGHT=%d -D %s -D dim=1"
" -D cn=%d -D ddepth=%d"
" -D srcT=%s -D bufT=%s -D dstT=%s"
" -D convertToWT=%s -D convertToBufT=%s -D convertToDT=%s%s",
buf_cols, tileHeight, ops[op], cn, ddepth,
ocl::typeToStr(sdepth),
ocl::typeToStr(ddepth),
ocl::typeToStr(ddepth0),
ocl::convertTypeStr(ddepth, wdepth, 1, cvt[0]),
ocl::convertTypeStr(sdepth, ddepth, 1, cvt[1]),
ocl::convertTypeStr(wdepth, ddepth0, 1, cvt[2]),
doubleSupport ? " -D DOUBLE_SUPPORT" : "");
ocl::Kernel k("reduce_horz_opt", ocl::core::reduce2_oclsrc, build_opt);
if (k.empty())
return false;
ocl::Kernel kmain("reduce", ocl::core::reduce2_oclsrc, build_opt);
if (kmain.empty())
return false;
UMat src = _src.getUMat();
Size dsize(1, src.rows);
_dst.create(dsize, dtype);
UMat dst = _dst.getUMat();
UMat buf(src.rows, buf_cols, dst.type());
kpre.args(ocl::KernelArg::ReadOnly(src),
ocl::KernelArg::WriteOnlyNoSize(buf));
if (op0 == CV_REDUCE_AVG)
k.args(ocl::KernelArg::ReadOnly(src),
ocl::KernelArg::WriteOnlyNoSize(dst), 1.0f / src.cols);
else
k.args(ocl::KernelArg::ReadOnly(src),
ocl::KernelArg::WriteOnlyNoSize(dst));
size_t localSize[2] = { buf_cols, tileHeight};
size_t globalSize[2] = { buf_cols, src.rows };
if (!kpre.run(2, globalSize, NULL, false))
return k.run(2, globalSize, localSize, false);
}
else
{
char cvt[2][40];
cv::String build_opt = format("-D %s -D dim=%d -D cn=%d -D ddepth=%d"
" -D srcT=%s -D dstT=%s -D dstT0=%s -D convertToWT=%s"
" -D convertToDT=%s -D convertToDT0=%s%s",
ops[op], dim, cn, ddepth, ocl::typeToStr(useOptimized ? ddepth : sdepth),
ocl::typeToStr(ddepth), ocl::typeToStr(ddepth0),
ocl::convertTypeStr(ddepth, wdepth, 1, cvt[0]),
ocl::convertTypeStr(sdepth, ddepth, 1, cvt[0]),
ocl::convertTypeStr(wdepth, ddepth0, 1, cvt[1]),
doubleSupport ? " -D DOUBLE_SUPPORT" : "");
ocl::Kernel k("reduce", ocl::core::reduce2_oclsrc, build_opt);
if (k.empty())
return false;
UMat src = _src.getUMat();
Size dsize(dim == 0 ? src.cols : 1, dim == 0 ? 1 : src.rows);
_dst.create(dsize, dtype);
UMat dst = _dst.getUMat();
ocl::KernelArg srcarg = ocl::KernelArg::ReadOnly(src),
temparg = ocl::KernelArg::WriteOnlyNoSize(dst);
if (op0 == CV_REDUCE_AVG)
kmain.args(ocl::KernelArg::ReadOnly(buf),
ocl::KernelArg::WriteOnlyNoSize(dst), 1.0f / src.cols);
k.args(srcarg, temparg, 1.0f / (dim == 0 ? src.rows : src.cols));
else
kmain.args(ocl::KernelArg::ReadOnly(buf),
ocl::KernelArg::WriteOnlyNoSize(dst));
k.args(srcarg, temparg);
globalSize[0] = src.rows;
return kmain.run(1, globalSize, NULL, false);
size_t globalsize = std::max(dsize.width, dsize.height);
return k.run(1, &globalsize, NULL, false);
}
ocl::Kernel k("reduce", ocl::core::reduce2_oclsrc, build_opt);
if (k.empty())
return false;
UMat src = _src.getUMat();
Size dsize(dim == 0 ? src.cols : 1, dim == 0 ? 1 : src.rows);
_dst.create(dsize, dtype);
UMat dst = _dst.getUMat();
ocl::KernelArg srcarg = ocl::KernelArg::ReadOnly(src),
temparg = ocl::KernelArg::WriteOnlyNoSize(dst);
if (op0 == CV_REDUCE_AVG)
k.args(srcarg, temparg, 1.0f / (dim == 0 ? src.rows : src.cols));
else
k.args(srcarg, temparg);
size_t globalsize = std::max(dsize.width, dsize.height);
return k.run(1, &globalsize, NULL, false);
}
}

View File

@@ -3494,9 +3494,8 @@ public:
OpenCLBufferPoolImpl()
: currentReservedSize(0), maxReservedSize(0)
{
// Note: Buffer pool is disabled by default,
// because we didn't receive significant performance improvement
maxReservedSize = getConfigurationParameterForSize("OPENCV_OPENCL_BUFFERPOOL_LIMIT", 0);
int poolSize = ocl::Device::getDefault().isIntel() ? 1 << 27 : 0;
maxReservedSize = getConfigurationParameterForSize("OPENCV_OPENCL_BUFFERPOOL_LIMIT", poolSize);
}
virtual ~OpenCLBufferPoolImpl()
{
@@ -3739,6 +3738,7 @@ public:
u->handle = clCreateBuffer(ctx_handle, CL_MEM_COPY_HOST_PTR|CL_MEM_READ_WRITE|createFlags,
u->size, u->origdata, &retval);
tempUMatFlags = UMatData::TEMP_COPIED_UMAT;
}
if(!u->handle || retval != CL_SUCCESS)
return false;
@@ -3880,6 +3880,7 @@ public:
if(u->data && retval == CL_SUCCESS)
{
u->markHostCopyObsolete(false);
u->markDeviceMemMapped(true);
return;
}
@@ -3908,6 +3909,7 @@ public:
if(!u)
return;
CV_Assert(u->handle != 0);
UMatDataAutoLock autolock(u);
@@ -3918,8 +3920,10 @@ public:
cl_command_queue q = (cl_command_queue)Queue::getDefault().ptr();
cl_int retval = 0;
if( !u->copyOnMap() && u->data )
if( !u->copyOnMap() && u->deviceMemMapped() )
{
CV_Assert(u->data != NULL);
u->markDeviceMemMapped(false);
CV_Assert( (retval = clEnqueueUnmapMemObject(q,
(cl_mem)u->handle, u->data, 0, 0, 0)) == CL_SUCCESS );
CV_OclDbgAssert(clFinish(q) == CL_SUCCESS);

View File

@@ -0,0 +1,864 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
// Copyright (C) 2014, Itseez, Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
#define SQRT_2 0.707106781188f
#define sin_120 0.866025403784f
#define fft5_2 0.559016994374f
#define fft5_3 -0.951056516295f
#define fft5_4 -1.538841768587f
#define fft5_5 0.363271264002f
__attribute__((always_inline))
float2 mul_float2(float2 a, float2 b) {
return (float2)(fma(a.x, b.x, -a.y * b.y), fma(a.x, b.y, a.y * b.x));
}
__attribute__((always_inline))
float2 twiddle(float2 a) {
return (float2)(a.y, -a.x);
}
__attribute__((always_inline))
void butterfly2(float2 a0, float2 a1, __local float2* smem, __global const float2* twiddles,
const int x, const int block_size)
{
const int k = x & (block_size - 1);
a1 = mul_float2(twiddles[k], a1);
const int dst_ind = (x << 1) - k;
smem[dst_ind] = a0 + a1;
smem[dst_ind+block_size] = a0 - a1;
}
__attribute__((always_inline))
void butterfly4(float2 a0, float2 a1, float2 a2, float2 a3, __local float2* smem, __global const float2* twiddles,
const int x, const int block_size)
{
const int k = x & (block_size - 1);
a1 = mul_float2(twiddles[k], a1);
a2 = mul_float2(twiddles[k + block_size], a2);
a3 = mul_float2(twiddles[k + 2*block_size], a3);
const int dst_ind = ((x - k) << 2) + k;
float2 b0 = a0 + a2;
a2 = a0 - a2;
float2 b1 = a1 + a3;
a3 = twiddle(a1 - a3);
smem[dst_ind] = b0 + b1;
smem[dst_ind + block_size] = a2 + a3;
smem[dst_ind + 2*block_size] = b0 - b1;
smem[dst_ind + 3*block_size] = a2 - a3;
}
__attribute__((always_inline))
void butterfly3(float2 a0, float2 a1, float2 a2, __local float2* smem, __global const float2* twiddles,
const int x, const int block_size)
{
const int k = x % block_size;
a1 = mul_float2(twiddles[k], a1);
a2 = mul_float2(twiddles[k+block_size], a2);
const int dst_ind = ((x - k) * 3) + k;
float2 b1 = a1 + a2;
a2 = twiddle(sin_120*(a1 - a2));
float2 b0 = a0 - (float2)(0.5f)*b1;
smem[dst_ind] = a0 + b1;
smem[dst_ind + block_size] = b0 + a2;
smem[dst_ind + 2*block_size] = b0 - a2;
}
__attribute__((always_inline))
void butterfly5(float2 a0, float2 a1, float2 a2, float2 a3, float2 a4, __local float2* smem, __global const float2* twiddles,
const int x, const int block_size)
{
const int k = x % block_size;
a1 = mul_float2(twiddles[k], a1);
a2 = mul_float2(twiddles[k + block_size], a2);
a3 = mul_float2(twiddles[k+2*block_size], a3);
a4 = mul_float2(twiddles[k+3*block_size], a4);
const int dst_ind = ((x - k) * 5) + k;
__local float2* dst = smem + dst_ind;
float2 b0, b1, b5;
b1 = a1 + a4;
a1 -= a4;
a4 = a3 + a2;
a3 -= a2;
a2 = b1 + a4;
b0 = a0 - (float2)0.25f * a2;
b1 = fft5_2 * (b1 - a4);
a4 = fft5_3 * (float2)(-a1.y - a3.y, a1.x + a3.x);
b5 = (float2)(a4.x - fft5_5 * a1.y, a4.y + fft5_5 * a1.x);
a4.x += fft5_4 * a3.y;
a4.y -= fft5_4 * a3.x;
a1 = b0 + b1;
b0 -= b1;
dst[0] = a0 + a2;
dst[block_size] = a1 + a4;
dst[2 * block_size] = b0 + b5;
dst[3 * block_size] = b0 - b5;
dst[4 * block_size] = a1 - a4;
}
__attribute__((always_inline))
void fft_radix2(__local float2* smem, __global const float2* twiddles, const int x, const int block_size, const int t)
{
float2 a0, a1;
if (x < t)
{
a0 = smem[x];
a1 = smem[x+t];
}
barrier(CLK_LOCAL_MEM_FENCE);
if (x < t)
butterfly2(a0, a1, smem, twiddles, x, block_size);
barrier(CLK_LOCAL_MEM_FENCE);
}
__attribute__((always_inline))
void fft_radix2_B2(__local float2* smem, __global const float2* twiddles, const int x1, const int block_size, const int t)
{
const int x2 = x1 + t/2;
float2 a0, a1, a2, a3;
if (x1 < t/2)
{
a0 = smem[x1]; a1 = smem[x1+t];
a2 = smem[x2]; a3 = smem[x2+t];
}
barrier(CLK_LOCAL_MEM_FENCE);
if (x1 < t/2)
{
butterfly2(a0, a1, smem, twiddles, x1, block_size);
butterfly2(a2, a3, smem, twiddles, x2, block_size);
}
barrier(CLK_LOCAL_MEM_FENCE);
}
__attribute__((always_inline))
void fft_radix2_B3(__local float2* smem, __global const float2* twiddles, const int x1, const int block_size, const int t)
{
const int x2 = x1 + t/3;
const int x3 = x1 + 2*t/3;
float2 a0, a1, a2, a3, a4, a5;
if (x1 < t/3)
{
a0 = smem[x1]; a1 = smem[x1+t];
a2 = smem[x2]; a3 = smem[x2+t];
a4 = smem[x3]; a5 = smem[x3+t];
}
barrier(CLK_LOCAL_MEM_FENCE);
if (x1 < t/3)
{
butterfly2(a0, a1, smem, twiddles, x1, block_size);
butterfly2(a2, a3, smem, twiddles, x2, block_size);
butterfly2(a4, a5, smem, twiddles, x3, block_size);
}
barrier(CLK_LOCAL_MEM_FENCE);
}
__attribute__((always_inline))
void fft_radix2_B4(__local float2* smem, __global const float2* twiddles, const int x1, const int block_size, const int t)
{
const int thread_block = t/4;
const int x2 = x1 + thread_block;
const int x3 = x1 + 2*thread_block;
const int x4 = x1 + 3*thread_block;
float2 a0, a1, a2, a3, a4, a5, a6, a7;
if (x1 < t/4)
{
a0 = smem[x1]; a1 = smem[x1+t];
a2 = smem[x2]; a3 = smem[x2+t];
a4 = smem[x3]; a5 = smem[x3+t];
a6 = smem[x4]; a7 = smem[x4+t];
}
barrier(CLK_LOCAL_MEM_FENCE);
if (x1 < t/4)
{
butterfly2(a0, a1, smem, twiddles, x1, block_size);
butterfly2(a2, a3, smem, twiddles, x2, block_size);
butterfly2(a4, a5, smem, twiddles, x3, block_size);
butterfly2(a6, a7, smem, twiddles, x4, block_size);
}
barrier(CLK_LOCAL_MEM_FENCE);
}
__attribute__((always_inline))
void fft_radix2_B5(__local float2* smem, __global const float2* twiddles, const int x1, const int block_size, const int t)
{
const int thread_block = t/5;
const int x2 = x1 + thread_block;
const int x3 = x1 + 2*thread_block;
const int x4 = x1 + 3*thread_block;
const int x5 = x1 + 4*thread_block;
float2 a0, a1, a2, a3, a4, a5, a6, a7, a8, a9;
if (x1 < t/5)
{
a0 = smem[x1]; a1 = smem[x1+t];
a2 = smem[x2]; a3 = smem[x2+t];
a4 = smem[x3]; a5 = smem[x3+t];
a6 = smem[x4]; a7 = smem[x4+t];
a8 = smem[x5]; a9 = smem[x5+t];
}
barrier(CLK_LOCAL_MEM_FENCE);
if (x1 < t/5)
{
butterfly2(a0, a1, smem, twiddles, x1, block_size);
butterfly2(a2, a3, smem, twiddles, x2, block_size);
butterfly2(a4, a5, smem, twiddles, x3, block_size);
butterfly2(a6, a7, smem, twiddles, x4, block_size);
butterfly2(a8, a9, smem, twiddles, x5, block_size);
}
barrier(CLK_LOCAL_MEM_FENCE);
}
__attribute__((always_inline))
void fft_radix4(__local float2* smem, __global const float2* twiddles, const int x, const int block_size, const int t)
{
float2 a0, a1, a2, a3;
if (x < t)
{
a0 = smem[x]; a1 = smem[x+t]; a2 = smem[x+2*t]; a3 = smem[x+3*t];
}
barrier(CLK_LOCAL_MEM_FENCE);
if (x < t)
butterfly4(a0, a1, a2, a3, smem, twiddles, x, block_size);
barrier(CLK_LOCAL_MEM_FENCE);
}
__attribute__((always_inline))
void fft_radix4_B2(__local float2* smem, __global const float2* twiddles, const int x1, const int block_size, const int t)
{
const int x2 = x1 + t/2;
float2 a0, a1, a2, a3, a4, a5, a6, a7;
if (x1 < t/2)
{
a0 = smem[x1]; a1 = smem[x1+t]; a2 = smem[x1+2*t]; a3 = smem[x1+3*t];
a4 = smem[x2]; a5 = smem[x2+t]; a6 = smem[x2+2*t]; a7 = smem[x2+3*t];
}
barrier(CLK_LOCAL_MEM_FENCE);
if (x1 < t/2)
{
butterfly4(a0, a1, a2, a3, smem, twiddles, x1, block_size);
butterfly4(a4, a5, a6, a7, smem, twiddles, x2, block_size);
}
barrier(CLK_LOCAL_MEM_FENCE);
}
__attribute__((always_inline))
void fft_radix4_B3(__local float2* smem, __global const float2* twiddles, const int x1, const int block_size, const int t)
{
const int x2 = x1 + t/3;
const int x3 = x2 + t/3;
float2 a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11;
if (x1 < t/3)
{
a0 = smem[x1]; a1 = smem[x1+t]; a2 = smem[x1+2*t]; a3 = smem[x1+3*t];
a4 = smem[x2]; a5 = smem[x2+t]; a6 = smem[x2+2*t]; a7 = smem[x2+3*t];
a8 = smem[x3]; a9 = smem[x3+t]; a10 = smem[x3+2*t]; a11 = smem[x3+3*t];
}
barrier(CLK_LOCAL_MEM_FENCE);
if (x1 < t/3)
{
butterfly4(a0, a1, a2, a3, smem, twiddles, x1, block_size);
butterfly4(a4, a5, a6, a7, smem, twiddles, x2, block_size);
butterfly4(a8, a9, a10, a11, smem, twiddles, x3, block_size);
}
barrier(CLK_LOCAL_MEM_FENCE);
}
__attribute__((always_inline))
void fft_radix8(__local float2* smem, __global const float2* twiddles, const int x, const int block_size, const int t)
{
const int k = x % block_size;
float2 a0, a1, a2, a3, a4, a5, a6, a7;
if (x < t)
{
int tw_ind = block_size / 8;
a0 = smem[x];
a1 = mul_float2(twiddles[k], smem[x + t]);
a2 = mul_float2(twiddles[k + block_size],smem[x+2*t]);
a3 = mul_float2(twiddles[k+2*block_size],smem[x+3*t]);
a4 = mul_float2(twiddles[k+3*block_size],smem[x+4*t]);
a5 = mul_float2(twiddles[k+4*block_size],smem[x+5*t]);
a6 = mul_float2(twiddles[k+5*block_size],smem[x+6*t]);
a7 = mul_float2(twiddles[k+6*block_size],smem[x+7*t]);
float2 b0, b1, b6, b7;
b0 = a0 + a4;
a4 = a0 - a4;
b1 = a1 + a5;
a5 = a1 - a5;
a5 = (float2)(SQRT_2) * (float2)(a5.x + a5.y, -a5.x + a5.y);
b6 = twiddle(a2 - a6);
a2 = a2 + a6;
b7 = a3 - a7;
b7 = (float2)(SQRT_2) * (float2)(-b7.x + b7.y, -b7.x - b7.y);
a3 = a3 + a7;
a0 = b0 + a2;
a2 = b0 - a2;
a1 = b1 + a3;
a3 = twiddle(b1 - a3);
a6 = a4 - b6;
a4 = a4 + b6;
a7 = twiddle(a5 - b7);
a5 = a5 + b7;
}
barrier(CLK_LOCAL_MEM_FENCE);
if (x < t)
{
const int dst_ind = ((x - k) << 3) + k;
__local float2* dst = smem + dst_ind;
dst[0] = a0 + a1;
dst[block_size] = a4 + a5;
dst[2 * block_size] = a2 + a3;
dst[3 * block_size] = a6 + a7;
dst[4 * block_size] = a0 - a1;
dst[5 * block_size] = a4 - a5;
dst[6 * block_size] = a2 - a3;
dst[7 * block_size] = a6 - a7;
}
barrier(CLK_LOCAL_MEM_FENCE);
}
__attribute__((always_inline))
void fft_radix3(__local float2* smem, __global const float2* twiddles, const int x, const int block_size, const int t)
{
float2 a0, a1, a2;
if (x < t)
{
a0 = smem[x]; a1 = smem[x+t]; a2 = smem[x+2*t];
}
barrier(CLK_LOCAL_MEM_FENCE);
if (x < t)
butterfly3(a0, a1, a2, smem, twiddles, x, block_size);
barrier(CLK_LOCAL_MEM_FENCE);
}
__attribute__((always_inline))
void fft_radix3_B2(__local float2* smem, __global const float2* twiddles, const int x1, const int block_size, const int t)
{
const int x2 = x1 + t/2;
float2 a0, a1, a2, a3, a4, a5;
if (x1 < t/2)
{
a0 = smem[x1]; a1 = smem[x1+t]; a2 = smem[x1+2*t];
a3 = smem[x2]; a4 = smem[x2+t]; a5 = smem[x2+2*t];
}
barrier(CLK_LOCAL_MEM_FENCE);
if (x1 < t/2)
{
butterfly3(a0, a1, a2, smem, twiddles, x1, block_size);
butterfly3(a3, a4, a5, smem, twiddles, x2, block_size);
}
barrier(CLK_LOCAL_MEM_FENCE);
}
__attribute__((always_inline))
void fft_radix3_B3(__local float2* smem, __global const float2* twiddles, const int x1, const int block_size, const int t)
{
const int x2 = x1 + t/3;
const int x3 = x2 + t/3;
float2 a0, a1, a2, a3, a4, a5, a6, a7, a8;
if (x1 < t/2)
{
a0 = smem[x1]; a1 = smem[x1+t]; a2 = smem[x1+2*t];
a3 = smem[x2]; a4 = smem[x2+t]; a5 = smem[x2+2*t];
a6 = smem[x3]; a7 = smem[x3+t]; a8 = smem[x3+2*t];
}
barrier(CLK_LOCAL_MEM_FENCE);
if (x1 < t/2)
{
butterfly3(a0, a1, a2, smem, twiddles, x1, block_size);
butterfly3(a3, a4, a5, smem, twiddles, x2, block_size);
butterfly3(a6, a7, a8, smem, twiddles, x3, block_size);
}
barrier(CLK_LOCAL_MEM_FENCE);
}
__attribute__((always_inline))
void fft_radix3_B4(__local float2* smem, __global const float2* twiddles, const int x1, const int block_size, const int t)
{
const int thread_block = t/4;
const int x2 = x1 + thread_block;
const int x3 = x1 + 2*thread_block;
const int x4 = x1 + 3*thread_block;
float2 a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11;
if (x1 < t/4)
{
a0 = smem[x1]; a1 = smem[x1+t]; a2 = smem[x1+2*t];
a3 = smem[x2]; a4 = smem[x2+t]; a5 = smem[x2+2*t];
a6 = smem[x3]; a7 = smem[x3+t]; a8 = smem[x3+2*t];
a9 = smem[x4]; a10 = smem[x4+t]; a11 = smem[x4+2*t];
}
barrier(CLK_LOCAL_MEM_FENCE);
if (x1 < t/4)
{
butterfly3(a0, a1, a2, smem, twiddles, x1, block_size);
butterfly3(a3, a4, a5, smem, twiddles, x2, block_size);
butterfly3(a6, a7, a8, smem, twiddles, x3, block_size);
butterfly3(a9, a10, a11, smem, twiddles, x4, block_size);
}
barrier(CLK_LOCAL_MEM_FENCE);
}
__attribute__((always_inline))
void fft_radix5(__local float2* smem, __global const float2* twiddles, const int x, const int block_size, const int t)
{
const int k = x % block_size;
float2 a0, a1, a2, a3, a4;
if (x < t)
{
a0 = smem[x]; a1 = smem[x + t]; a2 = smem[x+2*t]; a3 = smem[x+3*t]; a4 = smem[x+4*t];
}
barrier(CLK_LOCAL_MEM_FENCE);
if (x < t)
butterfly5(a0, a1, a2, a3, a4, smem, twiddles, x, block_size);
barrier(CLK_LOCAL_MEM_FENCE);
}
__attribute__((always_inline))
void fft_radix5_B2(__local float2* smem, __global const float2* twiddles, const int x1, const int block_size, const int t)
{
const int x2 = x1+t/2;
float2 a0, a1, a2, a3, a4, a5, a6, a7, a8, a9;
if (x1 < t/2)
{
a0 = smem[x1]; a1 = smem[x1 + t]; a2 = smem[x1+2*t]; a3 = smem[x1+3*t]; a4 = smem[x1+4*t];
a5 = smem[x2]; a6 = smem[x2 + t]; a7 = smem[x2+2*t]; a8 = smem[x2+3*t]; a9 = smem[x2+4*t];
}
barrier(CLK_LOCAL_MEM_FENCE);
if (x1 < t/2)
{
butterfly5(a0, a1, a2, a3, a4, smem, twiddles, x1, block_size);
butterfly5(a5, a6, a7, a8, a9, smem, twiddles, x2, block_size);
}
barrier(CLK_LOCAL_MEM_FENCE);
}
#ifdef DFT_SCALE
#define SCALE_VAL(x, scale) x*scale
#else
#define SCALE_VAL(x, scale) x
#endif
__kernel void fft_multi_radix_rows(__global const uchar* src_ptr, int src_step, int src_offset, int src_rows, int src_cols,
__global uchar* dst_ptr, int dst_step, int dst_offset, int dst_rows, int dst_cols,
__global float2* twiddles_ptr, const int t, const int nz)
{
const int x = get_global_id(0);
const int y = get_group_id(1);
const int block_size = LOCAL_SIZE/kercn;
if (y < nz)
{
__local float2 smem[LOCAL_SIZE];
__global const float2* twiddles = (__global float2*) twiddles_ptr;
const int ind = x;
#ifdef IS_1D
float scale = 1.f/dst_cols;
#else
float scale = 1.f/(dst_cols*dst_rows);
#endif
#ifdef COMPLEX_INPUT
__global const float2* src = (__global const float2*)(src_ptr + mad24(y, src_step, mad24(x, (int)(sizeof(float)*2), src_offset)));
#pragma unroll
for (int i=0; i<kercn; i++)
smem[x+i*block_size] = src[i*block_size];
#else
__global const float* src = (__global const float*)(src_ptr + mad24(y, src_step, mad24(x, (int)sizeof(float), src_offset)));
#pragma unroll
for (int i=0; i<kercn; i++)
smem[x+i*block_size] = (float2)(src[i*block_size], 0.f);
#endif
barrier(CLK_LOCAL_MEM_FENCE);
RADIX_PROCESS;
#ifdef COMPLEX_OUTPUT
#ifdef NO_CONJUGATE
// copy result without complex conjugate
const int cols = dst_cols/2 + 1;
#else
const int cols = dst_cols;
#endif
__global float2* dst = (__global float2*)(dst_ptr + mad24(y, dst_step, dst_offset));
#pragma unroll
for (int i=x; i<cols; i+=block_size)
dst[i] = SCALE_VAL(smem[i], scale);
#else
// pack row to CCS
__local float* smem_1cn = (__local float*) smem;
__global float* dst = (__global float*)(dst_ptr + mad24(y, dst_step, dst_offset));
for (int i=x; i<dst_cols-1; i+=block_size)
dst[i+1] = SCALE_VAL(smem_1cn[i+2], scale);
if (x == 0)
dst[0] = SCALE_VAL(smem_1cn[0], scale);
#endif
}
else
{
// fill with zero other rows
#ifdef COMPLEX_OUTPUT
__global float2* dst = (__global float2*)(dst_ptr + mad24(y, dst_step, dst_offset));
#else
__global float* dst = (__global float*)(dst_ptr + mad24(y, dst_step, dst_offset));
#endif
#pragma unroll
for (int i=x; i<dst_cols; i+=block_size)
dst[i] = 0.f;
}
}
__kernel void fft_multi_radix_cols(__global const uchar* src_ptr, int src_step, int src_offset, int src_rows, int src_cols,
__global uchar* dst_ptr, int dst_step, int dst_offset, int dst_rows, int dst_cols,
__global float2* twiddles_ptr, const int t, const int nz)
{
const int x = get_group_id(0);
const int y = get_global_id(1);
if (x < nz)
{
__local float2 smem[LOCAL_SIZE];
__global const uchar* src = src_ptr + mad24(y, src_step, mad24(x, (int)(sizeof(float)*2), src_offset));
__global const float2* twiddles = (__global float2*) twiddles_ptr;
const int ind = y;
const int block_size = LOCAL_SIZE/kercn;
float scale = 1.f/(dst_rows*dst_cols);
#pragma unroll
for (int i=0; i<kercn; i++)
smem[y+i*block_size] = *((__global const float2*)(src + i*block_size*src_step));
barrier(CLK_LOCAL_MEM_FENCE);
RADIX_PROCESS;
#ifdef COMPLEX_OUTPUT
__global uchar* dst = dst_ptr + mad24(y, dst_step, mad24(x, (int)(sizeof(float)*2), dst_offset));
#pragma unroll
for (int i=0; i<kercn; i++)
*((__global float2*)(dst + i*block_size*dst_step)) = SCALE_VAL(smem[y + i*block_size], scale);
#else
if (x == 0)
{
// pack first column to CCS
__local float* smem_1cn = (__local float*) smem;
__global uchar* dst = dst_ptr + mad24(y+1, dst_step, dst_offset);
for (int i=y; i<dst_rows-1; i+=block_size, dst+=dst_step*block_size)
*((__global float*) dst) = SCALE_VAL(smem_1cn[i+2], scale);
if (y == 0)
*((__global float*) (dst_ptr + dst_offset)) = SCALE_VAL(smem_1cn[0], scale);
}
else if (x == (dst_cols+1)/2)
{
// pack last column to CCS (if needed)
__local float* smem_1cn = (__local float*) smem;
__global uchar* dst = dst_ptr + mad24(dst_cols-1, (int)sizeof(float), mad24(y+1, dst_step, dst_offset));
for (int i=y; i<dst_rows-1; i+=block_size, dst+=dst_step*block_size)
*((__global float*) dst) = SCALE_VAL(smem_1cn[i+2], scale);
if (y == 0)
*((__global float*) (dst_ptr + mad24(dst_cols-1, (int)sizeof(float), dst_offset))) = SCALE_VAL(smem_1cn[0], scale);
}
else
{
__global uchar* dst = dst_ptr + mad24(x, (int)sizeof(float)*2, mad24(y, dst_step, dst_offset - (int)sizeof(float)));
#pragma unroll
for (int i=y; i<dst_rows; i+=block_size, dst+=block_size*dst_step)
vstore2(SCALE_VAL(smem[i], scale), 0, (__global float*) dst);
}
#endif
}
}
__kernel void ifft_multi_radix_rows(__global const uchar* src_ptr, int src_step, int src_offset, int src_rows, int src_cols,
__global uchar* dst_ptr, int dst_step, int dst_offset, int dst_rows, int dst_cols,
__global float2* twiddles_ptr, const int t, const int nz)
{
const int x = get_global_id(0);
const int y = get_group_id(1);
const int block_size = LOCAL_SIZE/kercn;
#ifdef IS_1D
const float scale = 1.f/dst_cols;
#else
const float scale = 1.f/(dst_cols*dst_rows);
#endif
if (y < nz)
{
__local float2 smem[LOCAL_SIZE];
__global const float2* twiddles = (__global float2*) twiddles_ptr;
const int ind = x;
#if defined(COMPLEX_INPUT) && !defined(NO_CONJUGATE)
__global const float2* src = (__global const float2*)(src_ptr + mad24(y, src_step, mad24(x, (int)(sizeof(float)*2), src_offset)));
#pragma unroll
for (int i=0; i<kercn; i++)
{
smem[x+i*block_size].x = src[i*block_size].x;
smem[x+i*block_size].y = -src[i*block_size].y;
}
#else
#if !defined(REAL_INPUT) && defined(NO_CONJUGATE)
__global const float2* src = (__global const float2*)(src_ptr + mad24(y, src_step, mad24(2, (int)sizeof(float), src_offset)));
#pragma unroll
for (int i=x; i<(LOCAL_SIZE-1)/2; i+=block_size)
{
smem[i+1].x = src[i].x;
smem[i+1].y = -src[i].y;
smem[LOCAL_SIZE-i-1] = src[i];
}
#else
#pragma unroll
for (int i=x; i<(LOCAL_SIZE-1)/2; i+=block_size)
{
float2 src = vload2(0, (__global const float*)(src_ptr + mad24(y, src_step, mad24(2*i+1, (int)sizeof(float), src_offset))));
smem[i+1].x = src.x;
smem[i+1].y = -src.y;
smem[LOCAL_SIZE-i-1] = src;
}
#endif
if (x==0)
{
smem[0].x = *(__global const float*)(src_ptr + mad24(y, src_step, src_offset));
smem[0].y = 0.f;
if(LOCAL_SIZE % 2 ==0)
{
#if !defined(REAL_INPUT) && defined(NO_CONJUGATE)
smem[LOCAL_SIZE/2].x = src[LOCAL_SIZE/2-1].x;
#else
smem[LOCAL_SIZE/2].x = *(__global const float*)(src_ptr + mad24(y, src_step, mad24(LOCAL_SIZE-1, (int)sizeof(float), src_offset)));
#endif
smem[LOCAL_SIZE/2].y = 0.f;
}
}
#endif
barrier(CLK_LOCAL_MEM_FENCE);
RADIX_PROCESS;
// copy data to dst
#ifdef COMPLEX_OUTPUT
__global float2* dst = (__global float*)(dst_ptr + mad24(y, dst_step, mad24(x, (int)(sizeof(float)*2), dst_offset)));
#pragma unroll
for (int i=0; i<kercn; i++)
{
dst[i*block_size].x = SCALE_VAL(smem[x + i*block_size].x, scale);
dst[i*block_size].y = SCALE_VAL(-smem[x + i*block_size].y, scale);
}
#else
__global float* dst = (__global float*)(dst_ptr + mad24(y, dst_step, mad24(x, (int)(sizeof(float)), dst_offset)));
#pragma unroll
for (int i=0; i<kercn; i++)
{
dst[i*block_size] = SCALE_VAL(smem[x + i*block_size].x, scale);
}
#endif
}
else
{
// fill with zero other rows
#ifdef COMPLEX_OUTPUT
__global float2* dst = (__global float2*)(dst_ptr + mad24(y, dst_step, dst_offset));
#else
__global float* dst = (__global float*)(dst_ptr + mad24(y, dst_step, dst_offset));
#endif
#pragma unroll
for (int i=x; i<dst_cols; i+=block_size)
dst[i] = 0.f;
}
}
__kernel void ifft_multi_radix_cols(__global const uchar* src_ptr, int src_step, int src_offset, int src_rows, int src_cols,
__global uchar* dst_ptr, int dst_step, int dst_offset, int dst_rows, int dst_cols,
__global float2* twiddles_ptr, const int t, const int nz)
{
const int x = get_group_id(0);
const int y = get_global_id(1);
#ifdef COMPLEX_INPUT
if (x < nz)
{
__local float2 smem[LOCAL_SIZE];
__global const uchar* src = src_ptr + mad24(y, src_step, mad24(x, (int)(sizeof(float)*2), src_offset));
__global uchar* dst = dst_ptr + mad24(y, dst_step, mad24(x, (int)(sizeof(float)*2), dst_offset));
__global const float2* twiddles = (__global float2*) twiddles_ptr;
const int ind = y;
const int block_size = LOCAL_SIZE/kercn;
#pragma unroll
for (int i=0; i<kercn; i++)
{
float2 temp = *((__global const float2*)(src + i*block_size*src_step));
smem[y+i*block_size].x = temp.x;
smem[y+i*block_size].y = -temp.y;
}
barrier(CLK_LOCAL_MEM_FENCE);
RADIX_PROCESS;
// copy data to dst
#pragma unroll
for (int i=0; i<kercn; i++)
{
__global float2* res = (__global float2*)(dst + i*block_size*dst_step);
res[0].x = smem[y + i*block_size].x;
res[0].y = -smem[y + i*block_size].y;
}
}
#else
if (x < nz)
{
__global const float2* twiddles = (__global float2*) twiddles_ptr;
const int ind = y;
const int block_size = LOCAL_SIZE/kercn;
__local float2 smem[LOCAL_SIZE];
#ifdef EVEN
if (x!=0 && (x!=(nz-1)))
#else
if (x!=0)
#endif
{
__global const uchar* src = src_ptr + mad24(y, src_step, mad24(2*x-1, (int)sizeof(float), src_offset));
#pragma unroll
for (int i=0; i<kercn; i++)
{
float2 temp = vload2(0, (__global const float*)(src + i*block_size*src_step));
smem[y+i*block_size].x = temp.x;
smem[y+i*block_size].y = -temp.y;
}
}
else
{
int ind = x==0 ? 0: 2*x-1;
__global const float* src = (__global const float*)(src_ptr + mad24(1, src_step, mad24(ind, (int)sizeof(float), src_offset)));
int step = src_step/(int)sizeof(float);
#pragma unroll
for (int i=y; i<(LOCAL_SIZE-1)/2; i+=block_size)
{
smem[i+1].x = src[2*i*step];
smem[i+1].y = -src[(2*i+1)*step];
smem[LOCAL_SIZE-i-1].x = src[2*i*step];;
smem[LOCAL_SIZE-i-1].y = src[(2*i+1)*step];
}
if (y==0)
{
smem[0].x = *(__global const float*)(src_ptr + mad24(ind, (int)sizeof(float), src_offset));
smem[0].y = 0.f;
if(LOCAL_SIZE % 2 ==0)
{
smem[LOCAL_SIZE/2].x = src[(LOCAL_SIZE-2)*step];
smem[LOCAL_SIZE/2].y = 0.f;
}
}
}
barrier(CLK_LOCAL_MEM_FENCE);
RADIX_PROCESS;
// copy data to dst
__global uchar* dst = dst_ptr + mad24(y, dst_step, mad24(x, (int)(sizeof(float2)), dst_offset));
#pragma unroll
for (int i=0; i<kercn; i++)
{
__global float2* res = (__global float2*)(dst + i*block_size*dst_step);
res[0].x = smem[y + i*block_size].x;
res[0].y = -smem[y + i*block_size].y;
}
}
#endif
}

View File

@@ -36,114 +36,118 @@
#if lcn == 1
#if dcn == 4
#define LUT_OP(num)\
int idx = *(__global const int *)(srcptr + mad24(num, src_step, src_index));\
dst = (__global dstT *)(dstptr + mad24(num, dst_step, dst_index));\
dst[0] = lut_l[idx & 0xff];\
dst[1] = lut_l[(idx >> 8) & 0xff];\
dst[2] = lut_l[(idx >> 16) & 0xff];\
#define LUT_OP \
int idx = *(__global const int *)(srcptr + src_index); \
dst = (__global dstT *)(dstptr + dst_index); \
dst[0] = lut_l[idx & 0xff]; \
dst[1] = lut_l[(idx >> 8) & 0xff]; \
dst[2] = lut_l[(idx >> 16) & 0xff]; \
dst[3] = lut_l[(idx >> 24) & 0xff];
#elif dcn == 3
#define LUT_OP(num)\
uchar3 idx = vload3(0, srcptr + mad24(num, src_step, src_index));\
dst = (__global dstT *)(dstptr + mad24(num, dst_step, dst_index));\
dst[0] = lut_l[idx.x];\
dst[1] = lut_l[idx.y];\
#define LUT_OP \
uchar3 idx = vload3(0, srcptr + src_index); \
dst = (__global dstT *)(dstptr + dst_index); \
dst[0] = lut_l[idx.x]; \
dst[1] = lut_l[idx.y]; \
dst[2] = lut_l[idx.z];
#elif dcn == 2
#define LUT_OP(num)\
short idx = *(__global const short *)(srcptr + mad24(num, src_step, src_index));\
dst = (__global dstT *)(dstptr + mad24(num, dst_step, dst_index));\
dst[0] = lut_l[idx & 0xff];\
#define LUT_OP \
short idx = *(__global const short *)(srcptr + src_index); \
dst = (__global dstT *)(dstptr + dst_index); \
dst[0] = lut_l[idx & 0xff]; \
dst[1] = lut_l[(idx >> 8) & 0xff];
#elif dcn == 1
#define LUT_OP(num)\
uchar idx = (srcptr + mad24(num, src_step, src_index))[0];\
dst = (__global dstT *)(dstptr + mad24(num, dst_step, dst_index));\
#define LUT_OP \
uchar idx = (srcptr + src_index)[0]; \
dst = (__global dstT *)(dstptr + dst_index); \
dst[0] = lut_l[idx];
#else
#define LUT_OP(num)\
__global const srcT * src = (__global const srcT *)(srcptr + mad24(num, src_step, src_index));\
dst = (__global dstT *)(dstptr + mad24(num, dst_step, dst_index));\
for (int cn = 0; cn < dcn; ++cn)\
#define LUT_OP \
__global const srcT * src = (__global const srcT *)(srcptr + src_index); \
dst = (__global dstT *)(dstptr + dst_index); \
for (int cn = 0; cn < dcn; ++cn) \
dst[cn] = lut_l[src[cn]];
#endif
#else
#if dcn == 4
#define LUT_OP(num)\
__global const uchar4 *src_pixel = (__global const uchar4 *)(srcptr + mad24(num, src_step, src_index));\
int4 idx = convert_int4(src_pixel[0]) * lcn + (int4)(0, 1, 2, 3);\
dst = (__global dstT *)(dstptr + mad24(num, dst_step, dst_index));\
dst[0] = lut_l[idx.x];\
dst[1] = lut_l[idx.y];\
dst[2] = lut_l[idx.z];\
#define LUT_OP \
__global const uchar4 * src_pixel = (__global const uchar4 *)(srcptr + src_index); \
int4 idx = mad24(convert_int4(src_pixel[0]), (int4)(lcn), (int4)(0, 1, 2, 3)); \
dst = (__global dstT *)(dstptr + dst_index); \
dst[0] = lut_l[idx.x]; \
dst[1] = lut_l[idx.y]; \
dst[2] = lut_l[idx.z]; \
dst[3] = lut_l[idx.w];
#elif dcn == 3
#define LUT_OP(num)\
uchar3 src_pixel = vload3(0, srcptr + mad24(num, src_step, src_index));\
int3 idx = convert_int3(src_pixel) * lcn + (int3)(0, 1, 2);\
dst = (__global dstT *)(dstptr + mad24(num, dst_step, dst_index));\
dst[0] = lut_l[idx.x];\
dst[1] = lut_l[idx.y];\
#define LUT_OP \
uchar3 src_pixel = vload3(0, srcptr + src_index); \
int3 idx = mad24(convert_int3(src_pixel), (int3)(lcn), (int3)(0, 1, 2)); \
dst = (__global dstT *)(dstptr + dst_index); \
dst[0] = lut_l[idx.x]; \
dst[1] = lut_l[idx.y]; \
dst[2] = lut_l[idx.z];
#elif dcn == 2
#define LUT_OP(num)\
__global const uchar2 *src_pixel = (__global const uchar2 *)(srcptr + mad24(num, src_step, src_index));\
int2 idx = convert_int2(src_pixel[0]) * lcn + (int2)(0, 1);\
dst = (__global dstT *)(dstptr + mad24(num, dst_step, dst_index));\
dst[0] = lut_l[idx.x];\
#define LUT_OP \
__global const uchar2 * src_pixel = (__global const uchar2 *)(srcptr + src_index); \
int2 idx = mad24(convert_int2(src_pixel[0]), lcn, (int2)(0, 1)); \
dst = (__global dstT *)(dstptr + dst_index); \
dst[0] = lut_l[idx.x]; \
dst[1] = lut_l[idx.y];
#elif dcn == 1 //error case (1 < lcn) ==> lcn == scn == dcn
#define LUT_OP(num)\
uchar idx = (srcptr + mad24(num, src_step, src_index))[0];\
dst = (__global dstT *)(dstptr + mad24(num, dst_step, dst_index));\
#define LUT_OP \
uchar idx = (srcptr + src_index)[0]; \
dst = (__global dstT *)(dstptr + dst_index); \
dst[0] = lut_l[idx];
#else
#define LUT_OP(num)\
__global const srcT *src = (__global const srcT *)(srcptr + mad24(num, src_step, src_index));\
dst = (__global dstT *)(dstptr + mad24(num, dst_step, dst_index));\
for (int cn = 0; cn < dcn; ++cn)\
#define LUT_OP \
__global const srcT * src = (__global const srcT *)(srcptr + src_index); \
dst = (__global dstT *)(dstptr + dst_index); \
for (int cn = 0; cn < dcn; ++cn) \
dst[cn] = lut_l[mad24(src[cn], lcn, cn)];
#endif
#endif
#define LOCAL_LUT_INIT\
{\
__global const dstT * lut = (__global const dstT *)(lutptr + lut_offset);\
int init = mad24((int)get_local_id(1), (int)get_local_size(0), (int)get_local_id(0));\
int step = get_local_size(0) * get_local_size(1);\
for (int i = init; i < 256 * lcn; i += step)\
{\
lut_l[i] = lut[i];\
}\
barrier(CLK_LOCAL_MEM_FENCE);\
}
__kernel void LUT(__global const uchar * srcptr, int src_step, int src_offset,
__global const uchar * lutptr, int lut_step, int lut_offset,
__global uchar * dstptr, int dst_step, int dst_offset, int rows, int cols)
{
__local dstT lut_l[256 * lcn];
LOCAL_LUT_INIT;
int x = get_global_id(0);
int y = 4 * get_global_id(1);
int y = get_global_id(1) << 2;
__local dstT lut_l[256 * lcn];
__global const dstT * lut = (__global const dstT *)(lutptr + lut_offset);
for (int i = mad24((int)get_local_id(1), (int)get_local_size(0), (int)get_local_id(0)),
step = get_local_size(0) * get_local_size(1); i < 256 * lcn; i += step)
lut_l[i] = lut[i];
barrier(CLK_LOCAL_MEM_FENCE);
if (x < cols && y < rows)
{
int src_index = mad24(y, src_step, mad24(x, (int)sizeof(srcT) * dcn, src_offset));
int dst_index = mad24(y, dst_step, mad24(x, (int)sizeof(dstT) * dcn, dst_offset));
__global dstT * dst;
LUT_OP(0);
LUT_OP;
if (y < rows - 1)
{
LUT_OP(1);
src_index += src_step;
dst_index += dst_step;
LUT_OP;
if (y < rows - 2)
{
LUT_OP(2);
src_index += src_step;
dst_index += dst_step;
LUT_OP;
if (y < rows - 3)
{
LUT_OP(3);
src_index += src_step;
dst_index += dst_step;
LUT_OP;
}
}
}

View File

@@ -42,9 +42,13 @@
#if wdepth <= 4
#define MIN_ABS(a) convertFromU(abs(a))
#define MIN_ABS2(a, b) convertFromU(abs_diff(a, b))
#define MIN(a, b) min(a, b)
#define MAX(a, b) max(a, b)
#else
#define MIN_ABS(a) fabs(a)
#define MIN_ABS2(a, b) fabs(a - b)
#define MIN(a, b) fmin(a, b)
#define MAX(a, b) fmax(a, b)
#endif
#if kercn != 3
@@ -60,44 +64,41 @@
#define srcTSIZE (int)sizeof(srcT1)
#endif
#ifdef NEED_MINLOC
#define CALC_MINLOC(inc) minloc = id + inc
#else
#define CALC_MINLOC(inc)
#endif
#ifdef NEED_MAXLOC
#define CALC_MAXLOC(inc) maxloc = id + inc
#else
#define CALC_MAXLOC(inc)
#endif
#ifdef NEED_MINVAL
#ifdef NEED_MINLOC
#define CALC_MIN(p, inc) \
if (minval > temp.p) \
{ \
minval = temp.p; \
CALC_MINLOC(inc); \
minloc = id + inc; \
}
#else
#define CALC_MIN(p, inc) \
minval = MIN(minval, temp.p);
#endif
#else
#define CALC_MIN(p, inc)
#endif
#ifdef NEED_MAXVAL
#ifdef NEED_MAXLOC
#define CALC_MAX(p, inc) \
if (maxval < temp.p) \
{ \
maxval = temp.p; \
CALC_MAXLOC(inc); \
maxloc = id + inc; \
}
#else
#define CALC_MAX(p, inc) \
maxval = MAX(maxval, temp.p);
#endif
#else
#define CALC_MAX(p, inc)
#endif
#ifdef OP_CALC2
#define CALC_MAX2(p) \
if (maxval2 < temp.p) \
maxval2 = temp.p;
maxval2 = MAX(maxval2, temp.p);
#else
#define CALC_MAX2(p)
#endif
@@ -208,25 +209,28 @@ __kernel void minmaxloc(__global const uchar * srcptr, int src_step, int src_off
#if kercn == 1
#ifdef NEED_MINVAL
#ifdef NEED_MINLOC
if (minval > temp)
{
minval = temp;
#ifdef NEED_MINLOC
minloc = id;
#endif
}
#else
minval = MIN(minval, temp);
#endif
#endif
#ifdef NEED_MAXVAL
#ifdef NEED_MAXLOC
if (maxval < temp)
{
maxval = temp;
#ifdef NEED_MAXLOC
maxloc = id;
#endif
}
#else
maxval = MAX(maxval, temp);
#endif
#ifdef OP_CALC2
if (maxval2 < temp2)
maxval2 = temp2;
maxval2 = MAX(maxval2, temp2);
#endif
#endif
#elif kercn >= 2
@@ -282,32 +286,35 @@ __kernel void minmaxloc(__global const uchar * srcptr, int src_step, int src_off
{
int lid3 = lid - WGS2_ALIGNED;
#ifdef NEED_MINVAL
#ifdef NEED_MINLOC
if (localmem_min[lid3] >= minval)
{
#ifdef NEED_MINLOC
if (localmem_min[lid3] == minval)
localmem_minloc[lid3] = min(localmem_minloc[lid3], minloc);
else
localmem_minloc[lid3] = minloc,
#endif
localmem_min[lid3] = minval;
localmem_min[lid3] = minval;
}
#else
localmem_min[lid3] = MIN(localmem_min[lid3], minval);
#endif
#endif
#ifdef NEED_MAXVAL
#ifdef NEED_MAXLOC
if (localmem_max[lid3] <= maxval)
{
#ifdef NEED_MAXLOC
if (localmem_max[lid3] == maxval)
localmem_maxloc[lid3] = min(localmem_maxloc[lid3], maxloc);
else
localmem_maxloc[lid3] = maxloc,
#endif
localmem_max[lid3] = maxval;
localmem_max[lid3] = maxval;
}
#else
localmem_max[lid3] = MAX(localmem_max[lid3], maxval);
#endif
#endif
#ifdef OP_CALC2
if (localmem_max2[lid3] < maxval2)
localmem_max2[lid3] = maxval2;
localmem_max2[lid3] = MAX(localmem_max2[lid3], maxval2);
#endif
}
barrier(CLK_LOCAL_MEM_FENCE);
@@ -319,32 +326,35 @@ __kernel void minmaxloc(__global const uchar * srcptr, int src_step, int src_off
int lid2 = lsize + lid;
#ifdef NEED_MINVAL
#ifdef NEED_MINLOC
if (localmem_min[lid] >= localmem_min[lid2])
{
#ifdef NEED_MINLOC
if (localmem_min[lid] == localmem_min[lid2])
localmem_minloc[lid] = min(localmem_minloc[lid2], localmem_minloc[lid]);
else
localmem_minloc[lid] = localmem_minloc[lid2],
#endif
localmem_min[lid] = localmem_min[lid2];
localmem_min[lid] = localmem_min[lid2];
}
#else
localmem_min[lid] = MIN(localmem_min[lid], localmem_min[lid2]);
#endif
#endif
#ifdef NEED_MAXVAL
#ifdef NEED_MAXLOC
if (localmem_max[lid] <= localmem_max[lid2])
{
#ifdef NEED_MAXLOC
if (localmem_max[lid] == localmem_max[lid2])
localmem_maxloc[lid] = min(localmem_maxloc[lid2], localmem_maxloc[lid]);
else
localmem_maxloc[lid] = localmem_maxloc[lid2],
#endif
localmem_max[lid] = localmem_max[lid2];
localmem_max[lid] = localmem_max[lid2];
}
#else
localmem_max[lid] = MAX(localmem_max[lid], localmem_max[lid2]);
#endif
#endif
#ifdef OP_CALC2
if (localmem_max2[lid] < localmem_max2[lid2])
localmem_max2[lid] = localmem_max2[lid2];
localmem_max2[lid] = MAX(localmem_max2[lid], localmem_max2[lid2]);
#endif
}
barrier(CLK_LOCAL_MEM_FENCE);

View File

@@ -379,7 +379,7 @@
#define REDUCE_GLOBAL \
dstTK temp = convertToDT(loadpix(srcptr + src_index)); \
dstTK temp2 = convertToDT(loadpix(src2ptr + src2_index)); \
temp = SUM_ABS2(temp, temp2)); \
temp = SUM_ABS2(temp, temp2); \
FUNC(accumulator, temp.s0); \
FUNC(accumulator, temp.s1); \
FUNC(accumulator, temp.s2); \

View File

@@ -81,29 +81,34 @@
#define PROCESS_ELEM(acc, value) acc += value
#elif defined OCL_CV_REDUCE_MAX
#define INIT_VALUE MIN_VAL
#define PROCESS_ELEM(acc, value) acc = value > acc ? value : acc
#define PROCESS_ELEM(acc, value) acc = max(value, acc)
#elif defined OCL_CV_REDUCE_MIN
#define INIT_VALUE MAX_VAL
#define PROCESS_ELEM(acc, value) acc = value < acc ? value : acc
#define PROCESS_ELEM(acc, value) acc = min(value, acc)
#else
#error "No operation is specified"
#endif
#ifdef OP_REDUCE_PRE
__kernel void reduce_horz_pre(__global const uchar * srcptr, int src_step, int src_offset, int rows, int cols,
__global uchar * bufptr, int buf_step, int buf_offset)
__kernel void reduce_horz_opt(__global const uchar * srcptr, int src_step, int src_offset, int rows, int cols,
__global uchar * dstptr, int dst_step, int dst_offset
#ifdef OCL_CV_REDUCE_AVG
, float fscale
#endif
)
{
__local bufT lsmem[TILE_HEIGHT][BUF_COLS][cn];
int x = get_global_id(0);
int y = get_global_id(1);
if (x < BUF_COLS)
int liy = get_local_id(1);
if ((x < BUF_COLS) && (y < rows))
{
int src_index = mad24(y, src_step, mad24(x, (int)sizeof(srcT) * cn, src_offset));
int buf_index = mad24(y, buf_step, mad24(x, (int)sizeof(dstT) * cn, buf_offset));
__global const srcT * src = (__global const srcT *)(srcptr + src_index);
__global dstT * buf = (__global dstT *)(bufptr + buf_index);
dstT tmp[cn] = { INIT_VALUE };
bufT tmp[cn] = { INIT_VALUE };
int src_step_mul = BUF_COLS * cn;
for (int idx = x; idx < cols; idx += BUF_COLS, src += src_step_mul)
@@ -111,14 +116,49 @@ __kernel void reduce_horz_pre(__global const uchar * srcptr, int src_step, int s
#pragma unroll
for (int c = 0; c < cn; ++c)
{
dstT value = convertToDT(src[c]);
bufT value = convertToBufT(src[c]);
PROCESS_ELEM(tmp[c], value);
}
}
#pragma unroll
for (int c = 0; c < cn; ++c)
buf[c] = tmp[c];
lsmem[liy][x][c] = tmp[c];
}
barrier(CLK_LOCAL_MEM_FENCE);
if ((x < BUF_COLS / 2) && (y < rows))
{
#pragma unroll
for (int c = 0; c < cn; ++c)
{
PROCESS_ELEM(lsmem[liy][x][c], lsmem[liy][x + BUF_COLS / 2][c]);
}
}
barrier(CLK_LOCAL_MEM_FENCE);
if ((x == 0) && (y < rows))
{
int dst_index = mad24(y, dst_step, dst_offset);
__global dstT * dst = (__global dstT *)(dstptr + dst_index);
bufT tmp[cn] = { INIT_VALUE };
#pragma unroll
for (int xin = 0; xin < BUF_COLS / 2; xin ++)
{
#pragma unroll
for (int c = 0; c < cn; ++c)
{
PROCESS_ELEM(tmp[c], lsmem[liy][xin][c]);
}
}
#pragma unroll
for (int c = 0; c < cn; ++c)
#ifdef OCL_CV_REDUCE_AVG
dst[c] = convertToDT(convertToWT(tmp[c]) * fscale);
#else
dst[c] = convertToDT(tmp[c]);
#endif
}
}

View File

@@ -43,20 +43,18 @@
//
//M*/
#if cn != 3
#define loadpix(addr) *(__global const T *)(addr)
#if kercn != 3
#define storepix(val, addr) *(__global T *)(addr) = val
#define TSIZE (int)sizeof(T)
#define scalar scalar_
#else
#define loadpix(addr) vload3(0, (__global const T1 *)(addr))
#define storepix(val, addr) vstore3(val, 0, (__global T1 *)(addr))
#define TSIZE ((int)sizeof(T1)*3)
#define scalar (T)(scalar_.x, scalar_.y, scalar_.z)
#endif
__kernel void setIdentity(__global uchar * srcptr, int src_step, int src_offset, int rows, int cols,
ST scalar_, int rowsPerWI)
ST scalar_)
{
int x = get_global_id(0);
int y0 = get_global_id(1) * rowsPerWI;
@@ -65,7 +63,35 @@ __kernel void setIdentity(__global uchar * srcptr, int src_step, int src_offset,
{
int src_index = mad24(y0, src_step, mad24(x, TSIZE, src_offset));
for (int y = y0, y1 = min(rows, y0 + rowsPerWI); y < y1; ++y, src_index += src_step)
storepix(x == y ? scalar : (T)(0), srcptr + src_index);
#if kercn == cn
#pragma unroll
for (int y = y0, i = 0, y1 = min(rows, y0 + rowsPerWI); i < rowsPerWI; ++y, ++i, src_index += src_step)
if (y < y1)
storepix(x == y ? scalar : (T)(0), srcptr + src_index);
#elif kercn == 4 && cn == 1
if (y0 < rows)
{
storepix(x == y0 >> 2 ? (T)(scalar, 0, 0, 0) : (T)(0), srcptr + src_index);
if (++y0 < rows)
{
src_index += src_step;
storepix(x == y0 >> 2 ? (T)(0, scalar, 0, 0) : (T)(0), srcptr + src_index);
if (++y0 < rows)
{
src_index += src_step;
storepix(x == y0 >> 2 ? (T)(0, 0, scalar, 0) : (T)(0), srcptr + src_index);
if (++y0 < rows)
{
src_index += src_step;
storepix(x == y0 >> 2 ? (T)(0, 0, 0, scalar) : (T)(0), srcptr + src_index);
}
}
}
}
#else
#error "Incorrect combination of cn && kercn"
#endif
}
}

View File

@@ -479,9 +479,10 @@ static bool ocl_sum( InputArray _src, Scalar & res, int sum_op, InputArray _mask
haveMask = _mask.kind() != _InputArray::NONE,
haveSrc2 = _src2.kind() != _InputArray::NONE;
int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type),
kercn = cn == 1 && !haveMask ? ocl::predictOptimalVectorWidth(_src) : 1,
kercn = cn == 1 && !haveMask ? ocl::predictOptimalVectorWidth(_src, _src2) : 1,
mcn = std::max(cn, kercn);
CV_Assert(!haveSrc2 || _src2.type() == type);
int convert_cn = haveSrc2 ? mcn : cn;
if ( (!doubleSupport && depth == CV_64F) || cn > 4 )
return false;
@@ -513,7 +514,7 @@ static bool ocl_sum( InputArray _src, Scalar & res, int sum_op, InputArray _mask
haveMask && _mask.isContinuous() ? " -D HAVE_MASK_CONT" : "", kercn,
haveSrc2 ? " -D HAVE_SRC2" : "", calc2 ? " -D OP_CALC2" : "",
haveSrc2 && _src2.isContinuous() ? " -D HAVE_SRC2_CONT" : "",
depth <= CV_32S && ddepth == CV_32S ? ocl::convertTypeStr(CV_8U, ddepth, mcn, cvt[1]) : "noconvert");
depth <= CV_32S && ddepth == CV_32S ? ocl::convertTypeStr(CV_8U, ddepth, convert_cn, cvt[1]) : "noconvert");
ocl::Kernel k("reduce", ocl::core::reduce_oclsrc, opts);
if (k.empty())
@@ -918,8 +919,14 @@ static bool ocl_meanStdDev( InputArray _src, OutputArray _mean, OutputArray _sdv
int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0,
isContinuous = _src.isContinuous();
int groups = ocl::Device::getDefault().maxComputeUnits();
size_t wgs = ocl::Device::getDefault().maxWorkGroupSize();
const ocl::Device &defDev = ocl::Device::getDefault();
int groups = defDev.maxComputeUnits();
if (defDev.isIntel())
{
static const int subSliceEUCount = 10;
groups = (groups / subSliceEUCount) * 2;
}
size_t wgs = defDev.maxWorkGroupSize();
int ddepth = std::max(CV_32S, depth), sqddepth = std::max(CV_32F, depth),
dtype = CV_MAKE_TYPE(ddepth, cn),
@@ -1445,6 +1452,9 @@ static bool ocl_minMaxIdx( InputArray _src, double* minVal, double* maxVal, int*
CV_Assert(!haveSrc2 || _src2.type() == type);
if (depth == CV_32S || depth == CV_32F)
return false;
if ((depth == CV_64F || ddepth == CV_64F) && !doubleSupport)
return false;
@@ -2178,6 +2188,9 @@ static bool ocl_norm( InputArray _src, int normType, InputArray _mask, double &
(!doubleSupport && depth == CV_64F))
return false;
if( depth == CV_32F && (!_mask.empty() || normType == NORM_INF) )
return false;
UMat src = _src.getUMat();
if (normType == NORM_INF)
@@ -2270,7 +2283,7 @@ double cv::norm( InputArray _src, int normType, InputArray _mask )
setIppErrorStatus();
}
typedef IppStatus (CV_STDCALL* ippiMaskNormFuncC3)(const void *, int, const void *, int, IppiSize, int, Ipp64f *);
/*typedef IppStatus (CV_STDCALL* ippiMaskNormFuncC3)(const void *, int, const void *, int, IppiSize, int, Ipp64f *);
ippiMaskNormFuncC3 ippFuncC3 =
normType == NORM_INF ?
(type == CV_8UC3 ? (ippiMaskNormFuncC3)ippiNorm_Inf_8u_C3CMR :
@@ -2305,7 +2318,7 @@ double cv::norm( InputArray _src, int normType, InputArray _mask )
return normType == NORM_L2SQR ? (double)(norm * norm) : (double)norm;
}
setIppErrorStatus();
}
}*/
}
else
{
@@ -2533,7 +2546,7 @@ static bool ocl_norm( InputArray _src1, InputArray _src2, int normType, InputArr
normType &= ~NORM_RELATIVE;
bool normsum = normType == NORM_L1 || normType == NORM_L2 || normType == NORM_L2SQR;
if ( !(normType == NORM_INF || normsum) )
if ( !normsum || !_mask.empty() )
return false;
if (normsum)
@@ -2711,7 +2724,7 @@ double cv::norm( InputArray _src1, InputArray _src2, int normType, InputArray _m
0) :
normType == NORM_L1 ?
(type == CV_8UC1 ? (ippiMaskNormDiffFuncC1)ippiNormDiff_L1_8u_C1MR :
type == CV_8SC1 ? (ippiMaskNormDiffFuncC1)ippiNormDiff_L1_8s_C1MR :
//type == CV_8SC1 ? (ippiMaskNormDiffFuncC1)ippiNormDiff_L1_8s_C1MR :
type == CV_16UC1 ? (ippiMaskNormDiffFuncC1)ippiNormDiff_L1_16u_C1MR :
type == CV_32FC1 ? (ippiMaskNormDiffFuncC1)ippiNormDiff_L1_32f_C1MR :
0) :
@@ -2728,7 +2741,7 @@ double cv::norm( InputArray _src1, InputArray _src2, int normType, InputArray _m
return normType == NORM_L2SQR ? (double)(norm * norm) : (double)norm;
setIppErrorStatus();
}
typedef IppStatus (CV_STDCALL* ippiMaskNormDiffFuncC3)(const void *, int, const void *, int, const void *, int, IppiSize, int, Ipp64f *);
/*typedef IppStatus (CV_STDCALL* ippiMaskNormDiffFuncC3)(const void *, int, const void *, int, const void *, int, IppiSize, int, Ipp64f *);
ippiMaskNormDiffFuncC3 ippFuncC3 =
normType == NORM_INF ?
(type == CV_8UC3 ? (ippiMaskNormDiffFuncC3)ippiNormDiff_Inf_8u_C3CMR :
@@ -2763,7 +2776,7 @@ double cv::norm( InputArray _src1, InputArray _src2, int normType, InputArray _m
return normType == NORM_L2SQR ? (double)(norm * norm) : (double)norm;
}
setIppErrorStatus();
}
}*/
}
else
{

View File

@@ -829,7 +829,7 @@ OCL_TEST_P(Pow, Mat)
{
static const double pows[] = { -4, -1, -2.5, 0, 1, 2, 3.7, 4 };
for (int j = 0; j < test_loop_times; j++)
for (int j = 0; j < 1/*test_loop_times*/; j++)
for (int k = 0, size = sizeof(pows) / sizeof(double); k < size; ++k)
{
SCOPED_TRACE(pows[k]);
@@ -1203,7 +1203,7 @@ OCL_TEST_P(MinMaxIdx_Mask, Mat)
static bool relativeError(double actual, double expected, double eps)
{
return std::abs(actual - expected) / actual < eps;
return std::abs(actual - expected) < eps*(1 + std::abs(actual));
}
typedef ArithmTestBase Norm;
@@ -1230,7 +1230,7 @@ OCL_TEST_P(Norm, NORM_INF_1arg_mask)
OCL_OFF(const double cpuRes = cv::norm(src1_roi, NORM_INF, mask_roi));
OCL_ON(const double gpuRes = cv::norm(usrc1_roi, NORM_INF, umask_roi));
EXPECT_NEAR(cpuRes, gpuRes, 0.1);
EXPECT_NEAR(cpuRes, gpuRes, 0.2);
}
}
@@ -1302,7 +1302,7 @@ OCL_TEST_P(Norm, NORM_INF_2args)
OCL_OFF(const double cpuRes = cv::norm(src1_roi, src2_roi, type));
OCL_ON(const double gpuRes = cv::norm(usrc1_roi, usrc2_roi, type));
EXPECT_NEAR(cpuRes, gpuRes, 0.1);
EXPECT_NEAR(cpuRes, gpuRes, 0.2);
}
}

View File

@@ -48,17 +48,26 @@
#ifdef HAVE_OPENCL
enum OCL_FFT_TYPE
{
R2R = 0,
C2R = 1,
R2C = 2,
C2C = 3
};
namespace cvtest {
namespace ocl {
////////////////////////////////////////////////////////////////////////////
// Dft
PARAM_TEST_CASE(Dft, cv::Size, MatDepth, bool, bool, bool, bool)
PARAM_TEST_CASE(Dft, cv::Size, OCL_FFT_TYPE, bool, bool, bool, bool)
{
cv::Size dft_size;
int dft_flags, depth;
bool inplace;
int dft_flags, depth, cn, dft_type;
bool hint;
bool is1d;
TEST_DECLARE_INPUT_PARAMETER(src);
TEST_DECLARE_OUTPUT_PARAMETER(dst);
@@ -66,34 +75,50 @@ PARAM_TEST_CASE(Dft, cv::Size, MatDepth, bool, bool, bool, bool)
virtual void SetUp()
{
dft_size = GET_PARAM(0);
depth = GET_PARAM(1);
inplace = GET_PARAM(2);
dft_type = GET_PARAM(1);
depth = CV_32F;
dft_flags = 0;
switch (dft_type)
{
case R2R: dft_flags |= cv::DFT_REAL_OUTPUT; cn = 1; break;
case C2R: dft_flags |= cv::DFT_REAL_OUTPUT; cn = 2; break;
case R2C: dft_flags |= cv::DFT_COMPLEX_OUTPUT; cn = 1; break;
case C2C: dft_flags |= cv::DFT_COMPLEX_OUTPUT; cn = 2; break;
}
if (GET_PARAM(2))
dft_flags |= cv::DFT_INVERSE;
if (GET_PARAM(3))
dft_flags |= cv::DFT_ROWS;
if (GET_PARAM(4))
dft_flags |= cv::DFT_SCALE;
if (GET_PARAM(5))
dft_flags |= cv::DFT_INVERSE;
hint = GET_PARAM(5);
is1d = (dft_flags & DFT_ROWS) != 0 || dft_size.height == 1;
}
void generateTestData(int cn = 2)
void generateTestData()
{
src = randomMat(dft_size, CV_MAKE_TYPE(depth, cn), 0.0, 100.0);
usrc = src.getUMat(ACCESS_READ);
if (inplace)
dst = src, udst = usrc;
}
};
OCL_TEST_P(Dft, C2C)
OCL_TEST_P(Dft, Mat)
{
generateTestData();
OCL_OFF(cv::dft(src, dst, dft_flags | cv::DFT_COMPLEX_OUTPUT));
OCL_ON(cv::dft(usrc, udst, dft_flags | cv::DFT_COMPLEX_OUTPUT));
int nonzero_rows = hint ? src.cols - randomInt(1, src.rows-1) : 0;
OCL_OFF(cv::dft(src, dst, dft_flags, nonzero_rows));
OCL_ON(cv::dft(usrc, udst, dft_flags, nonzero_rows));
// In case forward R2C 1d tranform dst contains only half of output
// without complex conjugate
if (dft_type == R2C && is1d && (dft_flags & cv::DFT_INVERSE) == 0)
{
dst = dst(cv::Range(0, dst.rows), cv::Range(0, dst.cols/2 + 1));
udst = udst(cv::Range(0, udst.rows), cv::Range(0, udst.cols/2 + 1));
}
double eps = src.size().area() * 1e-4;
EXPECT_MAT_NEAR(dst, udst, eps);
@@ -150,15 +175,15 @@ OCL_TEST_P(MulSpectrums, Mat)
OCL_INSTANTIATE_TEST_CASE_P(OCL_ImgProc, MulSpectrums, testing::Combine(Bool(), Bool()));
OCL_INSTANTIATE_TEST_CASE_P(Core, Dft, Combine(Values(cv::Size(2, 3), cv::Size(5, 4), cv::Size(25, 20),
cv::Size(512, 1), cv::Size(1024, 768)),
Values(CV_32F, CV_64F),
Bool(), // inplace
OCL_INSTANTIATE_TEST_CASE_P(Core, Dft, Combine(Values(cv::Size(10, 10), cv::Size(36, 36), cv::Size(512, 1), cv::Size(1280, 768)),
Values((OCL_FFT_TYPE) R2C, (OCL_FFT_TYPE) C2C, (OCL_FFT_TYPE) R2R, (OCL_FFT_TYPE) C2R),
Bool(), // DFT_INVERSE
Bool(), // DFT_ROWS
Bool(), // DFT_SCALE
Bool()) // DFT_INVERSE
Bool() // hint
)
);
} } // namespace cvtest::ocl
#endif // HAVE_OPENCL
#endif // HAVE_OPENCL