Merge remote-tracking branch 'origin/master' into merge-2.4
This commit is contained in:
@@ -2981,7 +2981,7 @@ The class provides the following features for all derived classes:
|
||||
|
||||
* so called "virtual constructor". That is, each Algorithm derivative is registered at program start and you can get the list of registered algorithms and create instance of a particular algorithm by its name (see ``Algorithm::create``). If you plan to add your own algorithms, it is good practice to add a unique prefix to your algorithms to distinguish them from other algorithms.
|
||||
|
||||
* setting/retrieving algorithm parameters by name. If you used video capturing functionality from OpenCV highgui module, you are probably familar with ``cvSetCaptureProperty()``, ``cvGetCaptureProperty()``, ``VideoCapture::set()`` and ``VideoCapture::get()``. ``Algorithm`` provides similar method where instead of integer id's you specify the parameter names as text strings. See ``Algorithm::set`` and ``Algorithm::get`` for details.
|
||||
* setting/retrieving algorithm parameters by name. If you used video capturing functionality from OpenCV videoio module, you are probably familar with ``cvSetCaptureProperty()``, ``cvGetCaptureProperty()``, ``VideoCapture::set()`` and ``VideoCapture::get()``. ``Algorithm`` provides similar method where instead of integer id's you specify the parameter names as text strings. See ``Algorithm::set`` and ``Algorithm::get`` for details.
|
||||
|
||||
* reading and writing parameters from/to XML or YAML files. Every Algorithm derivative can store all its parameters and then read them back. There is no need to re-implement it each time.
|
||||
|
||||
|
||||
@@ -361,6 +361,37 @@ The function ``line`` draws the line segment between ``pt1`` and ``pt2`` points
|
||||
Antialiased lines are drawn using Gaussian filtering.
|
||||
|
||||
|
||||
arrowedLine
|
||||
----------------
|
||||
Draws a arrow segment pointing from the first point to the second one.
|
||||
|
||||
.. ocv:function:: void arrowedLine(InputOutputArray img, Point pt1, Point pt2, const Scalar& color, int thickness=1, int lineType=8, int shift=0, double tipLength=0.1)
|
||||
|
||||
:param img: Image.
|
||||
|
||||
:param pt1: The point the arrow starts from.
|
||||
|
||||
:param pt2: The point the arrow points to.
|
||||
|
||||
:param color: Line color.
|
||||
|
||||
:param thickness: Line thickness.
|
||||
|
||||
:param lineType: Type of the line:
|
||||
|
||||
* **8** (or omitted) - 8-connected line.
|
||||
|
||||
* **4** - 4-connected line.
|
||||
|
||||
* **CV_AA** - antialiased line.
|
||||
|
||||
:param shift: Number of fractional bits in the point coordinates.
|
||||
|
||||
:param tipLength: The length of the arrow tip in relation to the arrow length
|
||||
|
||||
The function ``arrowedLine`` draws an arrow between ``pt1`` and ``pt2`` points in the image. See also :ocv:func:`line`.
|
||||
|
||||
|
||||
LineIterator
|
||||
------------
|
||||
.. ocv:class:: LineIterator
|
||||
|
||||
@@ -14,7 +14,8 @@ OpenCV has a modular structure, which means that the package includes several sh
|
||||
* **calib3d** - basic multiple-view geometry algorithms, single and stereo camera calibration, object pose estimation, stereo correspondence algorithms, and elements of 3D reconstruction.
|
||||
* **features2d** - salient feature detectors, descriptors, and descriptor matchers.
|
||||
* **objdetect** - detection of objects and instances of the predefined classes (for example, faces, eyes, mugs, people, cars, and so on).
|
||||
* **highgui** - an easy-to-use interface to video capturing, image and video codecs, as well as simple UI capabilities.
|
||||
* **highgui** - an easy-to-use interface to simple UI capabilities.
|
||||
* **videoio** - an easy-to-use interface to video capturing and video codecs.
|
||||
* **gpu** - GPU-accelerated algorithms from different OpenCV modules.
|
||||
* ... some other helper modules, such as FLANN and Google test wrappers, Python bindings, and others.
|
||||
|
||||
|
||||
@@ -510,6 +510,10 @@ CV_EXPORTS_W void randShuffle(InputOutputArray dst, double iterFactor = 1., RNG*
|
||||
CV_EXPORTS_W void line(InputOutputArray img, Point pt1, Point pt2, const Scalar& color,
|
||||
int thickness = 1, int lineType = LINE_8, int shift = 0);
|
||||
|
||||
//! draws an arrow from pt1 to pt2 in the image
|
||||
CV_EXPORTS_W void arrowedLine(InputOutputArray img, Point pt1, Point pt2, const Scalar& color,
|
||||
int thickness=1, int line_type=8, int shift=0, double tipLength=0.1);
|
||||
|
||||
//! draws the rectangle outline or a solid rectangle with the opposite corners pt1 and pt2 in the image
|
||||
CV_EXPORTS_W void rectangle(InputOutputArray img, Point pt1, Point pt2,
|
||||
const Scalar& color, int thickness = 1,
|
||||
|
||||
@@ -244,6 +244,7 @@ typedef signed char schar;
|
||||
|
||||
/* fundamental constants */
|
||||
#define CV_PI 3.1415926535897932384626433832795
|
||||
#define CV_2PI 6.283185307179586476925286766559
|
||||
#define CV_LOG2 0.69314718055994530941723212145818
|
||||
|
||||
/****************************************************************************************\
|
||||
|
||||
@@ -360,7 +360,7 @@ struct CV_EXPORTS UMatData
|
||||
{
|
||||
enum { COPY_ON_MAP=1, HOST_COPY_OBSOLETE=2,
|
||||
DEVICE_COPY_OBSOLETE=4, TEMP_UMAT=8, TEMP_COPIED_UMAT=24,
|
||||
USER_ALLOCATED=32 };
|
||||
USER_ALLOCATED=32, DEVICE_MEM_MAPPED=64};
|
||||
UMatData(const MatAllocator* allocator);
|
||||
~UMatData();
|
||||
|
||||
@@ -370,11 +370,13 @@ struct CV_EXPORTS UMatData
|
||||
|
||||
bool hostCopyObsolete() const;
|
||||
bool deviceCopyObsolete() const;
|
||||
bool deviceMemMapped() const;
|
||||
bool copyOnMap() const;
|
||||
bool tempUMat() const;
|
||||
bool tempCopiedUMat() const;
|
||||
void markHostCopyObsolete(bool flag);
|
||||
void markDeviceCopyObsolete(bool flag);
|
||||
void markDeviceMemMapped(bool flag);
|
||||
|
||||
const MatAllocator* prevAllocator;
|
||||
const MatAllocator* currAllocator;
|
||||
|
||||
@@ -3351,10 +3351,19 @@ size_t UMat::total() const
|
||||
|
||||
inline bool UMatData::hostCopyObsolete() const { return (flags & HOST_COPY_OBSOLETE) != 0; }
|
||||
inline bool UMatData::deviceCopyObsolete() const { return (flags & DEVICE_COPY_OBSOLETE) != 0; }
|
||||
inline bool UMatData::deviceMemMapped() const { return (flags & DEVICE_MEM_MAPPED) != 0; }
|
||||
inline bool UMatData::copyOnMap() const { return (flags & COPY_ON_MAP) != 0; }
|
||||
inline bool UMatData::tempUMat() const { return (flags & TEMP_UMAT) != 0; }
|
||||
inline bool UMatData::tempCopiedUMat() const { return (flags & TEMP_COPIED_UMAT) == TEMP_COPIED_UMAT; }
|
||||
|
||||
inline void UMatData::markDeviceMemMapped(bool flag)
|
||||
{
|
||||
if(flag)
|
||||
flags |= DEVICE_MEM_MAPPED;
|
||||
else
|
||||
flags &= ~DEVICE_MEM_MAPPED;
|
||||
}
|
||||
|
||||
inline void UMatData::markHostCopyObsolete(bool flag)
|
||||
{
|
||||
if(flag)
|
||||
|
||||
@@ -54,23 +54,42 @@ namespace ocl {
|
||||
|
||||
///////////// dft ////////////////////////
|
||||
|
||||
typedef tuple<Size, int> DftParams;
|
||||
enum OCL_FFT_TYPE
|
||||
{
|
||||
R2R = 0,
|
||||
C2R = 1,
|
||||
R2C = 2,
|
||||
C2C = 3
|
||||
};
|
||||
|
||||
typedef tuple<OCL_FFT_TYPE, Size, int> DftParams;
|
||||
typedef TestBaseWithParam<DftParams> DftFixture;
|
||||
|
||||
OCL_PERF_TEST_P(DftFixture, Dft, ::testing::Combine(Values(OCL_SIZE_1, OCL_SIZE_2, OCL_SIZE_3),
|
||||
Values((int)DFT_ROWS, (int)DFT_SCALE, (int)DFT_INVERSE,
|
||||
(int)DFT_INVERSE | DFT_SCALE, (int)DFT_ROWS | DFT_INVERSE)))
|
||||
OCL_PERF_TEST_P(DftFixture, Dft, ::testing::Combine(Values(C2C, R2R, C2R, R2C),
|
||||
Values(OCL_SIZE_1, OCL_SIZE_2, OCL_SIZE_3, Size(512, 512), Size(1024, 1024), Size(2048, 2048)),
|
||||
Values((int) 0, (int)DFT_ROWS, (int)DFT_SCALE, (int)DFT_INVERSE,
|
||||
(int)DFT_INVERSE | DFT_SCALE, (int)DFT_ROWS | DFT_INVERSE)))
|
||||
{
|
||||
const DftParams params = GetParam();
|
||||
const Size srcSize = get<0>(params);
|
||||
const int flags = get<1>(params);
|
||||
const int dft_type = get<0>(params);
|
||||
const Size srcSize = get<1>(params);
|
||||
int flags = get<2>(params);
|
||||
|
||||
UMat src(srcSize, CV_32FC2), dst(srcSize, CV_32FC2);
|
||||
int in_cn, out_cn;
|
||||
switch (dft_type)
|
||||
{
|
||||
case R2R: flags |= cv::DFT_REAL_OUTPUT; in_cn = 1; out_cn = 1; break;
|
||||
case C2R: flags |= cv::DFT_REAL_OUTPUT; in_cn = 2; out_cn = 2; break;
|
||||
case R2C: flags |= cv::DFT_COMPLEX_OUTPUT; in_cn = 1; out_cn = 2; break;
|
||||
case C2C: flags |= cv::DFT_COMPLEX_OUTPUT; in_cn = 2; out_cn = 2; break;
|
||||
}
|
||||
|
||||
UMat src(srcSize, CV_MAKE_TYPE(CV_32F, in_cn)), dst(srcSize, CV_MAKE_TYPE(CV_32F, out_cn));
|
||||
declare.in(src, WARMUP_RNG).out(dst);
|
||||
|
||||
OCL_TEST_CYCLE() cv::dft(src, dst, flags | DFT_COMPLEX_OUTPUT);
|
||||
OCL_TEST_CYCLE() cv::dft(src, dst, flags);
|
||||
|
||||
SANITY_CHECK(dst, 1e-3);
|
||||
SANITY_CHECK(dst, 1e-5, ERROR_RELATIVE);
|
||||
}
|
||||
|
||||
///////////// MulSpectrums ////////////////////////
|
||||
|
||||
@@ -139,6 +139,7 @@ OCL_PERF_TEST_P(CopyToFixture, CopyToWithMaskUninit,
|
||||
dst.release();
|
||||
startTimer();
|
||||
src.copyTo(dst, mask);
|
||||
cv::ocl::finish();
|
||||
stopTimer();
|
||||
}
|
||||
|
||||
|
||||
@@ -54,21 +54,23 @@ namespace cv
|
||||
|
||||
struct NOP {};
|
||||
|
||||
#if CV_SSE2
|
||||
#if CV_SSE2 || CV_NEON
|
||||
|
||||
#define FUNCTOR_TEMPLATE(name) \
|
||||
template<typename T> struct name {}
|
||||
|
||||
FUNCTOR_TEMPLATE(VLoadStore128);
|
||||
#if CV_SSE2
|
||||
FUNCTOR_TEMPLATE(VLoadStore64);
|
||||
FUNCTOR_TEMPLATE(VLoadStore128Aligned);
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
||||
template<typename T, class Op, class VOp>
|
||||
void vBinOp(const T* src1, size_t step1, const T* src2, size_t step2, T* dst, size_t step, Size sz)
|
||||
{
|
||||
#if CV_SSE2
|
||||
#if CV_SSE2 || CV_NEON
|
||||
VOp vop;
|
||||
#endif
|
||||
Op op;
|
||||
@@ -79,9 +81,11 @@ void vBinOp(const T* src1, size_t step1, const T* src2, size_t step2, T* dst, si
|
||||
{
|
||||
int x = 0;
|
||||
|
||||
#if CV_NEON || CV_SSE2
|
||||
#if CV_SSE2
|
||||
if( USE_SSE2 )
|
||||
{
|
||||
#endif
|
||||
for( ; x <= sz.width - 32/(int)sizeof(T); x += 32/sizeof(T) )
|
||||
{
|
||||
typename VLoadStore128<T>::reg_type r0 = VLoadStore128<T>::load(src1 + x );
|
||||
@@ -91,8 +95,10 @@ void vBinOp(const T* src1, size_t step1, const T* src2, size_t step2, T* dst, si
|
||||
VLoadStore128<T>::store(dst + x , r0);
|
||||
VLoadStore128<T>::store(dst + x + 16/sizeof(T), r1);
|
||||
}
|
||||
#if CV_SSE2
|
||||
}
|
||||
#endif
|
||||
#endif
|
||||
#if CV_SSE2
|
||||
if( USE_SSE2 )
|
||||
{
|
||||
@@ -125,7 +131,7 @@ template<typename T, class Op, class Op32>
|
||||
void vBinOp32(const T* src1, size_t step1, const T* src2, size_t step2,
|
||||
T* dst, size_t step, Size sz)
|
||||
{
|
||||
#if CV_SSE2
|
||||
#if CV_SSE2 || CV_NEON
|
||||
Op32 op32;
|
||||
#endif
|
||||
Op op;
|
||||
@@ -153,9 +159,11 @@ void vBinOp32(const T* src1, size_t step1, const T* src2, size_t step2,
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if CV_NEON || CV_SSE2
|
||||
#if CV_SSE2
|
||||
if( USE_SSE2 )
|
||||
{
|
||||
#endif
|
||||
for( ; x <= sz.width - 8; x += 8 )
|
||||
{
|
||||
typename VLoadStore128<T>::reg_type r0 = VLoadStore128<T>::load(src1 + x );
|
||||
@@ -165,8 +173,10 @@ void vBinOp32(const T* src1, size_t step1, const T* src2, size_t step2,
|
||||
VLoadStore128<T>::store(dst + x , r0);
|
||||
VLoadStore128<T>::store(dst + x + 4, r1);
|
||||
}
|
||||
#if CV_SSE2
|
||||
}
|
||||
#endif
|
||||
#endif
|
||||
#if CV_ENABLE_UNROLLED
|
||||
for( ; x <= sz.width - 4; x += 4 )
|
||||
{
|
||||
@@ -383,7 +393,98 @@ FUNCTOR_TEMPLATE(VNot);
|
||||
FUNCTOR_CLOSURE_1arg(VNot, uchar, return _mm_xor_si128(_mm_set1_epi32(-1), a));
|
||||
#endif
|
||||
|
||||
#if CV_SSE2
|
||||
#if CV_NEON
|
||||
|
||||
#define FUNCTOR_LOADSTORE(name, template_arg, register_type, load_body, store_body)\
|
||||
template <> \
|
||||
struct name<template_arg>{ \
|
||||
typedef register_type reg_type; \
|
||||
static reg_type load(const template_arg * p) { return load_body (p);}; \
|
||||
static void store(template_arg * p, reg_type v) { store_body (p, v);}; \
|
||||
}
|
||||
|
||||
#define FUNCTOR_CLOSURE_2arg(name, template_arg, body)\
|
||||
template<> \
|
||||
struct name<template_arg> \
|
||||
{ \
|
||||
VLoadStore128<template_arg>::reg_type operator()( \
|
||||
VLoadStore128<template_arg>::reg_type a, \
|
||||
VLoadStore128<template_arg>::reg_type b) const \
|
||||
{ \
|
||||
return body; \
|
||||
}; \
|
||||
}
|
||||
|
||||
#define FUNCTOR_CLOSURE_1arg(name, template_arg, body)\
|
||||
template<> \
|
||||
struct name<template_arg> \
|
||||
{ \
|
||||
VLoadStore128<template_arg>::reg_type operator()( \
|
||||
VLoadStore128<template_arg>::reg_type a, \
|
||||
VLoadStore128<template_arg>::reg_type ) const \
|
||||
{ \
|
||||
return body; \
|
||||
}; \
|
||||
}
|
||||
|
||||
FUNCTOR_LOADSTORE(VLoadStore128, uchar, uint8x16_t, vld1q_u8 , vst1q_u8 );
|
||||
FUNCTOR_LOADSTORE(VLoadStore128, schar, int8x16_t, vld1q_s8 , vst1q_s8 );
|
||||
FUNCTOR_LOADSTORE(VLoadStore128, ushort, uint16x8_t, vld1q_u16, vst1q_u16);
|
||||
FUNCTOR_LOADSTORE(VLoadStore128, short, int16x8_t, vld1q_s16, vst1q_s16);
|
||||
FUNCTOR_LOADSTORE(VLoadStore128, int, int32x4_t, vld1q_s32, vst1q_s32);
|
||||
FUNCTOR_LOADSTORE(VLoadStore128, float, float32x4_t, vld1q_f32, vst1q_f32);
|
||||
|
||||
FUNCTOR_TEMPLATE(VAdd);
|
||||
FUNCTOR_CLOSURE_2arg(VAdd, uchar, vqaddq_u8 (a, b));
|
||||
FUNCTOR_CLOSURE_2arg(VAdd, schar, vqaddq_s8 (a, b));
|
||||
FUNCTOR_CLOSURE_2arg(VAdd, ushort, vqaddq_u16(a, b));
|
||||
FUNCTOR_CLOSURE_2arg(VAdd, short, vqaddq_s16(a, b));
|
||||
FUNCTOR_CLOSURE_2arg(VAdd, int, vaddq_s32 (a, b));
|
||||
FUNCTOR_CLOSURE_2arg(VAdd, float, vaddq_f32 (a, b));
|
||||
|
||||
FUNCTOR_TEMPLATE(VSub);
|
||||
FUNCTOR_CLOSURE_2arg(VSub, uchar, vqsubq_u8 (a, b));
|
||||
FUNCTOR_CLOSURE_2arg(VSub, schar, vqsubq_s8 (a, b));
|
||||
FUNCTOR_CLOSURE_2arg(VSub, ushort, vqsubq_u16(a, b));
|
||||
FUNCTOR_CLOSURE_2arg(VSub, short, vqsubq_s16(a, b));
|
||||
FUNCTOR_CLOSURE_2arg(VSub, int, vsubq_s32 (a, b));
|
||||
FUNCTOR_CLOSURE_2arg(VSub, float, vsubq_f32 (a, b));
|
||||
|
||||
FUNCTOR_TEMPLATE(VMin);
|
||||
FUNCTOR_CLOSURE_2arg(VMin, uchar, vminq_u8 (a, b));
|
||||
FUNCTOR_CLOSURE_2arg(VMin, schar, vminq_s8 (a, b));
|
||||
FUNCTOR_CLOSURE_2arg(VMin, ushort, vminq_u16(a, b));
|
||||
FUNCTOR_CLOSURE_2arg(VMin, short, vminq_s16(a, b));
|
||||
FUNCTOR_CLOSURE_2arg(VMin, int, vminq_s32(a, b));
|
||||
FUNCTOR_CLOSURE_2arg(VMin, float, vminq_f32(a, b));
|
||||
|
||||
FUNCTOR_TEMPLATE(VMax);
|
||||
FUNCTOR_CLOSURE_2arg(VMax, uchar, vmaxq_u8 (a, b));
|
||||
FUNCTOR_CLOSURE_2arg(VMax, schar, vmaxq_s8 (a, b));
|
||||
FUNCTOR_CLOSURE_2arg(VMax, ushort, vmaxq_u16(a, b));
|
||||
FUNCTOR_CLOSURE_2arg(VMax, short, vmaxq_s16(a, b));
|
||||
FUNCTOR_CLOSURE_2arg(VMax, int, vmaxq_s32(a, b));
|
||||
FUNCTOR_CLOSURE_2arg(VMax, float, vmaxq_f32(a, b));
|
||||
|
||||
FUNCTOR_TEMPLATE(VAbsDiff);
|
||||
FUNCTOR_CLOSURE_2arg(VAbsDiff, uchar, vabdq_u8 (a, b));
|
||||
FUNCTOR_CLOSURE_2arg(VAbsDiff, schar, vqabsq_s8 (vqsubq_s8(a, b)));
|
||||
FUNCTOR_CLOSURE_2arg(VAbsDiff, ushort, vabdq_u16 (a, b));
|
||||
FUNCTOR_CLOSURE_2arg(VAbsDiff, short, vqabsq_s16(vqsubq_s16(a, b)));
|
||||
FUNCTOR_CLOSURE_2arg(VAbsDiff, int, vabdq_s32 (a, b));
|
||||
FUNCTOR_CLOSURE_2arg(VAbsDiff, float, vabdq_f32 (a, b));
|
||||
|
||||
FUNCTOR_TEMPLATE(VAnd);
|
||||
FUNCTOR_CLOSURE_2arg(VAnd, uchar, vandq_u8(a, b));
|
||||
FUNCTOR_TEMPLATE(VOr);
|
||||
FUNCTOR_CLOSURE_2arg(VOr , uchar, vorrq_u8(a, b));
|
||||
FUNCTOR_TEMPLATE(VXor);
|
||||
FUNCTOR_CLOSURE_2arg(VXor, uchar, veorq_u8(a, b));
|
||||
FUNCTOR_TEMPLATE(VNot);
|
||||
FUNCTOR_CLOSURE_1arg(VNot, uchar, vmvnq_u8(a ));
|
||||
#endif
|
||||
|
||||
#if CV_SSE2 || CV_NEON
|
||||
#define IF_SIMD(op) op
|
||||
#else
|
||||
#define IF_SIMD(op) NOP
|
||||
@@ -1390,6 +1491,9 @@ static bool ocl_arithm_op(InputArray _src1, InputArray _src2, OutputArray _dst,
|
||||
if (!doubleSupport && (depth2 == CV_64F || depth1 == CV_64F))
|
||||
return false;
|
||||
|
||||
if( (oclop == OCL_OP_MUL_SCALE || oclop == OCL_OP_DIV_SCALE) && (depth1 >= CV_32F || depth2 >= CV_32F || ddepth >= CV_32F) )
|
||||
return false;
|
||||
|
||||
int kercn = haveMask || haveScalar ? cn : ocl::predictOptimalVectorWidth(_src1, _src2, _dst);
|
||||
int scalarcn = kercn == 3 ? 4 : kercn, rowsPerWI = d.isIntel() ? 4 : 1;
|
||||
|
||||
@@ -2980,8 +3084,187 @@ void cv::compare(InputArray _src1, InputArray _src2, OutputArray _dst, int op)
|
||||
namespace cv
|
||||
{
|
||||
|
||||
template<typename T> static void
|
||||
inRange_(const T* src1, size_t step1, const T* src2, size_t step2,
|
||||
template <typename T>
|
||||
struct InRange_SSE
|
||||
{
|
||||
int operator () (const T *, const T *, const T *, uchar *, int) const
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
};
|
||||
|
||||
#if CV_SSE2
|
||||
|
||||
template <>
|
||||
struct InRange_SSE<uchar>
|
||||
{
|
||||
int operator () (const uchar * src1, const uchar * src2, const uchar * src3,
|
||||
uchar * dst, int len) const
|
||||
{
|
||||
int x = 0;
|
||||
|
||||
if (USE_SSE2)
|
||||
{
|
||||
__m128i v_full = _mm_set1_epi8(-1), v_128 = _mm_set1_epi8(-128);
|
||||
|
||||
for ( ; x <= len - 16; x += 16 )
|
||||
{
|
||||
__m128i v_src = _mm_add_epi8(_mm_loadu_si128((const __m128i *)(src1 + x)), v_128);
|
||||
__m128i v_mask1 = _mm_cmpgt_epi8(_mm_add_epi8(_mm_loadu_si128((const __m128i *)(src2 + x)), v_128), v_src);
|
||||
__m128i v_mask2 = _mm_cmpgt_epi8(v_src, _mm_add_epi8(_mm_loadu_si128((const __m128i *)(src3 + x)), v_128));
|
||||
_mm_storeu_si128((__m128i *)(dst + x), _mm_andnot_si128(_mm_or_si128(v_mask1, v_mask2), v_full));
|
||||
}
|
||||
}
|
||||
|
||||
return x;
|
||||
}
|
||||
};
|
||||
|
||||
template <>
|
||||
struct InRange_SSE<schar>
|
||||
{
|
||||
int operator () (const schar * src1, const schar * src2, const schar * src3,
|
||||
uchar * dst, int len) const
|
||||
{
|
||||
int x = 0;
|
||||
|
||||
if (USE_SSE2)
|
||||
{
|
||||
__m128i v_full = _mm_set1_epi8(-1);
|
||||
|
||||
for ( ; x <= len - 16; x += 16 )
|
||||
{
|
||||
__m128i v_src = _mm_loadu_si128((const __m128i *)(src1 + x));
|
||||
__m128i v_mask1 = _mm_cmpgt_epi8(_mm_loadu_si128((const __m128i *)(src2 + x)), v_src);
|
||||
__m128i v_mask2 = _mm_cmpgt_epi8(v_src, _mm_loadu_si128((const __m128i *)(src3 + x)));
|
||||
_mm_storeu_si128((__m128i *)(dst + x), _mm_andnot_si128(_mm_or_si128(v_mask1, v_mask2), v_full));
|
||||
}
|
||||
}
|
||||
|
||||
return x;
|
||||
}
|
||||
};
|
||||
|
||||
template <>
|
||||
struct InRange_SSE<ushort>
|
||||
{
|
||||
int operator () (const ushort * src1, const ushort * src2, const ushort * src3,
|
||||
uchar * dst, int len) const
|
||||
{
|
||||
int x = 0;
|
||||
|
||||
if (USE_SSE2)
|
||||
{
|
||||
__m128i v_zero = _mm_setzero_si128(), v_full = _mm_set1_epi16(-1), v_32768 = _mm_set1_epi16(-32768);
|
||||
|
||||
for ( ; x <= len - 8; x += 8 )
|
||||
{
|
||||
__m128i v_src = _mm_add_epi16(_mm_loadu_si128((const __m128i *)(src1 + x)), v_32768);
|
||||
__m128i v_mask1 = _mm_cmpgt_epi16(_mm_add_epi16(_mm_loadu_si128((const __m128i *)(src2 + x)), v_32768), v_src);
|
||||
__m128i v_mask2 = _mm_cmpgt_epi16(v_src, _mm_add_epi16(_mm_loadu_si128((const __m128i *)(src3 + x)), v_32768));
|
||||
__m128i v_res = _mm_andnot_si128(_mm_or_si128(v_mask1, v_mask2), v_full);
|
||||
_mm_storel_epi64((__m128i *)(dst + x), _mm_packus_epi16(_mm_srli_epi16(v_res, 8), v_zero));
|
||||
}
|
||||
}
|
||||
|
||||
return x;
|
||||
}
|
||||
};
|
||||
|
||||
template <>
|
||||
struct InRange_SSE<short>
|
||||
{
|
||||
int operator () (const short * src1, const short * src2, const short * src3,
|
||||
uchar * dst, int len) const
|
||||
{
|
||||
int x = 0;
|
||||
|
||||
if (USE_SSE2)
|
||||
{
|
||||
__m128i v_zero = _mm_setzero_si128(), v_full = _mm_set1_epi16(-1);
|
||||
|
||||
for ( ; x <= len - 8; x += 8 )
|
||||
{
|
||||
__m128i v_src = _mm_loadu_si128((const __m128i *)(src1 + x));
|
||||
__m128i v_mask1 = _mm_cmpgt_epi16(_mm_loadu_si128((const __m128i *)(src2 + x)), v_src);
|
||||
__m128i v_mask2 = _mm_cmpgt_epi16(v_src, _mm_loadu_si128((const __m128i *)(src3 + x)));
|
||||
__m128i v_res = _mm_andnot_si128(_mm_or_si128(v_mask1, v_mask2), v_full);
|
||||
_mm_storel_epi64((__m128i *)(dst + x), _mm_packus_epi16(_mm_srli_epi16(v_res, 8), v_zero));
|
||||
}
|
||||
}
|
||||
|
||||
return x;
|
||||
}
|
||||
};
|
||||
|
||||
template <>
|
||||
struct InRange_SSE<int>
|
||||
{
|
||||
int operator () (const int * src1, const int * src2, const int * src3,
|
||||
uchar * dst, int len) const
|
||||
{
|
||||
int x = 0;
|
||||
|
||||
if (USE_SSE2)
|
||||
{
|
||||
__m128i v_zero = _mm_setzero_si128(), v_full = _mm_set1_epi32(-1);
|
||||
|
||||
for ( ; x <= len - 8; x += 8 )
|
||||
{
|
||||
__m128i v_src = _mm_loadu_si128((const __m128i *)(src1 + x));
|
||||
__m128i v_res1 = _mm_or_si128(_mm_cmpgt_epi32(_mm_loadu_si128((const __m128i *)(src2 + x)), v_src),
|
||||
_mm_cmpgt_epi32(v_src, _mm_loadu_si128((const __m128i *)(src3 + x))));
|
||||
|
||||
v_src = _mm_loadu_si128((const __m128i *)(src1 + x + 4));
|
||||
__m128i v_res2 = _mm_or_si128(_mm_cmpgt_epi32(_mm_loadu_si128((const __m128i *)(src2 + x + 4)), v_src),
|
||||
_mm_cmpgt_epi32(v_src, _mm_loadu_si128((const __m128i *)(src3 + x + 4))));
|
||||
|
||||
__m128i v_res = _mm_packs_epi32(_mm_srli_epi32(_mm_andnot_si128(v_res1, v_full), 16),
|
||||
_mm_srli_epi32(_mm_andnot_si128(v_res2, v_full), 16));
|
||||
_mm_storel_epi64((__m128i *)(dst + x), _mm_packus_epi16(v_res, v_zero));
|
||||
}
|
||||
}
|
||||
|
||||
return x;
|
||||
}
|
||||
};
|
||||
|
||||
template <>
|
||||
struct InRange_SSE<float>
|
||||
{
|
||||
int operator () (const float * src1, const float * src2, const float * src3,
|
||||
uchar * dst, int len) const
|
||||
{
|
||||
int x = 0;
|
||||
|
||||
if (USE_SSE2)
|
||||
{
|
||||
__m128i v_zero = _mm_setzero_si128();
|
||||
|
||||
for ( ; x <= len - 8; x += 8 )
|
||||
{
|
||||
__m128 v_src = _mm_loadu_ps(src1 + x);
|
||||
__m128 v_res1 = _mm_and_ps(_mm_cmple_ps(_mm_loadu_ps(src2 + x), v_src),
|
||||
_mm_cmple_ps(v_src, _mm_loadu_ps(src3 + x)));
|
||||
|
||||
v_src = _mm_loadu_ps(src1 + x + 4);
|
||||
__m128 v_res2 = _mm_and_ps(_mm_cmple_ps(_mm_loadu_ps(src2 + x + 4), v_src),
|
||||
_mm_cmple_ps(v_src, _mm_loadu_ps(src3 + x + 4)));
|
||||
|
||||
__m128i v_res1i = _mm_cvtps_epi32(v_res1), v_res2i = _mm_cvtps_epi32(v_res2);
|
||||
__m128i v_res = _mm_packs_epi32(_mm_srli_epi32(v_res1i, 16), _mm_srli_epi32(v_res2i, 16));
|
||||
_mm_storel_epi64((__m128i *)(dst + x), _mm_packus_epi16(v_res, v_zero));
|
||||
}
|
||||
}
|
||||
|
||||
return x;
|
||||
}
|
||||
};
|
||||
|
||||
#endif
|
||||
|
||||
template <typename T>
|
||||
static void inRange_(const T* src1, size_t step1, const T* src2, size_t step2,
|
||||
const T* src3, size_t step3, uchar* dst, size_t step,
|
||||
Size size)
|
||||
{
|
||||
@@ -2989,9 +3272,11 @@ inRange_(const T* src1, size_t step1, const T* src2, size_t step2,
|
||||
step2 /= sizeof(src2[0]);
|
||||
step3 /= sizeof(src3[0]);
|
||||
|
||||
InRange_SSE<T> vop;
|
||||
|
||||
for( ; size.height--; src1 += step1, src2 += step2, src3 += step3, dst += step )
|
||||
{
|
||||
int x = 0;
|
||||
int x = vop(src1, src2, src3, dst, size.width);
|
||||
#if CV_ENABLE_UNROLLED
|
||||
for( ; x <= size.width - 4; x += 4 )
|
||||
{
|
||||
|
||||
@@ -1541,7 +1541,7 @@ static bool ocl_convertScaleAbs( InputArray _src, OutputArray _dst, double alpha
|
||||
kercn = ocl::predictOptimalVectorWidth(_src, _dst), rowsPerWI = d.isIntel() ? 4 : 1;
|
||||
bool doubleSupport = d.doubleFPConfig() > 0;
|
||||
|
||||
if (!doubleSupport && depth == CV_64F)
|
||||
if (depth == CV_32F || depth == CV_64F)
|
||||
return false;
|
||||
|
||||
char cvt[2][50];
|
||||
@@ -1729,22 +1729,18 @@ static bool ocl_LUT(InputArray _src, InputArray _lut, OutputArray _dst)
|
||||
UMat src = _src.getUMat(), lut = _lut.getUMat();
|
||||
_dst.create(src.size(), CV_MAKETYPE(ddepth, dcn));
|
||||
UMat dst = _dst.getUMat();
|
||||
bool bAligned = (1 == lcn) && (0 == (src.offset % 4)) && (0 == ((dcn * src.cols) % 4));
|
||||
// dst.cols == src.cols by params of dst.create
|
||||
int kercn = lcn == 1 ? std::min(4, ocl::predictOptimalVectorWidth(_dst)) : dcn;
|
||||
|
||||
ocl::Kernel k("LUT", ocl::core::lut_oclsrc,
|
||||
format("-D dcn=%d -D lcn=%d -D srcT=%s -D dstT=%s", bAligned ? 4 : dcn, lcn,
|
||||
ocl::typeToStr(src.depth()), ocl::memopTypeToStr(ddepth)
|
||||
));
|
||||
format("-D dcn=%d -D lcn=%d -D srcT=%s -D dstT=%s", kercn, lcn,
|
||||
ocl::typeToStr(src.depth()), ocl::memopTypeToStr(ddepth)));
|
||||
if (k.empty())
|
||||
return false;
|
||||
|
||||
int cols = bAligned ? dcn * dst.cols / 4 : dst.cols;
|
||||
|
||||
k.args(ocl::KernelArg::ReadOnlyNoSize(src), ocl::KernelArg::ReadOnlyNoSize(lut),
|
||||
ocl::KernelArg::WriteOnlyNoSize(dst), dst.rows, cols);
|
||||
ocl::KernelArg::WriteOnly(dst, dcn, kercn));
|
||||
|
||||
size_t globalSize[2] = { cols, (dst.rows + 3) / 4 };
|
||||
size_t globalSize[2] = { dst.cols * dcn / kercn, (dst.rows + 3) / 4 };
|
||||
return k.run(2, globalSize, NULL, false);
|
||||
}
|
||||
|
||||
|
||||
@@ -432,7 +432,7 @@ Mat& Mat::setTo(InputArray _value, InputArray _mask)
|
||||
|
||||
IppStatus status = (IppStatus)-1;
|
||||
IppiSize roisize = { cols, rows };
|
||||
int mstep = (int)mask.step, dstep = (int)step;
|
||||
int mstep = (int)mask.step[0], dstep = (int)step[0];
|
||||
|
||||
if (isContinuous() && mask.isContinuous())
|
||||
{
|
||||
@@ -616,7 +616,7 @@ static bool ocl_flip(InputArray _src, OutputArray _dst, int flipCode )
|
||||
{
|
||||
CV_Assert(flipCode >= -1 && flipCode <= 1);
|
||||
int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type),
|
||||
flipType, kercn = std::min(ocl::predictOptimalVectorWidth(_src, _dst), 4);;
|
||||
flipType, kercn = std::min(ocl::predictOptimalVectorWidth(_src, _dst), 4);
|
||||
|
||||
if (cn > 4)
|
||||
return false;
|
||||
@@ -631,7 +631,7 @@ static bool ocl_flip(InputArray _src, OutputArray _dst, int flipCode )
|
||||
|
||||
ocl::Device dev = ocl::Device::getDefault();
|
||||
int pxPerWIy = (dev.isIntel() && (dev.type() & ocl::Device::TYPE_GPU)) ? 4 : 1;
|
||||
kercn = std::max(kercn, cn);
|
||||
kercn = (cn!=3 || flipType == FLIP_ROWS) ? std::max(kercn, cn) : cn;
|
||||
|
||||
ocl::Kernel k(kernelName, ocl::core::flip_oclsrc,
|
||||
format( "-D T=%s -D T1=%s -D cn=%d -D PIX_PER_WI_Y=%d -D kercn=%d",
|
||||
@@ -762,7 +762,7 @@ void flip( InputArray _src, OutputArray _dst, int flip_mode )
|
||||
flipHoriz( dst.data, dst.step, dst.data, dst.step, dst.size(), esz );
|
||||
}
|
||||
|
||||
#ifdef HAVE_OPENCL
|
||||
/*#ifdef HAVE_OPENCL
|
||||
|
||||
static bool ocl_repeat(InputArray _src, int ny, int nx, OutputArray _dst)
|
||||
{
|
||||
@@ -790,7 +790,7 @@ static bool ocl_repeat(InputArray _src, int ny, int nx, OutputArray _dst)
|
||||
return k.run(2, globalsize, NULL, false);
|
||||
}
|
||||
|
||||
#endif
|
||||
#endif*/
|
||||
|
||||
void repeat(InputArray _src, int ny, int nx, OutputArray _dst)
|
||||
{
|
||||
@@ -800,8 +800,8 @@ void repeat(InputArray _src, int ny, int nx, OutputArray _dst)
|
||||
Size ssize = _src.size();
|
||||
_dst.create(ssize.height*ny, ssize.width*nx, _src.type());
|
||||
|
||||
CV_OCL_RUN(_dst.isUMat(),
|
||||
ocl_repeat(_src, ny, nx, _dst))
|
||||
/*CV_OCL_RUN(_dst.isUMat(),
|
||||
ocl_repeat(_src, ny, nx, _dst))*/
|
||||
|
||||
Mat src = _src.getMat(), dst = _dst.getMat();
|
||||
Size dsize = dst.size();
|
||||
|
||||
@@ -207,7 +207,6 @@ namespace
|
||||
MemoryStack* MemoryPool::getFreeMemStack()
|
||||
{
|
||||
AutoLock lock(mtx_);
|
||||
|
||||
if (!initialized_)
|
||||
initilizeImpl();
|
||||
|
||||
@@ -256,22 +255,31 @@ namespace
|
||||
|
||||
namespace
|
||||
{
|
||||
Mutex mtx_;
|
||||
bool memory_pool_manager_initialized;
|
||||
|
||||
class MemoryPoolManager
|
||||
{
|
||||
public:
|
||||
MemoryPoolManager();
|
||||
~MemoryPoolManager();
|
||||
void Init();
|
||||
|
||||
MemoryPool* getPool(int deviceId);
|
||||
|
||||
private:
|
||||
std::vector<MemoryPool> pools_;
|
||||
};
|
||||
} manager;
|
||||
|
||||
//MemoryPoolManager ;
|
||||
|
||||
MemoryPoolManager::MemoryPoolManager()
|
||||
{
|
||||
int deviceCount = getCudaEnabledDeviceCount();
|
||||
}
|
||||
|
||||
void MemoryPoolManager::Init()
|
||||
{
|
||||
int deviceCount = getCudaEnabledDeviceCount();
|
||||
if (deviceCount > 0)
|
||||
pools_.resize(deviceCount);
|
||||
}
|
||||
@@ -280,7 +288,7 @@ namespace
|
||||
{
|
||||
for (size_t i = 0; i < pools_.size(); ++i)
|
||||
{
|
||||
cudaSetDevice(i);
|
||||
cudaSetDevice(static_cast<int>(i));
|
||||
pools_[i].release();
|
||||
}
|
||||
}
|
||||
@@ -293,7 +301,14 @@ namespace
|
||||
|
||||
MemoryPool* memPool(int deviceId)
|
||||
{
|
||||
static MemoryPoolManager manager;
|
||||
{
|
||||
AutoLock lock(mtx_);
|
||||
if (!memory_pool_manager_initialized)
|
||||
{
|
||||
memory_pool_manager_initialized = true;
|
||||
manager.Init();
|
||||
}
|
||||
}
|
||||
return manager.getPool(deviceId);
|
||||
}
|
||||
}
|
||||
@@ -311,8 +326,10 @@ cv::cuda::StackAllocator::StackAllocator(cudaStream_t stream) : stream_(stream),
|
||||
if (enableMemoryPool)
|
||||
{
|
||||
const int deviceId = getDevice();
|
||||
memStack_ = memPool(deviceId)->getFreeMemStack();
|
||||
|
||||
{
|
||||
AutoLock lock(mtx_);
|
||||
memStack_ = memPool(deviceId)->getFreeMemStack();
|
||||
}
|
||||
DeviceInfo devInfo(deviceId);
|
||||
alignment_ = devInfo.textureAlignment();
|
||||
}
|
||||
|
||||
@@ -190,10 +190,22 @@ void cv::cuda::Stream::enqueueHostCallback(StreamCallback callback, void* userDa
|
||||
#endif
|
||||
}
|
||||
|
||||
namespace
|
||||
{
|
||||
bool default_stream_is_initialized;
|
||||
Mutex mtx;
|
||||
Ptr<Stream> default_stream;
|
||||
}
|
||||
|
||||
Stream& cv::cuda::Stream::Null()
|
||||
{
|
||||
static Stream s(Ptr<Impl>(new Impl(0)));
|
||||
return s;
|
||||
AutoLock lock(mtx);
|
||||
if (!default_stream_is_initialized)
|
||||
{
|
||||
default_stream = Ptr<Stream>(new Stream(Ptr<Impl>(new Impl(0))));
|
||||
default_stream_is_initialized = true;
|
||||
}
|
||||
return *default_stream;
|
||||
}
|
||||
|
||||
cv::cuda::Stream::operator bool_type() const
|
||||
|
||||
@@ -1584,6 +1584,24 @@ void line( InputOutputArray _img, Point pt1, Point pt2, const Scalar& color,
|
||||
ThickLine( img, pt1, pt2, buf, thickness, line_type, 3, shift );
|
||||
}
|
||||
|
||||
void arrowedLine(InputOutputArray img, Point pt1, Point pt2, const Scalar& color,
|
||||
int thickness, int line_type, int shift, double tipLength)
|
||||
{
|
||||
const double tipSize = norm(pt1-pt2)*tipLength; // Factor to normalize the size of the tip depending on the length of the arrow
|
||||
|
||||
line(img, pt1, pt2, color, thickness, line_type, shift);
|
||||
|
||||
const double angle = atan2( (double) pt1.y - pt2.y, (double) pt1.x - pt2.x );
|
||||
|
||||
Point p(cvRound(pt2.x + tipSize * cos(angle + CV_PI / 4)),
|
||||
cvRound(pt2.y + tipSize * sin(angle + CV_PI / 4)));
|
||||
line(img, p, pt2, color, thickness, line_type, shift);
|
||||
|
||||
p.x = cvRound(pt2.x + tipSize * cos(angle - CV_PI / 4));
|
||||
p.y = cvRound(pt2.y + tipSize * sin(angle - CV_PI / 4));
|
||||
line(img, p, pt2, color, thickness, line_type, shift);
|
||||
}
|
||||
|
||||
void rectangle( InputOutputArray _img, Point pt1, Point pt2,
|
||||
const Scalar& color, int thickness,
|
||||
int lineType, int shift )
|
||||
|
||||
@@ -43,6 +43,7 @@
|
||||
#include "opencv2/core/opencl/runtime/opencl_clamdfft.hpp"
|
||||
#include "opencv2/core/opencl/runtime/opencl_core.hpp"
|
||||
#include "opencl_kernels.hpp"
|
||||
#include <map>
|
||||
|
||||
namespace cv
|
||||
{
|
||||
@@ -1781,6 +1782,375 @@ static bool ippi_DFT_R_32F(const Mat& src, Mat& dst, bool inv, int norm_flag)
|
||||
#endif
|
||||
}
|
||||
|
||||
#ifdef HAVE_OPENCL
|
||||
|
||||
namespace cv
|
||||
{
|
||||
|
||||
enum FftType
|
||||
{
|
||||
R2R = 0, // real to CCS in case forward transform, CCS to real otherwise
|
||||
C2R = 1, // complex to real in case inverse transform
|
||||
R2C = 2, // real to complex in case forward transform
|
||||
C2C = 3 // complex to complex
|
||||
};
|
||||
|
||||
struct OCL_FftPlan
|
||||
{
|
||||
private:
|
||||
UMat twiddles;
|
||||
String buildOptions;
|
||||
int thread_count;
|
||||
bool status;
|
||||
int dft_size;
|
||||
|
||||
public:
|
||||
OCL_FftPlan(int _size): dft_size(_size), status(true)
|
||||
{
|
||||
int min_radix;
|
||||
std::vector<int> radixes, blocks;
|
||||
ocl_getRadixes(dft_size, radixes, blocks, min_radix);
|
||||
thread_count = dft_size / min_radix;
|
||||
|
||||
if (thread_count > (int) ocl::Device::getDefault().maxWorkGroupSize())
|
||||
{
|
||||
status = false;
|
||||
return;
|
||||
}
|
||||
|
||||
// generate string with radix calls
|
||||
String radix_processing;
|
||||
int n = 1, twiddle_size = 0;
|
||||
for (size_t i=0; i<radixes.size(); i++)
|
||||
{
|
||||
int radix = radixes[i], block = blocks[i];
|
||||
if (block > 1)
|
||||
radix_processing += format("fft_radix%d_B%d(smem,twiddles+%d,ind,%d,%d);", radix, block, twiddle_size, n, dft_size/radix);
|
||||
else
|
||||
radix_processing += format("fft_radix%d(smem,twiddles+%d,ind,%d,%d);", radix, twiddle_size, n, dft_size/radix);
|
||||
twiddle_size += (radix-1)*n;
|
||||
n *= radix;
|
||||
}
|
||||
|
||||
Mat tw(1, twiddle_size, CV_32FC2);
|
||||
float* ptr = tw.ptr<float>();
|
||||
int ptr_index = 0;
|
||||
|
||||
n = 1;
|
||||
for (size_t i=0; i<radixes.size(); i++)
|
||||
{
|
||||
int radix = radixes[i];
|
||||
n *= radix;
|
||||
|
||||
for (int j=1; j<radix; j++)
|
||||
{
|
||||
double theta = -CV_2PI*j/n;
|
||||
|
||||
for (int k=0; k<(n/radix); k++)
|
||||
{
|
||||
ptr[ptr_index++] = (float) cos(k*theta);
|
||||
ptr[ptr_index++] = (float) sin(k*theta);
|
||||
}
|
||||
}
|
||||
}
|
||||
twiddles = tw.getUMat(ACCESS_READ);
|
||||
|
||||
buildOptions = format("-D LOCAL_SIZE=%d -D kercn=%d -D RADIX_PROCESS=%s",
|
||||
dft_size, min_radix, radix_processing.c_str());
|
||||
}
|
||||
|
||||
bool enqueueTransform(InputArray _src, OutputArray _dst, int num_dfts, int flags, int fftType, bool rows = true) const
|
||||
{
|
||||
if (!status)
|
||||
return false;
|
||||
|
||||
UMat src = _src.getUMat();
|
||||
UMat dst = _dst.getUMat();
|
||||
|
||||
size_t globalsize[2];
|
||||
size_t localsize[2];
|
||||
String kernel_name;
|
||||
|
||||
bool is1d = (flags & DFT_ROWS) != 0 || num_dfts == 1;
|
||||
bool inv = (flags & DFT_INVERSE) != 0;
|
||||
String options = buildOptions;
|
||||
|
||||
if (rows)
|
||||
{
|
||||
globalsize[0] = thread_count; globalsize[1] = src.rows;
|
||||
localsize[0] = thread_count; localsize[1] = 1;
|
||||
kernel_name = !inv ? "fft_multi_radix_rows" : "ifft_multi_radix_rows";
|
||||
if ((is1d || inv) && (flags & DFT_SCALE))
|
||||
options += " -D DFT_SCALE";
|
||||
}
|
||||
else
|
||||
{
|
||||
globalsize[0] = num_dfts; globalsize[1] = thread_count;
|
||||
localsize[0] = 1; localsize[1] = thread_count;
|
||||
kernel_name = !inv ? "fft_multi_radix_cols" : "ifft_multi_radix_cols";
|
||||
if (flags & DFT_SCALE)
|
||||
options += " -D DFT_SCALE";
|
||||
}
|
||||
|
||||
options += src.channels() == 1 ? " -D REAL_INPUT" : " -D COMPLEX_INPUT";
|
||||
options += dst.channels() == 1 ? " -D REAL_OUTPUT" : " -D COMPLEX_OUTPUT";
|
||||
options += is1d ? " -D IS_1D" : "";
|
||||
|
||||
if (!inv)
|
||||
{
|
||||
if ((is1d && src.channels() == 1) || (rows && (fftType == R2R)))
|
||||
options += " -D NO_CONJUGATE";
|
||||
}
|
||||
else
|
||||
{
|
||||
if (rows && (fftType == C2R || fftType == R2R))
|
||||
options += " -D NO_CONJUGATE";
|
||||
if (dst.cols % 2 == 0)
|
||||
options += " -D EVEN";
|
||||
}
|
||||
|
||||
ocl::Kernel k(kernel_name.c_str(), ocl::core::fft_oclsrc, options);
|
||||
if (k.empty())
|
||||
return false;
|
||||
|
||||
k.args(ocl::KernelArg::ReadOnly(src), ocl::KernelArg::WriteOnly(dst), ocl::KernelArg::PtrReadOnly(twiddles), thread_count, num_dfts);
|
||||
return k.run(2, globalsize, localsize, false);
|
||||
}
|
||||
|
||||
private:
|
||||
static void ocl_getRadixes(int cols, std::vector<int>& radixes, std::vector<int>& blocks, int& min_radix)
|
||||
{
|
||||
int factors[34];
|
||||
int nf = DFTFactorize(cols, factors);
|
||||
|
||||
int n = 1;
|
||||
int factor_index = 0;
|
||||
min_radix = INT_MAX;
|
||||
|
||||
// 2^n transforms
|
||||
if ((factors[factor_index] & 1) == 0)
|
||||
{
|
||||
for( ; n < factors[factor_index];)
|
||||
{
|
||||
int radix = 2, block = 1;
|
||||
if (8*n <= factors[0])
|
||||
radix = 8;
|
||||
else if (4*n <= factors[0])
|
||||
{
|
||||
radix = 4;
|
||||
if (cols % 12 == 0)
|
||||
block = 3;
|
||||
else if (cols % 8 == 0)
|
||||
block = 2;
|
||||
}
|
||||
else
|
||||
{
|
||||
if (cols % 10 == 0)
|
||||
block = 5;
|
||||
else if (cols % 8 == 0)
|
||||
block = 4;
|
||||
else if (cols % 6 == 0)
|
||||
block = 3;
|
||||
else if (cols % 4 == 0)
|
||||
block = 2;
|
||||
}
|
||||
|
||||
radixes.push_back(radix);
|
||||
blocks.push_back(block);
|
||||
min_radix = min(min_radix, block*radix);
|
||||
n *= radix;
|
||||
}
|
||||
factor_index++;
|
||||
}
|
||||
|
||||
// all the other transforms
|
||||
for( ; factor_index < nf; factor_index++)
|
||||
{
|
||||
int radix = factors[factor_index], block = 1;
|
||||
if (radix == 3)
|
||||
{
|
||||
if (cols % 12 == 0)
|
||||
block = 4;
|
||||
else if (cols % 9 == 0)
|
||||
block = 3;
|
||||
else if (cols % 6 == 0)
|
||||
block = 2;
|
||||
}
|
||||
else if (radix == 5)
|
||||
{
|
||||
if (cols % 10 == 0)
|
||||
block = 2;
|
||||
}
|
||||
radixes.push_back(radix);
|
||||
blocks.push_back(block);
|
||||
min_radix = min(min_radix, block*radix);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
class OCL_FftPlanCache
|
||||
{
|
||||
public:
|
||||
static OCL_FftPlanCache & getInstance()
|
||||
{
|
||||
static OCL_FftPlanCache planCache;
|
||||
return planCache;
|
||||
}
|
||||
|
||||
Ptr<OCL_FftPlan> getFftPlan(int dft_size)
|
||||
{
|
||||
std::map<int, Ptr<OCL_FftPlan> >::iterator f = planStorage.find(dft_size);
|
||||
if (f != planStorage.end())
|
||||
{
|
||||
return f->second;
|
||||
}
|
||||
else
|
||||
{
|
||||
Ptr<OCL_FftPlan> newPlan = Ptr<OCL_FftPlan>(new OCL_FftPlan(dft_size));
|
||||
planStorage[dft_size] = newPlan;
|
||||
return newPlan;
|
||||
}
|
||||
}
|
||||
|
||||
~OCL_FftPlanCache()
|
||||
{
|
||||
planStorage.clear();
|
||||
}
|
||||
|
||||
protected:
|
||||
OCL_FftPlanCache() :
|
||||
planStorage()
|
||||
{
|
||||
}
|
||||
std::map<int, Ptr<OCL_FftPlan> > planStorage;
|
||||
};
|
||||
|
||||
static bool ocl_dft_rows(InputArray _src, OutputArray _dst, int nonzero_rows, int flags, int fftType)
|
||||
{
|
||||
Ptr<OCL_FftPlan> plan = OCL_FftPlanCache::getInstance().getFftPlan(_src.cols());
|
||||
return plan->enqueueTransform(_src, _dst, nonzero_rows, flags, fftType, true);
|
||||
}
|
||||
|
||||
static bool ocl_dft_cols(InputArray _src, OutputArray _dst, int nonzero_cols, int flags, int fftType)
|
||||
{
|
||||
Ptr<OCL_FftPlan> plan = OCL_FftPlanCache::getInstance().getFftPlan(_src.rows());
|
||||
return plan->enqueueTransform(_src, _dst, nonzero_cols, flags, fftType, false);
|
||||
}
|
||||
|
||||
static bool ocl_dft(InputArray _src, OutputArray _dst, int flags, int nonzero_rows)
|
||||
{
|
||||
int type = _src.type(), cn = CV_MAT_CN(type);
|
||||
Size ssize = _src.size();
|
||||
if ( !(type == CV_32FC1 || type == CV_32FC2) )
|
||||
return false;
|
||||
|
||||
// if is not a multiplication of prime numbers { 2, 3, 5 }
|
||||
if (ssize.area() != getOptimalDFTSize(ssize.area()))
|
||||
return false;
|
||||
|
||||
UMat src = _src.getUMat();
|
||||
int complex_input = cn == 2 ? 1 : 0;
|
||||
int complex_output = (flags & DFT_COMPLEX_OUTPUT) != 0;
|
||||
int real_input = cn == 1 ? 1 : 0;
|
||||
int real_output = (flags & DFT_REAL_OUTPUT) != 0;
|
||||
bool inv = (flags & DFT_INVERSE) != 0 ? 1 : 0;
|
||||
|
||||
if( nonzero_rows <= 0 || nonzero_rows > _src.rows() )
|
||||
nonzero_rows = _src.rows();
|
||||
bool is1d = (flags & DFT_ROWS) != 0 || nonzero_rows == 1;
|
||||
|
||||
// if output format is not specified
|
||||
if (complex_output + real_output == 0)
|
||||
{
|
||||
if (real_input)
|
||||
real_output = 1;
|
||||
else
|
||||
complex_output = 1;
|
||||
}
|
||||
|
||||
FftType fftType = (FftType)(complex_input << 0 | complex_output << 1);
|
||||
|
||||
// Forward Complex to CCS not supported
|
||||
if (fftType == C2R && !inv)
|
||||
fftType = C2C;
|
||||
|
||||
// Inverse CCS to Complex not supported
|
||||
if (fftType == R2C && inv)
|
||||
fftType = R2R;
|
||||
|
||||
UMat output;
|
||||
if (fftType == C2C || fftType == R2C)
|
||||
{
|
||||
// complex output
|
||||
_dst.create(src.size(), CV_32FC2);
|
||||
output = _dst.getUMat();
|
||||
}
|
||||
else
|
||||
{
|
||||
// real output
|
||||
if (is1d)
|
||||
{
|
||||
_dst.create(src.size(), CV_32FC1);
|
||||
output = _dst.getUMat();
|
||||
}
|
||||
else
|
||||
{
|
||||
_dst.create(src.size(), CV_32FC1);
|
||||
output.create(src.size(), CV_32FC2);
|
||||
}
|
||||
}
|
||||
|
||||
if (!inv)
|
||||
{
|
||||
if (!ocl_dft_rows(src, output, nonzero_rows, flags, fftType))
|
||||
return false;
|
||||
|
||||
if (!is1d)
|
||||
{
|
||||
int nonzero_cols = fftType == R2R ? output.cols/2 + 1 : output.cols;
|
||||
if (!ocl_dft_cols(output, _dst, nonzero_cols, flags, fftType))
|
||||
return false;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
if (fftType == C2C)
|
||||
{
|
||||
// complex output
|
||||
if (!ocl_dft_rows(src, output, nonzero_rows, flags, fftType))
|
||||
return false;
|
||||
|
||||
if (!is1d)
|
||||
{
|
||||
if (!ocl_dft_cols(output, output, output.cols, flags, fftType))
|
||||
return false;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
if (is1d)
|
||||
{
|
||||
if (!ocl_dft_rows(src, output, nonzero_rows, flags, fftType))
|
||||
return false;
|
||||
}
|
||||
else
|
||||
{
|
||||
int nonzero_cols = src.cols/2 + 1;
|
||||
if (!ocl_dft_cols(src, output, nonzero_cols, flags, fftType))
|
||||
return false;
|
||||
|
||||
if (!ocl_dft_rows(output, _dst, nonzero_rows, flags, fftType))
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
} // namespace cv;
|
||||
|
||||
#endif
|
||||
|
||||
#ifdef HAVE_CLAMDFFT
|
||||
|
||||
namespace cv {
|
||||
@@ -1791,14 +2161,6 @@ namespace cv {
|
||||
CV_Assert(s == CLFFT_SUCCESS); \
|
||||
}
|
||||
|
||||
enum FftType
|
||||
{
|
||||
R2R = 0, // real to real
|
||||
C2R = 1, // opencl HERMITIAN_INTERLEAVED to real
|
||||
R2C = 2, // real to opencl HERMITIAN_INTERLEAVED
|
||||
C2C = 3 // complex to complex
|
||||
};
|
||||
|
||||
class PlanCache
|
||||
{
|
||||
struct FftPlan
|
||||
@@ -1923,7 +2285,7 @@ public:
|
||||
}
|
||||
|
||||
// no baked plan is found, so let's create a new one
|
||||
FftPlan * newPlan = new FftPlan(dft_size, src_step, dst_step, doubleFP, inplace, flags, fftType);
|
||||
Ptr<FftPlan> newPlan = Ptr<FftPlan>(new FftPlan(dft_size, src_step, dst_step, doubleFP, inplace, flags, fftType));
|
||||
planStorage.push_back(newPlan);
|
||||
|
||||
return newPlan->plHandle;
|
||||
@@ -1931,8 +2293,6 @@ public:
|
||||
|
||||
~PlanCache()
|
||||
{
|
||||
for (std::vector<FftPlan *>::iterator i = planStorage.begin(), end = planStorage.end(); i != end; ++i)
|
||||
delete (*i);
|
||||
planStorage.clear();
|
||||
}
|
||||
|
||||
@@ -1942,7 +2302,7 @@ protected:
|
||||
{
|
||||
}
|
||||
|
||||
std::vector<FftPlan *> planStorage;
|
||||
std::vector<Ptr<FftPlan> > planStorage;
|
||||
};
|
||||
|
||||
extern "C" {
|
||||
@@ -1960,7 +2320,7 @@ static void CL_CALLBACK oclCleanupCallback(cl_event e, cl_int, void *p)
|
||||
|
||||
}
|
||||
|
||||
static bool ocl_dft(InputArray _src, OutputArray _dst, int flags)
|
||||
static bool ocl_dft_amdfft(InputArray _src, OutputArray _dst, int flags)
|
||||
{
|
||||
int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
|
||||
Size ssize = _src.size();
|
||||
@@ -2019,7 +2379,6 @@ static bool ocl_dft(InputArray _src, OutputArray _dst, int flags)
|
||||
|
||||
tmpBuffer.addref();
|
||||
clSetEventCallback(e, CL_COMPLETE, oclCleanupCallback, tmpBuffer.u);
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
@@ -2034,7 +2393,12 @@ void cv::dft( InputArray _src0, OutputArray _dst, int flags, int nonzero_rows )
|
||||
#ifdef HAVE_CLAMDFFT
|
||||
CV_OCL_RUN(ocl::haveAmdFft() && ocl::Device::getDefault().type() != ocl::Device::TYPE_CPU &&
|
||||
_dst.isUMat() && _src0.dims() <= 2 && nonzero_rows == 0,
|
||||
ocl_dft(_src0, _dst, flags))
|
||||
ocl_dft_amdfft(_src0, _dst, flags))
|
||||
#endif
|
||||
|
||||
#ifdef HAVE_OPENCL
|
||||
CV_OCL_RUN(_dst.isUMat() && _src0.dims() <= 2,
|
||||
ocl_dft(_src0, _dst, flags, nonzero_rows))
|
||||
#endif
|
||||
|
||||
static DFTFunc dft_tbl[6] =
|
||||
@@ -2046,10 +2410,8 @@ void cv::dft( InputArray _src0, OutputArray _dst, int flags, int nonzero_rows )
|
||||
(DFTFunc)RealDFT_64f,
|
||||
(DFTFunc)CCSIDFT_64f
|
||||
};
|
||||
|
||||
AutoBuffer<uchar> buf;
|
||||
void *spec = 0;
|
||||
|
||||
Mat src0 = _src0.getMat(), src = src0;
|
||||
int prev_len = 0, stage = 0;
|
||||
bool inv = (flags & DFT_INVERSE) != 0;
|
||||
@@ -2080,32 +2442,32 @@ void cv::dft( InputArray _src0, OutputArray _dst, int flags, int nonzero_rows )
|
||||
{
|
||||
if ((flags & DFT_ROWS) == 0)
|
||||
{
|
||||
if (!real_transform)
|
||||
if (src.channels() == 2 && !(inv && (flags & DFT_REAL_OUTPUT)))
|
||||
{
|
||||
if (ippi_DFT_C_32F(src,dst, inv, ipp_norm_flag))
|
||||
if (ippi_DFT_C_32F(src, dst, inv, ipp_norm_flag))
|
||||
return;
|
||||
setIppErrorStatus();
|
||||
}
|
||||
else if (inv || !(flags & DFT_COMPLEX_OUTPUT))
|
||||
if (src.channels() == 1 && (inv || !(flags & DFT_COMPLEX_OUTPUT)))
|
||||
{
|
||||
if (ippi_DFT_R_32F(src,dst, inv, ipp_norm_flag))
|
||||
if (ippi_DFT_R_32F(src, dst, inv, ipp_norm_flag))
|
||||
return;
|
||||
setIppErrorStatus();
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
if (!real_transform)
|
||||
if (src.channels() == 2 && !(inv && (flags & DFT_REAL_OUTPUT)))
|
||||
{
|
||||
ippiDFT_C_Func ippiFunc = inv ? (ippiDFT_C_Func)ippiDFTInv_CToC_32fc_C1R : (ippiDFT_C_Func)ippiDFTFwd_CToC_32fc_C1R;
|
||||
if (Dft_C_IPPLoop(src,dst, IPPDFT_C_Functor(ippiFunc),ipp_norm_flag))
|
||||
if (Dft_C_IPPLoop(src, dst, IPPDFT_C_Functor(ippiFunc),ipp_norm_flag))
|
||||
return;
|
||||
setIppErrorStatus();
|
||||
}
|
||||
else if (inv || !(flags & DFT_COMPLEX_OUTPUT))
|
||||
if (src.channels() == 1 && (inv || !(flags & DFT_COMPLEX_OUTPUT)))
|
||||
{
|
||||
ippiDFT_R_Func ippiFunc = inv ? (ippiDFT_R_Func)ippiDFTInv_PackToR_32f_C1R : (ippiDFT_R_Func)ippiDFTFwd_RToPack_32f_C1R;
|
||||
if (Dft_R_IPPLoop(src,dst, IPPDFT_R_Functor(ippiFunc),ipp_norm_flag))
|
||||
if (Dft_R_IPPLoop(src, dst, IPPDFT_R_Functor(ippiFunc),ipp_norm_flag))
|
||||
return;
|
||||
setIppErrorStatus();
|
||||
}
|
||||
|
||||
@@ -348,7 +348,18 @@ static void InvSqrt_32f(const float* src, float* dst, int len)
|
||||
|
||||
static void InvSqrt_64f(const double* src, double* dst, int len)
|
||||
{
|
||||
for( int i = 0; i < len; i++ )
|
||||
int i = 0;
|
||||
|
||||
#if CV_SSE2
|
||||
if (USE_SSE2)
|
||||
{
|
||||
__m128d v_1 = _mm_set1_pd(1.0);
|
||||
for ( ; i <= len - 2; i += 2)
|
||||
_mm_storeu_pd(dst + i, _mm_div_pd(v_1, _mm_sqrt_pd(_mm_loadu_pd(src + i))));
|
||||
}
|
||||
#endif
|
||||
|
||||
for( ; i < len; i++ )
|
||||
dst[i] = 1/std::sqrt(src[i]);
|
||||
}
|
||||
|
||||
@@ -2543,12 +2554,33 @@ void patchNaNs( InputOutputArray _a, double _val )
|
||||
NAryMatIterator it(arrays, (uchar**)ptrs);
|
||||
size_t len = it.size*a.channels();
|
||||
Cv32suf val;
|
||||
val.f = (float)_val;
|
||||
float fval = (float)_val;
|
||||
val.f = fval;
|
||||
|
||||
#if CV_SSE2
|
||||
__m128i v_mask1 = _mm_set1_epi32(0x7fffffff), v_mask2 = _mm_set1_epi32(0x7f800000);
|
||||
__m128i v_val = _mm_set1_epi32(val.i);
|
||||
#endif
|
||||
|
||||
for( size_t i = 0; i < it.nplanes; i++, ++it )
|
||||
{
|
||||
int* tptr = ptrs[0];
|
||||
for( size_t j = 0; j < len; j++ )
|
||||
size_t j = 0;
|
||||
|
||||
#if CV_SSE2
|
||||
if (USE_SSE2)
|
||||
{
|
||||
for ( ; j < len; j += 4)
|
||||
{
|
||||
__m128i v_src = _mm_loadu_si128((__m128i const *)(tptr + j));
|
||||
__m128i v_cmp_mask = _mm_cmplt_epi32(v_mask2, _mm_and_si128(v_src, v_mask1));
|
||||
__m128i v_res = _mm_or_si128(_mm_andnot_si128(v_cmp_mask, v_src), _mm_and_si128(v_cmp_mask, v_val));
|
||||
_mm_storeu_si128((__m128i *)(tptr + j), v_res);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
for( ; j < len; j++ )
|
||||
if( (tptr[j] & 0x7fffffff) > 0x7f800000 )
|
||||
tptr[j] = val.i;
|
||||
}
|
||||
|
||||
@@ -2758,21 +2758,30 @@ namespace cv {
|
||||
|
||||
static bool ocl_setIdentity( InputOutputArray _m, const Scalar& s )
|
||||
{
|
||||
int type = _m.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type),
|
||||
sctype = CV_MAKE_TYPE(depth, cn == 3 ? 4 : cn),
|
||||
int type = _m.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type), kercn = cn;
|
||||
if (cn == 1)
|
||||
{
|
||||
kercn = std::min(ocl::predictOptimalVectorWidth(_m), 4);
|
||||
if (kercn != 4)
|
||||
kercn = 1;
|
||||
}
|
||||
int sctype = CV_MAKE_TYPE(depth, cn == 3 ? 4 : cn),
|
||||
rowsPerWI = ocl::Device::getDefault().isIntel() ? 4 : 1;
|
||||
|
||||
ocl::Kernel k("setIdentity", ocl::core::set_identity_oclsrc,
|
||||
format("-D T=%s -D T1=%s -D cn=%d -D ST=%s", ocl::memopTypeToStr(type),
|
||||
ocl::memopTypeToStr(depth), cn, ocl::memopTypeToStr(sctype)));
|
||||
format("-D T=%s -D T1=%s -D cn=%d -D ST=%s -D kercn=%d -D rowsPerWI=%d",
|
||||
ocl::memopTypeToStr(CV_MAKE_TYPE(depth, kercn)),
|
||||
ocl::memopTypeToStr(depth), cn,
|
||||
ocl::memopTypeToStr(sctype),
|
||||
kercn, rowsPerWI));
|
||||
if (k.empty())
|
||||
return false;
|
||||
|
||||
UMat m = _m.getUMat();
|
||||
k.args(ocl::KernelArg::WriteOnly(m), ocl::KernelArg::Constant(Mat(1, 1, sctype, s)),
|
||||
rowsPerWI);
|
||||
k.args(ocl::KernelArg::WriteOnly(m, cn, kercn),
|
||||
ocl::KernelArg::Constant(Mat(1, 1, sctype, s)));
|
||||
|
||||
size_t globalsize[2] = { m.cols, (m.rows + rowsPerWI - 1) / rowsPerWI };
|
||||
size_t globalsize[2] = { m.cols * cn / kercn, (m.rows + rowsPerWI - 1) / rowsPerWI };
|
||||
return k.run(2, globalsize, NULL, false);
|
||||
}
|
||||
|
||||
@@ -3327,7 +3336,7 @@ static inline void reduceSumC_8u16u16s32f_64f(const cv::Mat& srcmat, cv::Mat& ds
|
||||
stype == CV_32FC3 ? (ippiSumHint)ippiSum_32f_C3R :
|
||||
stype == CV_32FC4 ? (ippiSumHint)ippiSum_32f_C4R : 0;
|
||||
func =
|
||||
sdepth == CV_8U ? (cv::ReduceFunc)cv::reduceC_<uchar, double, cv::OpAdd<double> > :
|
||||
sdepth == CV_8U ? (cv::ReduceFunc)cv::reduceC_<uchar, double, cv::OpAdd<double> > :
|
||||
sdepth == CV_16U ? (cv::ReduceFunc)cv::reduceC_<ushort, double, cv::OpAdd<double> > :
|
||||
sdepth == CV_16S ? (cv::ReduceFunc)cv::reduceC_<short, double, cv::OpAdd<double> > :
|
||||
sdepth == CV_32F ? (cv::ReduceFunc)cv::reduceC_<float, double, cv::OpAdd<double> > : 0;
|
||||
@@ -3441,12 +3450,18 @@ static bool ocl_reduce(InputArray _src, OutputArray _dst,
|
||||
const int min_opt_cols = 128, buf_cols = 32;
|
||||
int sdepth = CV_MAT_DEPTH(stype), cn = CV_MAT_CN(stype),
|
||||
ddepth = CV_MAT_DEPTH(dtype), ddepth0 = ddepth;
|
||||
bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0,
|
||||
useOptimized = 1 == dim && _src.cols() > min_opt_cols;
|
||||
const ocl::Device &defDev = ocl::Device::getDefault();
|
||||
bool doubleSupport = defDev.doubleFPConfig() > 0;
|
||||
|
||||
size_t wgs = defDev.maxWorkGroupSize();
|
||||
bool useOptimized = 1 == dim && _src.cols() > min_opt_cols && (wgs >= buf_cols);
|
||||
|
||||
if (!doubleSupport && (sdepth == CV_64F || ddepth == CV_64F))
|
||||
return false;
|
||||
|
||||
if ((op == CV_REDUCE_SUM && sdepth == CV_32F) || op == CV_REDUCE_MIN || op == CV_REDUCE_MAX)
|
||||
return false;
|
||||
|
||||
if (op == CV_REDUCE_AVG)
|
||||
{
|
||||
if (sdepth < CV_32S && ddepth < CV_32S)
|
||||
@@ -3455,78 +3470,80 @@ static bool ocl_reduce(InputArray _src, OutputArray _dst,
|
||||
|
||||
const char * const ops[4] = { "OCL_CV_REDUCE_SUM", "OCL_CV_REDUCE_AVG",
|
||||
"OCL_CV_REDUCE_MAX", "OCL_CV_REDUCE_MIN" };
|
||||
char cvt[2][40];
|
||||
|
||||
int wdepth = std::max(ddepth, CV_32F);
|
||||
cv::String build_opt = format("-D %s -D dim=%d -D cn=%d -D ddepth=%d"
|
||||
" -D srcT=%s -D dstT=%s -D dstT0=%s -D convertToWT=%s"
|
||||
" -D convertToDT=%s -D convertToDT0=%s%s",
|
||||
ops[op], dim, cn, ddepth, ocl::typeToStr(useOptimized ? ddepth : sdepth),
|
||||
ocl::typeToStr(ddepth), ocl::typeToStr(ddepth0),
|
||||
ocl::convertTypeStr(ddepth, wdepth, 1, cvt[0]),
|
||||
ocl::convertTypeStr(sdepth, ddepth, 1, cvt[0]),
|
||||
ocl::convertTypeStr(wdepth, ddepth0, 1, cvt[1]),
|
||||
doubleSupport ? " -D DOUBLE_SUPPORT" : "");
|
||||
|
||||
if (useOptimized)
|
||||
{
|
||||
cv::String build_opt_pre = format("-D OP_REDUCE_PRE -D BUF_COLS=%d -D %s -D dim=1"
|
||||
" -D cn=%d -D ddepth=%d -D srcT=%s -D dstT=%s -D convertToDT=%s%s",
|
||||
buf_cols, ops[op], cn, ddepth, ocl::typeToStr(sdepth), ocl::typeToStr(ddepth),
|
||||
ocl::convertTypeStr(sdepth, ddepth, 1, cvt[0]),
|
||||
doubleSupport ? " -D DOUBLE_SUPPORT" : "");
|
||||
ocl::Kernel kpre("reduce_horz_pre", ocl::core::reduce2_oclsrc, build_opt_pre);
|
||||
if (kpre.empty())
|
||||
size_t tileHeight = (size_t)(wgs / buf_cols);
|
||||
if (defDev.isIntel())
|
||||
{
|
||||
static const size_t maxItemInGroupCount = 16;
|
||||
tileHeight = min(tileHeight, defDev.localMemSize() / buf_cols / CV_ELEM_SIZE(CV_MAKETYPE(wdepth, cn)) / maxItemInGroupCount);
|
||||
}
|
||||
char cvt[3][40];
|
||||
cv::String build_opt = format("-D OP_REDUCE_PRE -D BUF_COLS=%d -D TILE_HEIGHT=%d -D %s -D dim=1"
|
||||
" -D cn=%d -D ddepth=%d"
|
||||
" -D srcT=%s -D bufT=%s -D dstT=%s"
|
||||
" -D convertToWT=%s -D convertToBufT=%s -D convertToDT=%s%s",
|
||||
buf_cols, tileHeight, ops[op], cn, ddepth,
|
||||
ocl::typeToStr(sdepth),
|
||||
ocl::typeToStr(ddepth),
|
||||
ocl::typeToStr(ddepth0),
|
||||
ocl::convertTypeStr(ddepth, wdepth, 1, cvt[0]),
|
||||
ocl::convertTypeStr(sdepth, ddepth, 1, cvt[1]),
|
||||
ocl::convertTypeStr(wdepth, ddepth0, 1, cvt[2]),
|
||||
doubleSupport ? " -D DOUBLE_SUPPORT" : "");
|
||||
ocl::Kernel k("reduce_horz_opt", ocl::core::reduce2_oclsrc, build_opt);
|
||||
if (k.empty())
|
||||
return false;
|
||||
|
||||
ocl::Kernel kmain("reduce", ocl::core::reduce2_oclsrc, build_opt);
|
||||
if (kmain.empty())
|
||||
return false;
|
||||
|
||||
UMat src = _src.getUMat();
|
||||
Size dsize(1, src.rows);
|
||||
_dst.create(dsize, dtype);
|
||||
UMat dst = _dst.getUMat();
|
||||
|
||||
UMat buf(src.rows, buf_cols, dst.type());
|
||||
|
||||
kpre.args(ocl::KernelArg::ReadOnly(src),
|
||||
ocl::KernelArg::WriteOnlyNoSize(buf));
|
||||
if (op0 == CV_REDUCE_AVG)
|
||||
k.args(ocl::KernelArg::ReadOnly(src),
|
||||
ocl::KernelArg::WriteOnlyNoSize(dst), 1.0f / src.cols);
|
||||
else
|
||||
k.args(ocl::KernelArg::ReadOnly(src),
|
||||
ocl::KernelArg::WriteOnlyNoSize(dst));
|
||||
|
||||
size_t localSize[2] = { buf_cols, tileHeight};
|
||||
size_t globalSize[2] = { buf_cols, src.rows };
|
||||
if (!kpre.run(2, globalSize, NULL, false))
|
||||
return k.run(2, globalSize, localSize, false);
|
||||
}
|
||||
else
|
||||
{
|
||||
char cvt[2][40];
|
||||
cv::String build_opt = format("-D %s -D dim=%d -D cn=%d -D ddepth=%d"
|
||||
" -D srcT=%s -D dstT=%s -D dstT0=%s -D convertToWT=%s"
|
||||
" -D convertToDT=%s -D convertToDT0=%s%s",
|
||||
ops[op], dim, cn, ddepth, ocl::typeToStr(useOptimized ? ddepth : sdepth),
|
||||
ocl::typeToStr(ddepth), ocl::typeToStr(ddepth0),
|
||||
ocl::convertTypeStr(ddepth, wdepth, 1, cvt[0]),
|
||||
ocl::convertTypeStr(sdepth, ddepth, 1, cvt[0]),
|
||||
ocl::convertTypeStr(wdepth, ddepth0, 1, cvt[1]),
|
||||
doubleSupport ? " -D DOUBLE_SUPPORT" : "");
|
||||
|
||||
ocl::Kernel k("reduce", ocl::core::reduce2_oclsrc, build_opt);
|
||||
if (k.empty())
|
||||
return false;
|
||||
|
||||
UMat src = _src.getUMat();
|
||||
Size dsize(dim == 0 ? src.cols : 1, dim == 0 ? 1 : src.rows);
|
||||
_dst.create(dsize, dtype);
|
||||
UMat dst = _dst.getUMat();
|
||||
|
||||
ocl::KernelArg srcarg = ocl::KernelArg::ReadOnly(src),
|
||||
temparg = ocl::KernelArg::WriteOnlyNoSize(dst);
|
||||
|
||||
if (op0 == CV_REDUCE_AVG)
|
||||
kmain.args(ocl::KernelArg::ReadOnly(buf),
|
||||
ocl::KernelArg::WriteOnlyNoSize(dst), 1.0f / src.cols);
|
||||
k.args(srcarg, temparg, 1.0f / (dim == 0 ? src.rows : src.cols));
|
||||
else
|
||||
kmain.args(ocl::KernelArg::ReadOnly(buf),
|
||||
ocl::KernelArg::WriteOnlyNoSize(dst));
|
||||
k.args(srcarg, temparg);
|
||||
|
||||
globalSize[0] = src.rows;
|
||||
return kmain.run(1, globalSize, NULL, false);
|
||||
size_t globalsize = std::max(dsize.width, dsize.height);
|
||||
return k.run(1, &globalsize, NULL, false);
|
||||
}
|
||||
|
||||
ocl::Kernel k("reduce", ocl::core::reduce2_oclsrc, build_opt);
|
||||
if (k.empty())
|
||||
return false;
|
||||
|
||||
UMat src = _src.getUMat();
|
||||
Size dsize(dim == 0 ? src.cols : 1, dim == 0 ? 1 : src.rows);
|
||||
_dst.create(dsize, dtype);
|
||||
UMat dst = _dst.getUMat();
|
||||
|
||||
ocl::KernelArg srcarg = ocl::KernelArg::ReadOnly(src),
|
||||
temparg = ocl::KernelArg::WriteOnlyNoSize(dst);
|
||||
|
||||
if (op0 == CV_REDUCE_AVG)
|
||||
k.args(srcarg, temparg, 1.0f / (dim == 0 ? src.rows : src.cols));
|
||||
else
|
||||
k.args(srcarg, temparg);
|
||||
|
||||
size_t globalsize = std::max(dsize.width, dsize.height);
|
||||
return k.run(1, &globalsize, NULL, false);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@@ -3494,9 +3494,8 @@ public:
|
||||
OpenCLBufferPoolImpl()
|
||||
: currentReservedSize(0), maxReservedSize(0)
|
||||
{
|
||||
// Note: Buffer pool is disabled by default,
|
||||
// because we didn't receive significant performance improvement
|
||||
maxReservedSize = getConfigurationParameterForSize("OPENCV_OPENCL_BUFFERPOOL_LIMIT", 0);
|
||||
int poolSize = ocl::Device::getDefault().isIntel() ? 1 << 27 : 0;
|
||||
maxReservedSize = getConfigurationParameterForSize("OPENCV_OPENCL_BUFFERPOOL_LIMIT", poolSize);
|
||||
}
|
||||
virtual ~OpenCLBufferPoolImpl()
|
||||
{
|
||||
@@ -3739,6 +3738,7 @@ public:
|
||||
u->handle = clCreateBuffer(ctx_handle, CL_MEM_COPY_HOST_PTR|CL_MEM_READ_WRITE|createFlags,
|
||||
u->size, u->origdata, &retval);
|
||||
tempUMatFlags = UMatData::TEMP_COPIED_UMAT;
|
||||
|
||||
}
|
||||
if(!u->handle || retval != CL_SUCCESS)
|
||||
return false;
|
||||
@@ -3880,6 +3880,7 @@ public:
|
||||
if(u->data && retval == CL_SUCCESS)
|
||||
{
|
||||
u->markHostCopyObsolete(false);
|
||||
u->markDeviceMemMapped(true);
|
||||
return;
|
||||
}
|
||||
|
||||
@@ -3908,6 +3909,7 @@ public:
|
||||
if(!u)
|
||||
return;
|
||||
|
||||
|
||||
CV_Assert(u->handle != 0);
|
||||
|
||||
UMatDataAutoLock autolock(u);
|
||||
@@ -3918,8 +3920,10 @@ public:
|
||||
|
||||
cl_command_queue q = (cl_command_queue)Queue::getDefault().ptr();
|
||||
cl_int retval = 0;
|
||||
if( !u->copyOnMap() && u->data )
|
||||
if( !u->copyOnMap() && u->deviceMemMapped() )
|
||||
{
|
||||
CV_Assert(u->data != NULL);
|
||||
u->markDeviceMemMapped(false);
|
||||
CV_Assert( (retval = clEnqueueUnmapMemObject(q,
|
||||
(cl_mem)u->handle, u->data, 0, 0, 0)) == CL_SUCCESS );
|
||||
CV_OclDbgAssert(clFinish(q) == CL_SUCCESS);
|
||||
|
||||
864
modules/core/src/opencl/fft.cl
Normal file
864
modules/core/src/opencl/fft.cl
Normal file
@@ -0,0 +1,864 @@
|
||||
// This file is part of OpenCV project.
|
||||
// It is subject to the license terms in the LICENSE file found in the top-level directory
|
||||
// of this distribution and at http://opencv.org/license.html.
|
||||
|
||||
// Copyright (C) 2014, Itseez, Inc., all rights reserved.
|
||||
// Third party copyrights are property of their respective owners.
|
||||
|
||||
#define SQRT_2 0.707106781188f
|
||||
#define sin_120 0.866025403784f
|
||||
#define fft5_2 0.559016994374f
|
||||
#define fft5_3 -0.951056516295f
|
||||
#define fft5_4 -1.538841768587f
|
||||
#define fft5_5 0.363271264002f
|
||||
|
||||
__attribute__((always_inline))
|
||||
float2 mul_float2(float2 a, float2 b) {
|
||||
return (float2)(fma(a.x, b.x, -a.y * b.y), fma(a.x, b.y, a.y * b.x));
|
||||
}
|
||||
|
||||
__attribute__((always_inline))
|
||||
float2 twiddle(float2 a) {
|
||||
return (float2)(a.y, -a.x);
|
||||
}
|
||||
|
||||
__attribute__((always_inline))
|
||||
void butterfly2(float2 a0, float2 a1, __local float2* smem, __global const float2* twiddles,
|
||||
const int x, const int block_size)
|
||||
{
|
||||
const int k = x & (block_size - 1);
|
||||
a1 = mul_float2(twiddles[k], a1);
|
||||
const int dst_ind = (x << 1) - k;
|
||||
|
||||
smem[dst_ind] = a0 + a1;
|
||||
smem[dst_ind+block_size] = a0 - a1;
|
||||
}
|
||||
|
||||
__attribute__((always_inline))
|
||||
void butterfly4(float2 a0, float2 a1, float2 a2, float2 a3, __local float2* smem, __global const float2* twiddles,
|
||||
const int x, const int block_size)
|
||||
{
|
||||
const int k = x & (block_size - 1);
|
||||
a1 = mul_float2(twiddles[k], a1);
|
||||
a2 = mul_float2(twiddles[k + block_size], a2);
|
||||
a3 = mul_float2(twiddles[k + 2*block_size], a3);
|
||||
|
||||
const int dst_ind = ((x - k) << 2) + k;
|
||||
|
||||
float2 b0 = a0 + a2;
|
||||
a2 = a0 - a2;
|
||||
float2 b1 = a1 + a3;
|
||||
a3 = twiddle(a1 - a3);
|
||||
|
||||
smem[dst_ind] = b0 + b1;
|
||||
smem[dst_ind + block_size] = a2 + a3;
|
||||
smem[dst_ind + 2*block_size] = b0 - b1;
|
||||
smem[dst_ind + 3*block_size] = a2 - a3;
|
||||
}
|
||||
|
||||
__attribute__((always_inline))
|
||||
void butterfly3(float2 a0, float2 a1, float2 a2, __local float2* smem, __global const float2* twiddles,
|
||||
const int x, const int block_size)
|
||||
{
|
||||
const int k = x % block_size;
|
||||
a1 = mul_float2(twiddles[k], a1);
|
||||
a2 = mul_float2(twiddles[k+block_size], a2);
|
||||
const int dst_ind = ((x - k) * 3) + k;
|
||||
|
||||
float2 b1 = a1 + a2;
|
||||
a2 = twiddle(sin_120*(a1 - a2));
|
||||
float2 b0 = a0 - (float2)(0.5f)*b1;
|
||||
|
||||
smem[dst_ind] = a0 + b1;
|
||||
smem[dst_ind + block_size] = b0 + a2;
|
||||
smem[dst_ind + 2*block_size] = b0 - a2;
|
||||
}
|
||||
|
||||
__attribute__((always_inline))
|
||||
void butterfly5(float2 a0, float2 a1, float2 a2, float2 a3, float2 a4, __local float2* smem, __global const float2* twiddles,
|
||||
const int x, const int block_size)
|
||||
{
|
||||
const int k = x % block_size;
|
||||
a1 = mul_float2(twiddles[k], a1);
|
||||
a2 = mul_float2(twiddles[k + block_size], a2);
|
||||
a3 = mul_float2(twiddles[k+2*block_size], a3);
|
||||
a4 = mul_float2(twiddles[k+3*block_size], a4);
|
||||
|
||||
const int dst_ind = ((x - k) * 5) + k;
|
||||
__local float2* dst = smem + dst_ind;
|
||||
|
||||
float2 b0, b1, b5;
|
||||
|
||||
b1 = a1 + a4;
|
||||
a1 -= a4;
|
||||
|
||||
a4 = a3 + a2;
|
||||
a3 -= a2;
|
||||
|
||||
a2 = b1 + a4;
|
||||
b0 = a0 - (float2)0.25f * a2;
|
||||
|
||||
b1 = fft5_2 * (b1 - a4);
|
||||
a4 = fft5_3 * (float2)(-a1.y - a3.y, a1.x + a3.x);
|
||||
b5 = (float2)(a4.x - fft5_5 * a1.y, a4.y + fft5_5 * a1.x);
|
||||
|
||||
a4.x += fft5_4 * a3.y;
|
||||
a4.y -= fft5_4 * a3.x;
|
||||
|
||||
a1 = b0 + b1;
|
||||
b0 -= b1;
|
||||
|
||||
dst[0] = a0 + a2;
|
||||
dst[block_size] = a1 + a4;
|
||||
dst[2 * block_size] = b0 + b5;
|
||||
dst[3 * block_size] = b0 - b5;
|
||||
dst[4 * block_size] = a1 - a4;
|
||||
}
|
||||
|
||||
__attribute__((always_inline))
|
||||
void fft_radix2(__local float2* smem, __global const float2* twiddles, const int x, const int block_size, const int t)
|
||||
{
|
||||
float2 a0, a1;
|
||||
|
||||
if (x < t)
|
||||
{
|
||||
a0 = smem[x];
|
||||
a1 = smem[x+t];
|
||||
}
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
if (x < t)
|
||||
butterfly2(a0, a1, smem, twiddles, x, block_size);
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
}
|
||||
|
||||
__attribute__((always_inline))
|
||||
void fft_radix2_B2(__local float2* smem, __global const float2* twiddles, const int x1, const int block_size, const int t)
|
||||
{
|
||||
const int x2 = x1 + t/2;
|
||||
float2 a0, a1, a2, a3;
|
||||
|
||||
if (x1 < t/2)
|
||||
{
|
||||
a0 = smem[x1]; a1 = smem[x1+t];
|
||||
a2 = smem[x2]; a3 = smem[x2+t];
|
||||
}
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
if (x1 < t/2)
|
||||
{
|
||||
butterfly2(a0, a1, smem, twiddles, x1, block_size);
|
||||
butterfly2(a2, a3, smem, twiddles, x2, block_size);
|
||||
}
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
}
|
||||
|
||||
__attribute__((always_inline))
|
||||
void fft_radix2_B3(__local float2* smem, __global const float2* twiddles, const int x1, const int block_size, const int t)
|
||||
{
|
||||
const int x2 = x1 + t/3;
|
||||
const int x3 = x1 + 2*t/3;
|
||||
float2 a0, a1, a2, a3, a4, a5;
|
||||
|
||||
if (x1 < t/3)
|
||||
{
|
||||
a0 = smem[x1]; a1 = smem[x1+t];
|
||||
a2 = smem[x2]; a3 = smem[x2+t];
|
||||
a4 = smem[x3]; a5 = smem[x3+t];
|
||||
}
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
if (x1 < t/3)
|
||||
{
|
||||
butterfly2(a0, a1, smem, twiddles, x1, block_size);
|
||||
butterfly2(a2, a3, smem, twiddles, x2, block_size);
|
||||
butterfly2(a4, a5, smem, twiddles, x3, block_size);
|
||||
}
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
}
|
||||
|
||||
__attribute__((always_inline))
|
||||
void fft_radix2_B4(__local float2* smem, __global const float2* twiddles, const int x1, const int block_size, const int t)
|
||||
{
|
||||
const int thread_block = t/4;
|
||||
const int x2 = x1 + thread_block;
|
||||
const int x3 = x1 + 2*thread_block;
|
||||
const int x4 = x1 + 3*thread_block;
|
||||
float2 a0, a1, a2, a3, a4, a5, a6, a7;
|
||||
|
||||
if (x1 < t/4)
|
||||
{
|
||||
a0 = smem[x1]; a1 = smem[x1+t];
|
||||
a2 = smem[x2]; a3 = smem[x2+t];
|
||||
a4 = smem[x3]; a5 = smem[x3+t];
|
||||
a6 = smem[x4]; a7 = smem[x4+t];
|
||||
}
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
if (x1 < t/4)
|
||||
{
|
||||
butterfly2(a0, a1, smem, twiddles, x1, block_size);
|
||||
butterfly2(a2, a3, smem, twiddles, x2, block_size);
|
||||
butterfly2(a4, a5, smem, twiddles, x3, block_size);
|
||||
butterfly2(a6, a7, smem, twiddles, x4, block_size);
|
||||
}
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
}
|
||||
|
||||
__attribute__((always_inline))
|
||||
void fft_radix2_B5(__local float2* smem, __global const float2* twiddles, const int x1, const int block_size, const int t)
|
||||
{
|
||||
const int thread_block = t/5;
|
||||
const int x2 = x1 + thread_block;
|
||||
const int x3 = x1 + 2*thread_block;
|
||||
const int x4 = x1 + 3*thread_block;
|
||||
const int x5 = x1 + 4*thread_block;
|
||||
float2 a0, a1, a2, a3, a4, a5, a6, a7, a8, a9;
|
||||
|
||||
if (x1 < t/5)
|
||||
{
|
||||
a0 = smem[x1]; a1 = smem[x1+t];
|
||||
a2 = smem[x2]; a3 = smem[x2+t];
|
||||
a4 = smem[x3]; a5 = smem[x3+t];
|
||||
a6 = smem[x4]; a7 = smem[x4+t];
|
||||
a8 = smem[x5]; a9 = smem[x5+t];
|
||||
}
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
if (x1 < t/5)
|
||||
{
|
||||
butterfly2(a0, a1, smem, twiddles, x1, block_size);
|
||||
butterfly2(a2, a3, smem, twiddles, x2, block_size);
|
||||
butterfly2(a4, a5, smem, twiddles, x3, block_size);
|
||||
butterfly2(a6, a7, smem, twiddles, x4, block_size);
|
||||
butterfly2(a8, a9, smem, twiddles, x5, block_size);
|
||||
}
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
}
|
||||
|
||||
__attribute__((always_inline))
|
||||
void fft_radix4(__local float2* smem, __global const float2* twiddles, const int x, const int block_size, const int t)
|
||||
{
|
||||
float2 a0, a1, a2, a3;
|
||||
|
||||
if (x < t)
|
||||
{
|
||||
a0 = smem[x]; a1 = smem[x+t]; a2 = smem[x+2*t]; a3 = smem[x+3*t];
|
||||
}
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
if (x < t)
|
||||
butterfly4(a0, a1, a2, a3, smem, twiddles, x, block_size);
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
}
|
||||
|
||||
__attribute__((always_inline))
|
||||
void fft_radix4_B2(__local float2* smem, __global const float2* twiddles, const int x1, const int block_size, const int t)
|
||||
{
|
||||
const int x2 = x1 + t/2;
|
||||
float2 a0, a1, a2, a3, a4, a5, a6, a7;
|
||||
|
||||
if (x1 < t/2)
|
||||
{
|
||||
a0 = smem[x1]; a1 = smem[x1+t]; a2 = smem[x1+2*t]; a3 = smem[x1+3*t];
|
||||
a4 = smem[x2]; a5 = smem[x2+t]; a6 = smem[x2+2*t]; a7 = smem[x2+3*t];
|
||||
}
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
if (x1 < t/2)
|
||||
{
|
||||
butterfly4(a0, a1, a2, a3, smem, twiddles, x1, block_size);
|
||||
butterfly4(a4, a5, a6, a7, smem, twiddles, x2, block_size);
|
||||
}
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
}
|
||||
|
||||
__attribute__((always_inline))
|
||||
void fft_radix4_B3(__local float2* smem, __global const float2* twiddles, const int x1, const int block_size, const int t)
|
||||
{
|
||||
const int x2 = x1 + t/3;
|
||||
const int x3 = x2 + t/3;
|
||||
float2 a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11;
|
||||
|
||||
if (x1 < t/3)
|
||||
{
|
||||
a0 = smem[x1]; a1 = smem[x1+t]; a2 = smem[x1+2*t]; a3 = smem[x1+3*t];
|
||||
a4 = smem[x2]; a5 = smem[x2+t]; a6 = smem[x2+2*t]; a7 = smem[x2+3*t];
|
||||
a8 = smem[x3]; a9 = smem[x3+t]; a10 = smem[x3+2*t]; a11 = smem[x3+3*t];
|
||||
}
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
if (x1 < t/3)
|
||||
{
|
||||
butterfly4(a0, a1, a2, a3, smem, twiddles, x1, block_size);
|
||||
butterfly4(a4, a5, a6, a7, smem, twiddles, x2, block_size);
|
||||
butterfly4(a8, a9, a10, a11, smem, twiddles, x3, block_size);
|
||||
}
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
}
|
||||
|
||||
__attribute__((always_inline))
|
||||
void fft_radix8(__local float2* smem, __global const float2* twiddles, const int x, const int block_size, const int t)
|
||||
{
|
||||
const int k = x % block_size;
|
||||
float2 a0, a1, a2, a3, a4, a5, a6, a7;
|
||||
|
||||
if (x < t)
|
||||
{
|
||||
int tw_ind = block_size / 8;
|
||||
|
||||
a0 = smem[x];
|
||||
a1 = mul_float2(twiddles[k], smem[x + t]);
|
||||
a2 = mul_float2(twiddles[k + block_size],smem[x+2*t]);
|
||||
a3 = mul_float2(twiddles[k+2*block_size],smem[x+3*t]);
|
||||
a4 = mul_float2(twiddles[k+3*block_size],smem[x+4*t]);
|
||||
a5 = mul_float2(twiddles[k+4*block_size],smem[x+5*t]);
|
||||
a6 = mul_float2(twiddles[k+5*block_size],smem[x+6*t]);
|
||||
a7 = mul_float2(twiddles[k+6*block_size],smem[x+7*t]);
|
||||
|
||||
float2 b0, b1, b6, b7;
|
||||
|
||||
b0 = a0 + a4;
|
||||
a4 = a0 - a4;
|
||||
b1 = a1 + a5;
|
||||
a5 = a1 - a5;
|
||||
a5 = (float2)(SQRT_2) * (float2)(a5.x + a5.y, -a5.x + a5.y);
|
||||
b6 = twiddle(a2 - a6);
|
||||
a2 = a2 + a6;
|
||||
b7 = a3 - a7;
|
||||
b7 = (float2)(SQRT_2) * (float2)(-b7.x + b7.y, -b7.x - b7.y);
|
||||
a3 = a3 + a7;
|
||||
|
||||
a0 = b0 + a2;
|
||||
a2 = b0 - a2;
|
||||
a1 = b1 + a3;
|
||||
a3 = twiddle(b1 - a3);
|
||||
a6 = a4 - b6;
|
||||
a4 = a4 + b6;
|
||||
a7 = twiddle(a5 - b7);
|
||||
a5 = a5 + b7;
|
||||
|
||||
}
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
if (x < t)
|
||||
{
|
||||
const int dst_ind = ((x - k) << 3) + k;
|
||||
__local float2* dst = smem + dst_ind;
|
||||
|
||||
dst[0] = a0 + a1;
|
||||
dst[block_size] = a4 + a5;
|
||||
dst[2 * block_size] = a2 + a3;
|
||||
dst[3 * block_size] = a6 + a7;
|
||||
dst[4 * block_size] = a0 - a1;
|
||||
dst[5 * block_size] = a4 - a5;
|
||||
dst[6 * block_size] = a2 - a3;
|
||||
dst[7 * block_size] = a6 - a7;
|
||||
}
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
}
|
||||
|
||||
__attribute__((always_inline))
|
||||
void fft_radix3(__local float2* smem, __global const float2* twiddles, const int x, const int block_size, const int t)
|
||||
{
|
||||
float2 a0, a1, a2;
|
||||
|
||||
if (x < t)
|
||||
{
|
||||
a0 = smem[x]; a1 = smem[x+t]; a2 = smem[x+2*t];
|
||||
}
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
if (x < t)
|
||||
butterfly3(a0, a1, a2, smem, twiddles, x, block_size);
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
}
|
||||
|
||||
__attribute__((always_inline))
|
||||
void fft_radix3_B2(__local float2* smem, __global const float2* twiddles, const int x1, const int block_size, const int t)
|
||||
{
|
||||
const int x2 = x1 + t/2;
|
||||
float2 a0, a1, a2, a3, a4, a5;
|
||||
|
||||
if (x1 < t/2)
|
||||
{
|
||||
a0 = smem[x1]; a1 = smem[x1+t]; a2 = smem[x1+2*t];
|
||||
a3 = smem[x2]; a4 = smem[x2+t]; a5 = smem[x2+2*t];
|
||||
}
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
if (x1 < t/2)
|
||||
{
|
||||
butterfly3(a0, a1, a2, smem, twiddles, x1, block_size);
|
||||
butterfly3(a3, a4, a5, smem, twiddles, x2, block_size);
|
||||
}
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
}
|
||||
|
||||
__attribute__((always_inline))
|
||||
void fft_radix3_B3(__local float2* smem, __global const float2* twiddles, const int x1, const int block_size, const int t)
|
||||
{
|
||||
const int x2 = x1 + t/3;
|
||||
const int x3 = x2 + t/3;
|
||||
float2 a0, a1, a2, a3, a4, a5, a6, a7, a8;
|
||||
|
||||
if (x1 < t/2)
|
||||
{
|
||||
a0 = smem[x1]; a1 = smem[x1+t]; a2 = smem[x1+2*t];
|
||||
a3 = smem[x2]; a4 = smem[x2+t]; a5 = smem[x2+2*t];
|
||||
a6 = smem[x3]; a7 = smem[x3+t]; a8 = smem[x3+2*t];
|
||||
}
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
if (x1 < t/2)
|
||||
{
|
||||
butterfly3(a0, a1, a2, smem, twiddles, x1, block_size);
|
||||
butterfly3(a3, a4, a5, smem, twiddles, x2, block_size);
|
||||
butterfly3(a6, a7, a8, smem, twiddles, x3, block_size);
|
||||
}
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
}
|
||||
|
||||
__attribute__((always_inline))
|
||||
void fft_radix3_B4(__local float2* smem, __global const float2* twiddles, const int x1, const int block_size, const int t)
|
||||
{
|
||||
const int thread_block = t/4;
|
||||
const int x2 = x1 + thread_block;
|
||||
const int x3 = x1 + 2*thread_block;
|
||||
const int x4 = x1 + 3*thread_block;
|
||||
float2 a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11;
|
||||
|
||||
if (x1 < t/4)
|
||||
{
|
||||
a0 = smem[x1]; a1 = smem[x1+t]; a2 = smem[x1+2*t];
|
||||
a3 = smem[x2]; a4 = smem[x2+t]; a5 = smem[x2+2*t];
|
||||
a6 = smem[x3]; a7 = smem[x3+t]; a8 = smem[x3+2*t];
|
||||
a9 = smem[x4]; a10 = smem[x4+t]; a11 = smem[x4+2*t];
|
||||
}
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
if (x1 < t/4)
|
||||
{
|
||||
butterfly3(a0, a1, a2, smem, twiddles, x1, block_size);
|
||||
butterfly3(a3, a4, a5, smem, twiddles, x2, block_size);
|
||||
butterfly3(a6, a7, a8, smem, twiddles, x3, block_size);
|
||||
butterfly3(a9, a10, a11, smem, twiddles, x4, block_size);
|
||||
}
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
}
|
||||
|
||||
__attribute__((always_inline))
|
||||
void fft_radix5(__local float2* smem, __global const float2* twiddles, const int x, const int block_size, const int t)
|
||||
{
|
||||
const int k = x % block_size;
|
||||
float2 a0, a1, a2, a3, a4;
|
||||
|
||||
if (x < t)
|
||||
{
|
||||
a0 = smem[x]; a1 = smem[x + t]; a2 = smem[x+2*t]; a3 = smem[x+3*t]; a4 = smem[x+4*t];
|
||||
}
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
if (x < t)
|
||||
butterfly5(a0, a1, a2, a3, a4, smem, twiddles, x, block_size);
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
}
|
||||
|
||||
__attribute__((always_inline))
|
||||
void fft_radix5_B2(__local float2* smem, __global const float2* twiddles, const int x1, const int block_size, const int t)
|
||||
{
|
||||
const int x2 = x1+t/2;
|
||||
float2 a0, a1, a2, a3, a4, a5, a6, a7, a8, a9;
|
||||
|
||||
if (x1 < t/2)
|
||||
{
|
||||
a0 = smem[x1]; a1 = smem[x1 + t]; a2 = smem[x1+2*t]; a3 = smem[x1+3*t]; a4 = smem[x1+4*t];
|
||||
a5 = smem[x2]; a6 = smem[x2 + t]; a7 = smem[x2+2*t]; a8 = smem[x2+3*t]; a9 = smem[x2+4*t];
|
||||
}
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
if (x1 < t/2)
|
||||
{
|
||||
butterfly5(a0, a1, a2, a3, a4, smem, twiddles, x1, block_size);
|
||||
butterfly5(a5, a6, a7, a8, a9, smem, twiddles, x2, block_size);
|
||||
}
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
}
|
||||
|
||||
#ifdef DFT_SCALE
|
||||
#define SCALE_VAL(x, scale) x*scale
|
||||
#else
|
||||
#define SCALE_VAL(x, scale) x
|
||||
#endif
|
||||
|
||||
__kernel void fft_multi_radix_rows(__global const uchar* src_ptr, int src_step, int src_offset, int src_rows, int src_cols,
|
||||
__global uchar* dst_ptr, int dst_step, int dst_offset, int dst_rows, int dst_cols,
|
||||
__global float2* twiddles_ptr, const int t, const int nz)
|
||||
{
|
||||
const int x = get_global_id(0);
|
||||
const int y = get_group_id(1);
|
||||
const int block_size = LOCAL_SIZE/kercn;
|
||||
if (y < nz)
|
||||
{
|
||||
__local float2 smem[LOCAL_SIZE];
|
||||
__global const float2* twiddles = (__global float2*) twiddles_ptr;
|
||||
const int ind = x;
|
||||
#ifdef IS_1D
|
||||
float scale = 1.f/dst_cols;
|
||||
#else
|
||||
float scale = 1.f/(dst_cols*dst_rows);
|
||||
#endif
|
||||
|
||||
#ifdef COMPLEX_INPUT
|
||||
__global const float2* src = (__global const float2*)(src_ptr + mad24(y, src_step, mad24(x, (int)(sizeof(float)*2), src_offset)));
|
||||
#pragma unroll
|
||||
for (int i=0; i<kercn; i++)
|
||||
smem[x+i*block_size] = src[i*block_size];
|
||||
#else
|
||||
__global const float* src = (__global const float*)(src_ptr + mad24(y, src_step, mad24(x, (int)sizeof(float), src_offset)));
|
||||
#pragma unroll
|
||||
for (int i=0; i<kercn; i++)
|
||||
smem[x+i*block_size] = (float2)(src[i*block_size], 0.f);
|
||||
#endif
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
RADIX_PROCESS;
|
||||
|
||||
#ifdef COMPLEX_OUTPUT
|
||||
#ifdef NO_CONJUGATE
|
||||
// copy result without complex conjugate
|
||||
const int cols = dst_cols/2 + 1;
|
||||
#else
|
||||
const int cols = dst_cols;
|
||||
#endif
|
||||
|
||||
__global float2* dst = (__global float2*)(dst_ptr + mad24(y, dst_step, dst_offset));
|
||||
#pragma unroll
|
||||
for (int i=x; i<cols; i+=block_size)
|
||||
dst[i] = SCALE_VAL(smem[i], scale);
|
||||
#else
|
||||
// pack row to CCS
|
||||
__local float* smem_1cn = (__local float*) smem;
|
||||
__global float* dst = (__global float*)(dst_ptr + mad24(y, dst_step, dst_offset));
|
||||
for (int i=x; i<dst_cols-1; i+=block_size)
|
||||
dst[i+1] = SCALE_VAL(smem_1cn[i+2], scale);
|
||||
if (x == 0)
|
||||
dst[0] = SCALE_VAL(smem_1cn[0], scale);
|
||||
#endif
|
||||
}
|
||||
else
|
||||
{
|
||||
// fill with zero other rows
|
||||
#ifdef COMPLEX_OUTPUT
|
||||
__global float2* dst = (__global float2*)(dst_ptr + mad24(y, dst_step, dst_offset));
|
||||
#else
|
||||
__global float* dst = (__global float*)(dst_ptr + mad24(y, dst_step, dst_offset));
|
||||
#endif
|
||||
#pragma unroll
|
||||
for (int i=x; i<dst_cols; i+=block_size)
|
||||
dst[i] = 0.f;
|
||||
}
|
||||
}
|
||||
|
||||
__kernel void fft_multi_radix_cols(__global const uchar* src_ptr, int src_step, int src_offset, int src_rows, int src_cols,
|
||||
__global uchar* dst_ptr, int dst_step, int dst_offset, int dst_rows, int dst_cols,
|
||||
__global float2* twiddles_ptr, const int t, const int nz)
|
||||
{
|
||||
const int x = get_group_id(0);
|
||||
const int y = get_global_id(1);
|
||||
|
||||
if (x < nz)
|
||||
{
|
||||
__local float2 smem[LOCAL_SIZE];
|
||||
__global const uchar* src = src_ptr + mad24(y, src_step, mad24(x, (int)(sizeof(float)*2), src_offset));
|
||||
__global const float2* twiddles = (__global float2*) twiddles_ptr;
|
||||
const int ind = y;
|
||||
const int block_size = LOCAL_SIZE/kercn;
|
||||
float scale = 1.f/(dst_rows*dst_cols);
|
||||
|
||||
#pragma unroll
|
||||
for (int i=0; i<kercn; i++)
|
||||
smem[y+i*block_size] = *((__global const float2*)(src + i*block_size*src_step));
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
RADIX_PROCESS;
|
||||
|
||||
#ifdef COMPLEX_OUTPUT
|
||||
__global uchar* dst = dst_ptr + mad24(y, dst_step, mad24(x, (int)(sizeof(float)*2), dst_offset));
|
||||
#pragma unroll
|
||||
for (int i=0; i<kercn; i++)
|
||||
*((__global float2*)(dst + i*block_size*dst_step)) = SCALE_VAL(smem[y + i*block_size], scale);
|
||||
#else
|
||||
if (x == 0)
|
||||
{
|
||||
// pack first column to CCS
|
||||
__local float* smem_1cn = (__local float*) smem;
|
||||
__global uchar* dst = dst_ptr + mad24(y+1, dst_step, dst_offset);
|
||||
for (int i=y; i<dst_rows-1; i+=block_size, dst+=dst_step*block_size)
|
||||
*((__global float*) dst) = SCALE_VAL(smem_1cn[i+2], scale);
|
||||
if (y == 0)
|
||||
*((__global float*) (dst_ptr + dst_offset)) = SCALE_VAL(smem_1cn[0], scale);
|
||||
}
|
||||
else if (x == (dst_cols+1)/2)
|
||||
{
|
||||
// pack last column to CCS (if needed)
|
||||
__local float* smem_1cn = (__local float*) smem;
|
||||
__global uchar* dst = dst_ptr + mad24(dst_cols-1, (int)sizeof(float), mad24(y+1, dst_step, dst_offset));
|
||||
for (int i=y; i<dst_rows-1; i+=block_size, dst+=dst_step*block_size)
|
||||
*((__global float*) dst) = SCALE_VAL(smem_1cn[i+2], scale);
|
||||
if (y == 0)
|
||||
*((__global float*) (dst_ptr + mad24(dst_cols-1, (int)sizeof(float), dst_offset))) = SCALE_VAL(smem_1cn[0], scale);
|
||||
}
|
||||
else
|
||||
{
|
||||
__global uchar* dst = dst_ptr + mad24(x, (int)sizeof(float)*2, mad24(y, dst_step, dst_offset - (int)sizeof(float)));
|
||||
#pragma unroll
|
||||
for (int i=y; i<dst_rows; i+=block_size, dst+=block_size*dst_step)
|
||||
vstore2(SCALE_VAL(smem[i], scale), 0, (__global float*) dst);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
||||
__kernel void ifft_multi_radix_rows(__global const uchar* src_ptr, int src_step, int src_offset, int src_rows, int src_cols,
|
||||
__global uchar* dst_ptr, int dst_step, int dst_offset, int dst_rows, int dst_cols,
|
||||
__global float2* twiddles_ptr, const int t, const int nz)
|
||||
{
|
||||
const int x = get_global_id(0);
|
||||
const int y = get_group_id(1);
|
||||
const int block_size = LOCAL_SIZE/kercn;
|
||||
#ifdef IS_1D
|
||||
const float scale = 1.f/dst_cols;
|
||||
#else
|
||||
const float scale = 1.f/(dst_cols*dst_rows);
|
||||
#endif
|
||||
|
||||
if (y < nz)
|
||||
{
|
||||
__local float2 smem[LOCAL_SIZE];
|
||||
__global const float2* twiddles = (__global float2*) twiddles_ptr;
|
||||
const int ind = x;
|
||||
|
||||
#if defined(COMPLEX_INPUT) && !defined(NO_CONJUGATE)
|
||||
__global const float2* src = (__global const float2*)(src_ptr + mad24(y, src_step, mad24(x, (int)(sizeof(float)*2), src_offset)));
|
||||
#pragma unroll
|
||||
for (int i=0; i<kercn; i++)
|
||||
{
|
||||
smem[x+i*block_size].x = src[i*block_size].x;
|
||||
smem[x+i*block_size].y = -src[i*block_size].y;
|
||||
}
|
||||
#else
|
||||
|
||||
#if !defined(REAL_INPUT) && defined(NO_CONJUGATE)
|
||||
__global const float2* src = (__global const float2*)(src_ptr + mad24(y, src_step, mad24(2, (int)sizeof(float), src_offset)));
|
||||
|
||||
#pragma unroll
|
||||
for (int i=x; i<(LOCAL_SIZE-1)/2; i+=block_size)
|
||||
{
|
||||
smem[i+1].x = src[i].x;
|
||||
smem[i+1].y = -src[i].y;
|
||||
smem[LOCAL_SIZE-i-1] = src[i];
|
||||
}
|
||||
#else
|
||||
|
||||
#pragma unroll
|
||||
for (int i=x; i<(LOCAL_SIZE-1)/2; i+=block_size)
|
||||
{
|
||||
float2 src = vload2(0, (__global const float*)(src_ptr + mad24(y, src_step, mad24(2*i+1, (int)sizeof(float), src_offset))));
|
||||
|
||||
smem[i+1].x = src.x;
|
||||
smem[i+1].y = -src.y;
|
||||
smem[LOCAL_SIZE-i-1] = src;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
if (x==0)
|
||||
{
|
||||
smem[0].x = *(__global const float*)(src_ptr + mad24(y, src_step, src_offset));
|
||||
smem[0].y = 0.f;
|
||||
|
||||
if(LOCAL_SIZE % 2 ==0)
|
||||
{
|
||||
#if !defined(REAL_INPUT) && defined(NO_CONJUGATE)
|
||||
smem[LOCAL_SIZE/2].x = src[LOCAL_SIZE/2-1].x;
|
||||
#else
|
||||
smem[LOCAL_SIZE/2].x = *(__global const float*)(src_ptr + mad24(y, src_step, mad24(LOCAL_SIZE-1, (int)sizeof(float), src_offset)));
|
||||
#endif
|
||||
smem[LOCAL_SIZE/2].y = 0.f;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
RADIX_PROCESS;
|
||||
|
||||
// copy data to dst
|
||||
#ifdef COMPLEX_OUTPUT
|
||||
__global float2* dst = (__global float*)(dst_ptr + mad24(y, dst_step, mad24(x, (int)(sizeof(float)*2), dst_offset)));
|
||||
#pragma unroll
|
||||
for (int i=0; i<kercn; i++)
|
||||
{
|
||||
dst[i*block_size].x = SCALE_VAL(smem[x + i*block_size].x, scale);
|
||||
dst[i*block_size].y = SCALE_VAL(-smem[x + i*block_size].y, scale);
|
||||
}
|
||||
#else
|
||||
__global float* dst = (__global float*)(dst_ptr + mad24(y, dst_step, mad24(x, (int)(sizeof(float)), dst_offset)));
|
||||
#pragma unroll
|
||||
for (int i=0; i<kercn; i++)
|
||||
{
|
||||
dst[i*block_size] = SCALE_VAL(smem[x + i*block_size].x, scale);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
else
|
||||
{
|
||||
// fill with zero other rows
|
||||
#ifdef COMPLEX_OUTPUT
|
||||
__global float2* dst = (__global float2*)(dst_ptr + mad24(y, dst_step, dst_offset));
|
||||
#else
|
||||
__global float* dst = (__global float*)(dst_ptr + mad24(y, dst_step, dst_offset));
|
||||
#endif
|
||||
#pragma unroll
|
||||
for (int i=x; i<dst_cols; i+=block_size)
|
||||
dst[i] = 0.f;
|
||||
}
|
||||
}
|
||||
|
||||
__kernel void ifft_multi_radix_cols(__global const uchar* src_ptr, int src_step, int src_offset, int src_rows, int src_cols,
|
||||
__global uchar* dst_ptr, int dst_step, int dst_offset, int dst_rows, int dst_cols,
|
||||
__global float2* twiddles_ptr, const int t, const int nz)
|
||||
{
|
||||
const int x = get_group_id(0);
|
||||
const int y = get_global_id(1);
|
||||
|
||||
#ifdef COMPLEX_INPUT
|
||||
if (x < nz)
|
||||
{
|
||||
__local float2 smem[LOCAL_SIZE];
|
||||
__global const uchar* src = src_ptr + mad24(y, src_step, mad24(x, (int)(sizeof(float)*2), src_offset));
|
||||
__global uchar* dst = dst_ptr + mad24(y, dst_step, mad24(x, (int)(sizeof(float)*2), dst_offset));
|
||||
__global const float2* twiddles = (__global float2*) twiddles_ptr;
|
||||
const int ind = y;
|
||||
const int block_size = LOCAL_SIZE/kercn;
|
||||
|
||||
#pragma unroll
|
||||
for (int i=0; i<kercn; i++)
|
||||
{
|
||||
float2 temp = *((__global const float2*)(src + i*block_size*src_step));
|
||||
smem[y+i*block_size].x = temp.x;
|
||||
smem[y+i*block_size].y = -temp.y;
|
||||
}
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
RADIX_PROCESS;
|
||||
|
||||
// copy data to dst
|
||||
#pragma unroll
|
||||
for (int i=0; i<kercn; i++)
|
||||
{
|
||||
__global float2* res = (__global float2*)(dst + i*block_size*dst_step);
|
||||
res[0].x = smem[y + i*block_size].x;
|
||||
res[0].y = -smem[y + i*block_size].y;
|
||||
}
|
||||
}
|
||||
#else
|
||||
if (x < nz)
|
||||
{
|
||||
__global const float2* twiddles = (__global float2*) twiddles_ptr;
|
||||
const int ind = y;
|
||||
const int block_size = LOCAL_SIZE/kercn;
|
||||
|
||||
__local float2 smem[LOCAL_SIZE];
|
||||
#ifdef EVEN
|
||||
if (x!=0 && (x!=(nz-1)))
|
||||
#else
|
||||
if (x!=0)
|
||||
#endif
|
||||
{
|
||||
__global const uchar* src = src_ptr + mad24(y, src_step, mad24(2*x-1, (int)sizeof(float), src_offset));
|
||||
#pragma unroll
|
||||
for (int i=0; i<kercn; i++)
|
||||
{
|
||||
float2 temp = vload2(0, (__global const float*)(src + i*block_size*src_step));
|
||||
smem[y+i*block_size].x = temp.x;
|
||||
smem[y+i*block_size].y = -temp.y;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
int ind = x==0 ? 0: 2*x-1;
|
||||
__global const float* src = (__global const float*)(src_ptr + mad24(1, src_step, mad24(ind, (int)sizeof(float), src_offset)));
|
||||
int step = src_step/(int)sizeof(float);
|
||||
|
||||
#pragma unroll
|
||||
for (int i=y; i<(LOCAL_SIZE-1)/2; i+=block_size)
|
||||
{
|
||||
smem[i+1].x = src[2*i*step];
|
||||
smem[i+1].y = -src[(2*i+1)*step];
|
||||
|
||||
smem[LOCAL_SIZE-i-1].x = src[2*i*step];;
|
||||
smem[LOCAL_SIZE-i-1].y = src[(2*i+1)*step];
|
||||
}
|
||||
if (y==0)
|
||||
{
|
||||
smem[0].x = *(__global const float*)(src_ptr + mad24(ind, (int)sizeof(float), src_offset));
|
||||
smem[0].y = 0.f;
|
||||
|
||||
if(LOCAL_SIZE % 2 ==0)
|
||||
{
|
||||
smem[LOCAL_SIZE/2].x = src[(LOCAL_SIZE-2)*step];
|
||||
smem[LOCAL_SIZE/2].y = 0.f;
|
||||
}
|
||||
}
|
||||
}
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
RADIX_PROCESS;
|
||||
|
||||
// copy data to dst
|
||||
__global uchar* dst = dst_ptr + mad24(y, dst_step, mad24(x, (int)(sizeof(float2)), dst_offset));
|
||||
|
||||
#pragma unroll
|
||||
for (int i=0; i<kercn; i++)
|
||||
{
|
||||
__global float2* res = (__global float2*)(dst + i*block_size*dst_step);
|
||||
res[0].x = smem[y + i*block_size].x;
|
||||
res[0].y = -smem[y + i*block_size].y;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
}
|
||||
@@ -36,114 +36,118 @@
|
||||
|
||||
#if lcn == 1
|
||||
#if dcn == 4
|
||||
#define LUT_OP(num)\
|
||||
int idx = *(__global const int *)(srcptr + mad24(num, src_step, src_index));\
|
||||
dst = (__global dstT *)(dstptr + mad24(num, dst_step, dst_index));\
|
||||
dst[0] = lut_l[idx & 0xff];\
|
||||
dst[1] = lut_l[(idx >> 8) & 0xff];\
|
||||
dst[2] = lut_l[(idx >> 16) & 0xff];\
|
||||
#define LUT_OP \
|
||||
int idx = *(__global const int *)(srcptr + src_index); \
|
||||
dst = (__global dstT *)(dstptr + dst_index); \
|
||||
dst[0] = lut_l[idx & 0xff]; \
|
||||
dst[1] = lut_l[(idx >> 8) & 0xff]; \
|
||||
dst[2] = lut_l[(idx >> 16) & 0xff]; \
|
||||
dst[3] = lut_l[(idx >> 24) & 0xff];
|
||||
#elif dcn == 3
|
||||
#define LUT_OP(num)\
|
||||
uchar3 idx = vload3(0, srcptr + mad24(num, src_step, src_index));\
|
||||
dst = (__global dstT *)(dstptr + mad24(num, dst_step, dst_index));\
|
||||
dst[0] = lut_l[idx.x];\
|
||||
dst[1] = lut_l[idx.y];\
|
||||
#define LUT_OP \
|
||||
uchar3 idx = vload3(0, srcptr + src_index); \
|
||||
dst = (__global dstT *)(dstptr + dst_index); \
|
||||
dst[0] = lut_l[idx.x]; \
|
||||
dst[1] = lut_l[idx.y]; \
|
||||
dst[2] = lut_l[idx.z];
|
||||
#elif dcn == 2
|
||||
#define LUT_OP(num)\
|
||||
short idx = *(__global const short *)(srcptr + mad24(num, src_step, src_index));\
|
||||
dst = (__global dstT *)(dstptr + mad24(num, dst_step, dst_index));\
|
||||
dst[0] = lut_l[idx & 0xff];\
|
||||
#define LUT_OP \
|
||||
short idx = *(__global const short *)(srcptr + src_index); \
|
||||
dst = (__global dstT *)(dstptr + dst_index); \
|
||||
dst[0] = lut_l[idx & 0xff]; \
|
||||
dst[1] = lut_l[(idx >> 8) & 0xff];
|
||||
#elif dcn == 1
|
||||
#define LUT_OP(num)\
|
||||
uchar idx = (srcptr + mad24(num, src_step, src_index))[0];\
|
||||
dst = (__global dstT *)(dstptr + mad24(num, dst_step, dst_index));\
|
||||
#define LUT_OP \
|
||||
uchar idx = (srcptr + src_index)[0]; \
|
||||
dst = (__global dstT *)(dstptr + dst_index); \
|
||||
dst[0] = lut_l[idx];
|
||||
#else
|
||||
#define LUT_OP(num)\
|
||||
__global const srcT * src = (__global const srcT *)(srcptr + mad24(num, src_step, src_index));\
|
||||
dst = (__global dstT *)(dstptr + mad24(num, dst_step, dst_index));\
|
||||
for (int cn = 0; cn < dcn; ++cn)\
|
||||
#define LUT_OP \
|
||||
__global const srcT * src = (__global const srcT *)(srcptr + src_index); \
|
||||
dst = (__global dstT *)(dstptr + dst_index); \
|
||||
for (int cn = 0; cn < dcn; ++cn) \
|
||||
dst[cn] = lut_l[src[cn]];
|
||||
#endif
|
||||
#else
|
||||
#if dcn == 4
|
||||
#define LUT_OP(num)\
|
||||
__global const uchar4 *src_pixel = (__global const uchar4 *)(srcptr + mad24(num, src_step, src_index));\
|
||||
int4 idx = convert_int4(src_pixel[0]) * lcn + (int4)(0, 1, 2, 3);\
|
||||
dst = (__global dstT *)(dstptr + mad24(num, dst_step, dst_index));\
|
||||
dst[0] = lut_l[idx.x];\
|
||||
dst[1] = lut_l[idx.y];\
|
||||
dst[2] = lut_l[idx.z];\
|
||||
#define LUT_OP \
|
||||
__global const uchar4 * src_pixel = (__global const uchar4 *)(srcptr + src_index); \
|
||||
int4 idx = mad24(convert_int4(src_pixel[0]), (int4)(lcn), (int4)(0, 1, 2, 3)); \
|
||||
dst = (__global dstT *)(dstptr + dst_index); \
|
||||
dst[0] = lut_l[idx.x]; \
|
||||
dst[1] = lut_l[idx.y]; \
|
||||
dst[2] = lut_l[idx.z]; \
|
||||
dst[3] = lut_l[idx.w];
|
||||
#elif dcn == 3
|
||||
#define LUT_OP(num)\
|
||||
uchar3 src_pixel = vload3(0, srcptr + mad24(num, src_step, src_index));\
|
||||
int3 idx = convert_int3(src_pixel) * lcn + (int3)(0, 1, 2);\
|
||||
dst = (__global dstT *)(dstptr + mad24(num, dst_step, dst_index));\
|
||||
dst[0] = lut_l[idx.x];\
|
||||
dst[1] = lut_l[idx.y];\
|
||||
#define LUT_OP \
|
||||
uchar3 src_pixel = vload3(0, srcptr + src_index); \
|
||||
int3 idx = mad24(convert_int3(src_pixel), (int3)(lcn), (int3)(0, 1, 2)); \
|
||||
dst = (__global dstT *)(dstptr + dst_index); \
|
||||
dst[0] = lut_l[idx.x]; \
|
||||
dst[1] = lut_l[idx.y]; \
|
||||
dst[2] = lut_l[idx.z];
|
||||
#elif dcn == 2
|
||||
#define LUT_OP(num)\
|
||||
__global const uchar2 *src_pixel = (__global const uchar2 *)(srcptr + mad24(num, src_step, src_index));\
|
||||
int2 idx = convert_int2(src_pixel[0]) * lcn + (int2)(0, 1);\
|
||||
dst = (__global dstT *)(dstptr + mad24(num, dst_step, dst_index));\
|
||||
dst[0] = lut_l[idx.x];\
|
||||
#define LUT_OP \
|
||||
__global const uchar2 * src_pixel = (__global const uchar2 *)(srcptr + src_index); \
|
||||
int2 idx = mad24(convert_int2(src_pixel[0]), lcn, (int2)(0, 1)); \
|
||||
dst = (__global dstT *)(dstptr + dst_index); \
|
||||
dst[0] = lut_l[idx.x]; \
|
||||
dst[1] = lut_l[idx.y];
|
||||
#elif dcn == 1 //error case (1 < lcn) ==> lcn == scn == dcn
|
||||
#define LUT_OP(num)\
|
||||
uchar idx = (srcptr + mad24(num, src_step, src_index))[0];\
|
||||
dst = (__global dstT *)(dstptr + mad24(num, dst_step, dst_index));\
|
||||
#define LUT_OP \
|
||||
uchar idx = (srcptr + src_index)[0]; \
|
||||
dst = (__global dstT *)(dstptr + dst_index); \
|
||||
dst[0] = lut_l[idx];
|
||||
#else
|
||||
#define LUT_OP(num)\
|
||||
__global const srcT *src = (__global const srcT *)(srcptr + mad24(num, src_step, src_index));\
|
||||
dst = (__global dstT *)(dstptr + mad24(num, dst_step, dst_index));\
|
||||
for (int cn = 0; cn < dcn; ++cn)\
|
||||
#define LUT_OP \
|
||||
__global const srcT * src = (__global const srcT *)(srcptr + src_index); \
|
||||
dst = (__global dstT *)(dstptr + dst_index); \
|
||||
for (int cn = 0; cn < dcn; ++cn) \
|
||||
dst[cn] = lut_l[mad24(src[cn], lcn, cn)];
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#define LOCAL_LUT_INIT\
|
||||
{\
|
||||
__global const dstT * lut = (__global const dstT *)(lutptr + lut_offset);\
|
||||
int init = mad24((int)get_local_id(1), (int)get_local_size(0), (int)get_local_id(0));\
|
||||
int step = get_local_size(0) * get_local_size(1);\
|
||||
for (int i = init; i < 256 * lcn; i += step)\
|
||||
{\
|
||||
lut_l[i] = lut[i];\
|
||||
}\
|
||||
barrier(CLK_LOCAL_MEM_FENCE);\
|
||||
}
|
||||
|
||||
__kernel void LUT(__global const uchar * srcptr, int src_step, int src_offset,
|
||||
__global const uchar * lutptr, int lut_step, int lut_offset,
|
||||
__global uchar * dstptr, int dst_step, int dst_offset, int rows, int cols)
|
||||
{
|
||||
__local dstT lut_l[256 * lcn];
|
||||
LOCAL_LUT_INIT;
|
||||
|
||||
int x = get_global_id(0);
|
||||
int y = 4 * get_global_id(1);
|
||||
int y = get_global_id(1) << 2;
|
||||
|
||||
__local dstT lut_l[256 * lcn];
|
||||
__global const dstT * lut = (__global const dstT *)(lutptr + lut_offset);
|
||||
|
||||
for (int i = mad24((int)get_local_id(1), (int)get_local_size(0), (int)get_local_id(0)),
|
||||
step = get_local_size(0) * get_local_size(1); i < 256 * lcn; i += step)
|
||||
lut_l[i] = lut[i];
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
if (x < cols && y < rows)
|
||||
{
|
||||
int src_index = mad24(y, src_step, mad24(x, (int)sizeof(srcT) * dcn, src_offset));
|
||||
int dst_index = mad24(y, dst_step, mad24(x, (int)sizeof(dstT) * dcn, dst_offset));
|
||||
|
||||
__global dstT * dst;
|
||||
LUT_OP(0);
|
||||
|
||||
LUT_OP;
|
||||
|
||||
if (y < rows - 1)
|
||||
{
|
||||
LUT_OP(1);
|
||||
src_index += src_step;
|
||||
dst_index += dst_step;
|
||||
LUT_OP;
|
||||
|
||||
if (y < rows - 2)
|
||||
{
|
||||
LUT_OP(2);
|
||||
src_index += src_step;
|
||||
dst_index += dst_step;
|
||||
LUT_OP;
|
||||
|
||||
if (y < rows - 3)
|
||||
{
|
||||
LUT_OP(3);
|
||||
src_index += src_step;
|
||||
dst_index += dst_step;
|
||||
LUT_OP;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -42,9 +42,13 @@
|
||||
#if wdepth <= 4
|
||||
#define MIN_ABS(a) convertFromU(abs(a))
|
||||
#define MIN_ABS2(a, b) convertFromU(abs_diff(a, b))
|
||||
#define MIN(a, b) min(a, b)
|
||||
#define MAX(a, b) max(a, b)
|
||||
#else
|
||||
#define MIN_ABS(a) fabs(a)
|
||||
#define MIN_ABS2(a, b) fabs(a - b)
|
||||
#define MIN(a, b) fmin(a, b)
|
||||
#define MAX(a, b) fmax(a, b)
|
||||
#endif
|
||||
|
||||
#if kercn != 3
|
||||
@@ -60,44 +64,41 @@
|
||||
#define srcTSIZE (int)sizeof(srcT1)
|
||||
#endif
|
||||
|
||||
#ifdef NEED_MINLOC
|
||||
#define CALC_MINLOC(inc) minloc = id + inc
|
||||
#else
|
||||
#define CALC_MINLOC(inc)
|
||||
#endif
|
||||
|
||||
#ifdef NEED_MAXLOC
|
||||
#define CALC_MAXLOC(inc) maxloc = id + inc
|
||||
#else
|
||||
#define CALC_MAXLOC(inc)
|
||||
#endif
|
||||
|
||||
#ifdef NEED_MINVAL
|
||||
#ifdef NEED_MINLOC
|
||||
#define CALC_MIN(p, inc) \
|
||||
if (minval > temp.p) \
|
||||
{ \
|
||||
minval = temp.p; \
|
||||
CALC_MINLOC(inc); \
|
||||
minloc = id + inc; \
|
||||
}
|
||||
#else
|
||||
#define CALC_MIN(p, inc) \
|
||||
minval = MIN(minval, temp.p);
|
||||
#endif
|
||||
#else
|
||||
#define CALC_MIN(p, inc)
|
||||
#endif
|
||||
|
||||
#ifdef NEED_MAXVAL
|
||||
#ifdef NEED_MAXLOC
|
||||
#define CALC_MAX(p, inc) \
|
||||
if (maxval < temp.p) \
|
||||
{ \
|
||||
maxval = temp.p; \
|
||||
CALC_MAXLOC(inc); \
|
||||
maxloc = id + inc; \
|
||||
}
|
||||
#else
|
||||
#define CALC_MAX(p, inc) \
|
||||
maxval = MAX(maxval, temp.p);
|
||||
#endif
|
||||
#else
|
||||
#define CALC_MAX(p, inc)
|
||||
#endif
|
||||
|
||||
#ifdef OP_CALC2
|
||||
#define CALC_MAX2(p) \
|
||||
if (maxval2 < temp.p) \
|
||||
maxval2 = temp.p;
|
||||
maxval2 = MAX(maxval2, temp.p);
|
||||
#else
|
||||
#define CALC_MAX2(p)
|
||||
#endif
|
||||
@@ -208,25 +209,28 @@ __kernel void minmaxloc(__global const uchar * srcptr, int src_step, int src_off
|
||||
|
||||
#if kercn == 1
|
||||
#ifdef NEED_MINVAL
|
||||
#ifdef NEED_MINLOC
|
||||
if (minval > temp)
|
||||
{
|
||||
minval = temp;
|
||||
#ifdef NEED_MINLOC
|
||||
minloc = id;
|
||||
#endif
|
||||
}
|
||||
#else
|
||||
minval = MIN(minval, temp);
|
||||
#endif
|
||||
#endif
|
||||
#ifdef NEED_MAXVAL
|
||||
#ifdef NEED_MAXLOC
|
||||
if (maxval < temp)
|
||||
{
|
||||
maxval = temp;
|
||||
#ifdef NEED_MAXLOC
|
||||
maxloc = id;
|
||||
#endif
|
||||
}
|
||||
#else
|
||||
maxval = MAX(maxval, temp);
|
||||
#endif
|
||||
#ifdef OP_CALC2
|
||||
if (maxval2 < temp2)
|
||||
maxval2 = temp2;
|
||||
maxval2 = MAX(maxval2, temp2);
|
||||
#endif
|
||||
#endif
|
||||
#elif kercn >= 2
|
||||
@@ -282,32 +286,35 @@ __kernel void minmaxloc(__global const uchar * srcptr, int src_step, int src_off
|
||||
{
|
||||
int lid3 = lid - WGS2_ALIGNED;
|
||||
#ifdef NEED_MINVAL
|
||||
#ifdef NEED_MINLOC
|
||||
if (localmem_min[lid3] >= minval)
|
||||
{
|
||||
#ifdef NEED_MINLOC
|
||||
if (localmem_min[lid3] == minval)
|
||||
localmem_minloc[lid3] = min(localmem_minloc[lid3], minloc);
|
||||
else
|
||||
localmem_minloc[lid3] = minloc,
|
||||
#endif
|
||||
localmem_min[lid3] = minval;
|
||||
localmem_min[lid3] = minval;
|
||||
}
|
||||
#else
|
||||
localmem_min[lid3] = MIN(localmem_min[lid3], minval);
|
||||
#endif
|
||||
#endif
|
||||
#ifdef NEED_MAXVAL
|
||||
#ifdef NEED_MAXLOC
|
||||
if (localmem_max[lid3] <= maxval)
|
||||
{
|
||||
#ifdef NEED_MAXLOC
|
||||
if (localmem_max[lid3] == maxval)
|
||||
localmem_maxloc[lid3] = min(localmem_maxloc[lid3], maxloc);
|
||||
else
|
||||
localmem_maxloc[lid3] = maxloc,
|
||||
#endif
|
||||
localmem_max[lid3] = maxval;
|
||||
localmem_max[lid3] = maxval;
|
||||
}
|
||||
#else
|
||||
localmem_max[lid3] = MAX(localmem_max[lid3], maxval);
|
||||
#endif
|
||||
#endif
|
||||
#ifdef OP_CALC2
|
||||
if (localmem_max2[lid3] < maxval2)
|
||||
localmem_max2[lid3] = maxval2;
|
||||
localmem_max2[lid3] = MAX(localmem_max2[lid3], maxval2);
|
||||
#endif
|
||||
}
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
@@ -319,32 +326,35 @@ __kernel void minmaxloc(__global const uchar * srcptr, int src_step, int src_off
|
||||
int lid2 = lsize + lid;
|
||||
|
||||
#ifdef NEED_MINVAL
|
||||
#ifdef NEED_MINLOC
|
||||
if (localmem_min[lid] >= localmem_min[lid2])
|
||||
{
|
||||
#ifdef NEED_MINLOC
|
||||
if (localmem_min[lid] == localmem_min[lid2])
|
||||
localmem_minloc[lid] = min(localmem_minloc[lid2], localmem_minloc[lid]);
|
||||
else
|
||||
localmem_minloc[lid] = localmem_minloc[lid2],
|
||||
#endif
|
||||
localmem_min[lid] = localmem_min[lid2];
|
||||
localmem_min[lid] = localmem_min[lid2];
|
||||
}
|
||||
#else
|
||||
localmem_min[lid] = MIN(localmem_min[lid], localmem_min[lid2]);
|
||||
#endif
|
||||
#endif
|
||||
#ifdef NEED_MAXVAL
|
||||
#ifdef NEED_MAXLOC
|
||||
if (localmem_max[lid] <= localmem_max[lid2])
|
||||
{
|
||||
#ifdef NEED_MAXLOC
|
||||
if (localmem_max[lid] == localmem_max[lid2])
|
||||
localmem_maxloc[lid] = min(localmem_maxloc[lid2], localmem_maxloc[lid]);
|
||||
else
|
||||
localmem_maxloc[lid] = localmem_maxloc[lid2],
|
||||
#endif
|
||||
localmem_max[lid] = localmem_max[lid2];
|
||||
localmem_max[lid] = localmem_max[lid2];
|
||||
}
|
||||
#else
|
||||
localmem_max[lid] = MAX(localmem_max[lid], localmem_max[lid2]);
|
||||
#endif
|
||||
#endif
|
||||
#ifdef OP_CALC2
|
||||
if (localmem_max2[lid] < localmem_max2[lid2])
|
||||
localmem_max2[lid] = localmem_max2[lid2];
|
||||
localmem_max2[lid] = MAX(localmem_max2[lid], localmem_max2[lid2]);
|
||||
#endif
|
||||
}
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
@@ -379,7 +379,7 @@
|
||||
#define REDUCE_GLOBAL \
|
||||
dstTK temp = convertToDT(loadpix(srcptr + src_index)); \
|
||||
dstTK temp2 = convertToDT(loadpix(src2ptr + src2_index)); \
|
||||
temp = SUM_ABS2(temp, temp2)); \
|
||||
temp = SUM_ABS2(temp, temp2); \
|
||||
FUNC(accumulator, temp.s0); \
|
||||
FUNC(accumulator, temp.s1); \
|
||||
FUNC(accumulator, temp.s2); \
|
||||
|
||||
@@ -81,29 +81,34 @@
|
||||
#define PROCESS_ELEM(acc, value) acc += value
|
||||
#elif defined OCL_CV_REDUCE_MAX
|
||||
#define INIT_VALUE MIN_VAL
|
||||
#define PROCESS_ELEM(acc, value) acc = value > acc ? value : acc
|
||||
#define PROCESS_ELEM(acc, value) acc = max(value, acc)
|
||||
#elif defined OCL_CV_REDUCE_MIN
|
||||
#define INIT_VALUE MAX_VAL
|
||||
#define PROCESS_ELEM(acc, value) acc = value < acc ? value : acc
|
||||
#define PROCESS_ELEM(acc, value) acc = min(value, acc)
|
||||
#else
|
||||
#error "No operation is specified"
|
||||
#endif
|
||||
|
||||
#ifdef OP_REDUCE_PRE
|
||||
|
||||
__kernel void reduce_horz_pre(__global const uchar * srcptr, int src_step, int src_offset, int rows, int cols,
|
||||
__global uchar * bufptr, int buf_step, int buf_offset)
|
||||
__kernel void reduce_horz_opt(__global const uchar * srcptr, int src_step, int src_offset, int rows, int cols,
|
||||
__global uchar * dstptr, int dst_step, int dst_offset
|
||||
#ifdef OCL_CV_REDUCE_AVG
|
||||
, float fscale
|
||||
#endif
|
||||
)
|
||||
{
|
||||
__local bufT lsmem[TILE_HEIGHT][BUF_COLS][cn];
|
||||
|
||||
int x = get_global_id(0);
|
||||
int y = get_global_id(1);
|
||||
if (x < BUF_COLS)
|
||||
int liy = get_local_id(1);
|
||||
if ((x < BUF_COLS) && (y < rows))
|
||||
{
|
||||
int src_index = mad24(y, src_step, mad24(x, (int)sizeof(srcT) * cn, src_offset));
|
||||
int buf_index = mad24(y, buf_step, mad24(x, (int)sizeof(dstT) * cn, buf_offset));
|
||||
|
||||
__global const srcT * src = (__global const srcT *)(srcptr + src_index);
|
||||
__global dstT * buf = (__global dstT *)(bufptr + buf_index);
|
||||
dstT tmp[cn] = { INIT_VALUE };
|
||||
bufT tmp[cn] = { INIT_VALUE };
|
||||
|
||||
int src_step_mul = BUF_COLS * cn;
|
||||
for (int idx = x; idx < cols; idx += BUF_COLS, src += src_step_mul)
|
||||
@@ -111,14 +116,49 @@ __kernel void reduce_horz_pre(__global const uchar * srcptr, int src_step, int s
|
||||
#pragma unroll
|
||||
for (int c = 0; c < cn; ++c)
|
||||
{
|
||||
dstT value = convertToDT(src[c]);
|
||||
bufT value = convertToBufT(src[c]);
|
||||
PROCESS_ELEM(tmp[c], value);
|
||||
}
|
||||
}
|
||||
|
||||
#pragma unroll
|
||||
for (int c = 0; c < cn; ++c)
|
||||
buf[c] = tmp[c];
|
||||
lsmem[liy][x][c] = tmp[c];
|
||||
}
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
if ((x < BUF_COLS / 2) && (y < rows))
|
||||
{
|
||||
#pragma unroll
|
||||
for (int c = 0; c < cn; ++c)
|
||||
{
|
||||
PROCESS_ELEM(lsmem[liy][x][c], lsmem[liy][x + BUF_COLS / 2][c]);
|
||||
}
|
||||
}
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
if ((x == 0) && (y < rows))
|
||||
{
|
||||
int dst_index = mad24(y, dst_step, dst_offset);
|
||||
|
||||
__global dstT * dst = (__global dstT *)(dstptr + dst_index);
|
||||
bufT tmp[cn] = { INIT_VALUE };
|
||||
|
||||
#pragma unroll
|
||||
for (int xin = 0; xin < BUF_COLS / 2; xin ++)
|
||||
{
|
||||
#pragma unroll
|
||||
for (int c = 0; c < cn; ++c)
|
||||
{
|
||||
PROCESS_ELEM(tmp[c], lsmem[liy][xin][c]);
|
||||
}
|
||||
}
|
||||
|
||||
#pragma unroll
|
||||
for (int c = 0; c < cn; ++c)
|
||||
#ifdef OCL_CV_REDUCE_AVG
|
||||
dst[c] = convertToDT(convertToWT(tmp[c]) * fscale);
|
||||
#else
|
||||
dst[c] = convertToDT(tmp[c]);
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -43,20 +43,18 @@
|
||||
//
|
||||
//M*/
|
||||
|
||||
#if cn != 3
|
||||
#define loadpix(addr) *(__global const T *)(addr)
|
||||
#if kercn != 3
|
||||
#define storepix(val, addr) *(__global T *)(addr) = val
|
||||
#define TSIZE (int)sizeof(T)
|
||||
#define scalar scalar_
|
||||
#else
|
||||
#define loadpix(addr) vload3(0, (__global const T1 *)(addr))
|
||||
#define storepix(val, addr) vstore3(val, 0, (__global T1 *)(addr))
|
||||
#define TSIZE ((int)sizeof(T1)*3)
|
||||
#define scalar (T)(scalar_.x, scalar_.y, scalar_.z)
|
||||
#endif
|
||||
|
||||
__kernel void setIdentity(__global uchar * srcptr, int src_step, int src_offset, int rows, int cols,
|
||||
ST scalar_, int rowsPerWI)
|
||||
ST scalar_)
|
||||
{
|
||||
int x = get_global_id(0);
|
||||
int y0 = get_global_id(1) * rowsPerWI;
|
||||
@@ -65,7 +63,35 @@ __kernel void setIdentity(__global uchar * srcptr, int src_step, int src_offset,
|
||||
{
|
||||
int src_index = mad24(y0, src_step, mad24(x, TSIZE, src_offset));
|
||||
|
||||
for (int y = y0, y1 = min(rows, y0 + rowsPerWI); y < y1; ++y, src_index += src_step)
|
||||
storepix(x == y ? scalar : (T)(0), srcptr + src_index);
|
||||
#if kercn == cn
|
||||
#pragma unroll
|
||||
for (int y = y0, i = 0, y1 = min(rows, y0 + rowsPerWI); i < rowsPerWI; ++y, ++i, src_index += src_step)
|
||||
if (y < y1)
|
||||
storepix(x == y ? scalar : (T)(0), srcptr + src_index);
|
||||
#elif kercn == 4 && cn == 1
|
||||
if (y0 < rows)
|
||||
{
|
||||
storepix(x == y0 >> 2 ? (T)(scalar, 0, 0, 0) : (T)(0), srcptr + src_index);
|
||||
if (++y0 < rows)
|
||||
{
|
||||
src_index += src_step;
|
||||
storepix(x == y0 >> 2 ? (T)(0, scalar, 0, 0) : (T)(0), srcptr + src_index);
|
||||
|
||||
if (++y0 < rows)
|
||||
{
|
||||
src_index += src_step;
|
||||
storepix(x == y0 >> 2 ? (T)(0, 0, scalar, 0) : (T)(0), srcptr + src_index);
|
||||
|
||||
if (++y0 < rows)
|
||||
{
|
||||
src_index += src_step;
|
||||
storepix(x == y0 >> 2 ? (T)(0, 0, 0, scalar) : (T)(0), srcptr + src_index);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
#else
|
||||
#error "Incorrect combination of cn && kercn"
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
||||
@@ -479,9 +479,10 @@ static bool ocl_sum( InputArray _src, Scalar & res, int sum_op, InputArray _mask
|
||||
haveMask = _mask.kind() != _InputArray::NONE,
|
||||
haveSrc2 = _src2.kind() != _InputArray::NONE;
|
||||
int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type),
|
||||
kercn = cn == 1 && !haveMask ? ocl::predictOptimalVectorWidth(_src) : 1,
|
||||
kercn = cn == 1 && !haveMask ? ocl::predictOptimalVectorWidth(_src, _src2) : 1,
|
||||
mcn = std::max(cn, kercn);
|
||||
CV_Assert(!haveSrc2 || _src2.type() == type);
|
||||
int convert_cn = haveSrc2 ? mcn : cn;
|
||||
|
||||
if ( (!doubleSupport && depth == CV_64F) || cn > 4 )
|
||||
return false;
|
||||
@@ -513,7 +514,7 @@ static bool ocl_sum( InputArray _src, Scalar & res, int sum_op, InputArray _mask
|
||||
haveMask && _mask.isContinuous() ? " -D HAVE_MASK_CONT" : "", kercn,
|
||||
haveSrc2 ? " -D HAVE_SRC2" : "", calc2 ? " -D OP_CALC2" : "",
|
||||
haveSrc2 && _src2.isContinuous() ? " -D HAVE_SRC2_CONT" : "",
|
||||
depth <= CV_32S && ddepth == CV_32S ? ocl::convertTypeStr(CV_8U, ddepth, mcn, cvt[1]) : "noconvert");
|
||||
depth <= CV_32S && ddepth == CV_32S ? ocl::convertTypeStr(CV_8U, ddepth, convert_cn, cvt[1]) : "noconvert");
|
||||
|
||||
ocl::Kernel k("reduce", ocl::core::reduce_oclsrc, opts);
|
||||
if (k.empty())
|
||||
@@ -918,8 +919,14 @@ static bool ocl_meanStdDev( InputArray _src, OutputArray _mean, OutputArray _sdv
|
||||
int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
|
||||
bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0,
|
||||
isContinuous = _src.isContinuous();
|
||||
int groups = ocl::Device::getDefault().maxComputeUnits();
|
||||
size_t wgs = ocl::Device::getDefault().maxWorkGroupSize();
|
||||
const ocl::Device &defDev = ocl::Device::getDefault();
|
||||
int groups = defDev.maxComputeUnits();
|
||||
if (defDev.isIntel())
|
||||
{
|
||||
static const int subSliceEUCount = 10;
|
||||
groups = (groups / subSliceEUCount) * 2;
|
||||
}
|
||||
size_t wgs = defDev.maxWorkGroupSize();
|
||||
|
||||
int ddepth = std::max(CV_32S, depth), sqddepth = std::max(CV_32F, depth),
|
||||
dtype = CV_MAKE_TYPE(ddepth, cn),
|
||||
@@ -1445,6 +1452,9 @@ static bool ocl_minMaxIdx( InputArray _src, double* minVal, double* maxVal, int*
|
||||
|
||||
CV_Assert(!haveSrc2 || _src2.type() == type);
|
||||
|
||||
if (depth == CV_32S || depth == CV_32F)
|
||||
return false;
|
||||
|
||||
if ((depth == CV_64F || ddepth == CV_64F) && !doubleSupport)
|
||||
return false;
|
||||
|
||||
@@ -2178,6 +2188,9 @@ static bool ocl_norm( InputArray _src, int normType, InputArray _mask, double &
|
||||
(!doubleSupport && depth == CV_64F))
|
||||
return false;
|
||||
|
||||
if( depth == CV_32F && (!_mask.empty() || normType == NORM_INF) )
|
||||
return false;
|
||||
|
||||
UMat src = _src.getUMat();
|
||||
|
||||
if (normType == NORM_INF)
|
||||
@@ -2270,7 +2283,7 @@ double cv::norm( InputArray _src, int normType, InputArray _mask )
|
||||
|
||||
setIppErrorStatus();
|
||||
}
|
||||
typedef IppStatus (CV_STDCALL* ippiMaskNormFuncC3)(const void *, int, const void *, int, IppiSize, int, Ipp64f *);
|
||||
/*typedef IppStatus (CV_STDCALL* ippiMaskNormFuncC3)(const void *, int, const void *, int, IppiSize, int, Ipp64f *);
|
||||
ippiMaskNormFuncC3 ippFuncC3 =
|
||||
normType == NORM_INF ?
|
||||
(type == CV_8UC3 ? (ippiMaskNormFuncC3)ippiNorm_Inf_8u_C3CMR :
|
||||
@@ -2305,7 +2318,7 @@ double cv::norm( InputArray _src, int normType, InputArray _mask )
|
||||
return normType == NORM_L2SQR ? (double)(norm * norm) : (double)norm;
|
||||
}
|
||||
setIppErrorStatus();
|
||||
}
|
||||
}*/
|
||||
}
|
||||
else
|
||||
{
|
||||
@@ -2533,7 +2546,7 @@ static bool ocl_norm( InputArray _src1, InputArray _src2, int normType, InputArr
|
||||
normType &= ~NORM_RELATIVE;
|
||||
bool normsum = normType == NORM_L1 || normType == NORM_L2 || normType == NORM_L2SQR;
|
||||
|
||||
if ( !(normType == NORM_INF || normsum) )
|
||||
if ( !normsum || !_mask.empty() )
|
||||
return false;
|
||||
|
||||
if (normsum)
|
||||
@@ -2711,7 +2724,7 @@ double cv::norm( InputArray _src1, InputArray _src2, int normType, InputArray _m
|
||||
0) :
|
||||
normType == NORM_L1 ?
|
||||
(type == CV_8UC1 ? (ippiMaskNormDiffFuncC1)ippiNormDiff_L1_8u_C1MR :
|
||||
type == CV_8SC1 ? (ippiMaskNormDiffFuncC1)ippiNormDiff_L1_8s_C1MR :
|
||||
//type == CV_8SC1 ? (ippiMaskNormDiffFuncC1)ippiNormDiff_L1_8s_C1MR :
|
||||
type == CV_16UC1 ? (ippiMaskNormDiffFuncC1)ippiNormDiff_L1_16u_C1MR :
|
||||
type == CV_32FC1 ? (ippiMaskNormDiffFuncC1)ippiNormDiff_L1_32f_C1MR :
|
||||
0) :
|
||||
@@ -2728,7 +2741,7 @@ double cv::norm( InputArray _src1, InputArray _src2, int normType, InputArray _m
|
||||
return normType == NORM_L2SQR ? (double)(norm * norm) : (double)norm;
|
||||
setIppErrorStatus();
|
||||
}
|
||||
typedef IppStatus (CV_STDCALL* ippiMaskNormDiffFuncC3)(const void *, int, const void *, int, const void *, int, IppiSize, int, Ipp64f *);
|
||||
/*typedef IppStatus (CV_STDCALL* ippiMaskNormDiffFuncC3)(const void *, int, const void *, int, const void *, int, IppiSize, int, Ipp64f *);
|
||||
ippiMaskNormDiffFuncC3 ippFuncC3 =
|
||||
normType == NORM_INF ?
|
||||
(type == CV_8UC3 ? (ippiMaskNormDiffFuncC3)ippiNormDiff_Inf_8u_C3CMR :
|
||||
@@ -2763,7 +2776,7 @@ double cv::norm( InputArray _src1, InputArray _src2, int normType, InputArray _m
|
||||
return normType == NORM_L2SQR ? (double)(norm * norm) : (double)norm;
|
||||
}
|
||||
setIppErrorStatus();
|
||||
}
|
||||
}*/
|
||||
}
|
||||
else
|
||||
{
|
||||
|
||||
@@ -829,7 +829,7 @@ OCL_TEST_P(Pow, Mat)
|
||||
{
|
||||
static const double pows[] = { -4, -1, -2.5, 0, 1, 2, 3.7, 4 };
|
||||
|
||||
for (int j = 0; j < test_loop_times; j++)
|
||||
for (int j = 0; j < 1/*test_loop_times*/; j++)
|
||||
for (int k = 0, size = sizeof(pows) / sizeof(double); k < size; ++k)
|
||||
{
|
||||
SCOPED_TRACE(pows[k]);
|
||||
@@ -1203,7 +1203,7 @@ OCL_TEST_P(MinMaxIdx_Mask, Mat)
|
||||
|
||||
static bool relativeError(double actual, double expected, double eps)
|
||||
{
|
||||
return std::abs(actual - expected) / actual < eps;
|
||||
return std::abs(actual - expected) < eps*(1 + std::abs(actual));
|
||||
}
|
||||
|
||||
typedef ArithmTestBase Norm;
|
||||
@@ -1230,7 +1230,7 @@ OCL_TEST_P(Norm, NORM_INF_1arg_mask)
|
||||
OCL_OFF(const double cpuRes = cv::norm(src1_roi, NORM_INF, mask_roi));
|
||||
OCL_ON(const double gpuRes = cv::norm(usrc1_roi, NORM_INF, umask_roi));
|
||||
|
||||
EXPECT_NEAR(cpuRes, gpuRes, 0.1);
|
||||
EXPECT_NEAR(cpuRes, gpuRes, 0.2);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1302,7 +1302,7 @@ OCL_TEST_P(Norm, NORM_INF_2args)
|
||||
OCL_OFF(const double cpuRes = cv::norm(src1_roi, src2_roi, type));
|
||||
OCL_ON(const double gpuRes = cv::norm(usrc1_roi, usrc2_roi, type));
|
||||
|
||||
EXPECT_NEAR(cpuRes, gpuRes, 0.1);
|
||||
EXPECT_NEAR(cpuRes, gpuRes, 0.2);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -48,17 +48,26 @@
|
||||
|
||||
#ifdef HAVE_OPENCL
|
||||
|
||||
enum OCL_FFT_TYPE
|
||||
{
|
||||
R2R = 0,
|
||||
C2R = 1,
|
||||
R2C = 2,
|
||||
C2C = 3
|
||||
};
|
||||
|
||||
namespace cvtest {
|
||||
namespace ocl {
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
// Dft
|
||||
|
||||
PARAM_TEST_CASE(Dft, cv::Size, MatDepth, bool, bool, bool, bool)
|
||||
PARAM_TEST_CASE(Dft, cv::Size, OCL_FFT_TYPE, bool, bool, bool, bool)
|
||||
{
|
||||
cv::Size dft_size;
|
||||
int dft_flags, depth;
|
||||
bool inplace;
|
||||
int dft_flags, depth, cn, dft_type;
|
||||
bool hint;
|
||||
bool is1d;
|
||||
|
||||
TEST_DECLARE_INPUT_PARAMETER(src);
|
||||
TEST_DECLARE_OUTPUT_PARAMETER(dst);
|
||||
@@ -66,34 +75,50 @@ PARAM_TEST_CASE(Dft, cv::Size, MatDepth, bool, bool, bool, bool)
|
||||
virtual void SetUp()
|
||||
{
|
||||
dft_size = GET_PARAM(0);
|
||||
depth = GET_PARAM(1);
|
||||
inplace = GET_PARAM(2);
|
||||
dft_type = GET_PARAM(1);
|
||||
depth = CV_32F;
|
||||
|
||||
dft_flags = 0;
|
||||
switch (dft_type)
|
||||
{
|
||||
case R2R: dft_flags |= cv::DFT_REAL_OUTPUT; cn = 1; break;
|
||||
case C2R: dft_flags |= cv::DFT_REAL_OUTPUT; cn = 2; break;
|
||||
case R2C: dft_flags |= cv::DFT_COMPLEX_OUTPUT; cn = 1; break;
|
||||
case C2C: dft_flags |= cv::DFT_COMPLEX_OUTPUT; cn = 2; break;
|
||||
}
|
||||
|
||||
if (GET_PARAM(2))
|
||||
dft_flags |= cv::DFT_INVERSE;
|
||||
if (GET_PARAM(3))
|
||||
dft_flags |= cv::DFT_ROWS;
|
||||
if (GET_PARAM(4))
|
||||
dft_flags |= cv::DFT_SCALE;
|
||||
if (GET_PARAM(5))
|
||||
dft_flags |= cv::DFT_INVERSE;
|
||||
hint = GET_PARAM(5);
|
||||
is1d = (dft_flags & DFT_ROWS) != 0 || dft_size.height == 1;
|
||||
}
|
||||
|
||||
void generateTestData(int cn = 2)
|
||||
void generateTestData()
|
||||
{
|
||||
src = randomMat(dft_size, CV_MAKE_TYPE(depth, cn), 0.0, 100.0);
|
||||
usrc = src.getUMat(ACCESS_READ);
|
||||
|
||||
if (inplace)
|
||||
dst = src, udst = usrc;
|
||||
}
|
||||
};
|
||||
|
||||
OCL_TEST_P(Dft, C2C)
|
||||
OCL_TEST_P(Dft, Mat)
|
||||
{
|
||||
generateTestData();
|
||||
|
||||
OCL_OFF(cv::dft(src, dst, dft_flags | cv::DFT_COMPLEX_OUTPUT));
|
||||
OCL_ON(cv::dft(usrc, udst, dft_flags | cv::DFT_COMPLEX_OUTPUT));
|
||||
int nonzero_rows = hint ? src.cols - randomInt(1, src.rows-1) : 0;
|
||||
OCL_OFF(cv::dft(src, dst, dft_flags, nonzero_rows));
|
||||
OCL_ON(cv::dft(usrc, udst, dft_flags, nonzero_rows));
|
||||
|
||||
// In case forward R2C 1d tranform dst contains only half of output
|
||||
// without complex conjugate
|
||||
if (dft_type == R2C && is1d && (dft_flags & cv::DFT_INVERSE) == 0)
|
||||
{
|
||||
dst = dst(cv::Range(0, dst.rows), cv::Range(0, dst.cols/2 + 1));
|
||||
udst = udst(cv::Range(0, udst.rows), cv::Range(0, udst.cols/2 + 1));
|
||||
}
|
||||
|
||||
double eps = src.size().area() * 1e-4;
|
||||
EXPECT_MAT_NEAR(dst, udst, eps);
|
||||
@@ -150,15 +175,15 @@ OCL_TEST_P(MulSpectrums, Mat)
|
||||
|
||||
OCL_INSTANTIATE_TEST_CASE_P(OCL_ImgProc, MulSpectrums, testing::Combine(Bool(), Bool()));
|
||||
|
||||
OCL_INSTANTIATE_TEST_CASE_P(Core, Dft, Combine(Values(cv::Size(2, 3), cv::Size(5, 4), cv::Size(25, 20),
|
||||
cv::Size(512, 1), cv::Size(1024, 768)),
|
||||
Values(CV_32F, CV_64F),
|
||||
Bool(), // inplace
|
||||
OCL_INSTANTIATE_TEST_CASE_P(Core, Dft, Combine(Values(cv::Size(10, 10), cv::Size(36, 36), cv::Size(512, 1), cv::Size(1280, 768)),
|
||||
Values((OCL_FFT_TYPE) R2C, (OCL_FFT_TYPE) C2C, (OCL_FFT_TYPE) R2R, (OCL_FFT_TYPE) C2R),
|
||||
Bool(), // DFT_INVERSE
|
||||
Bool(), // DFT_ROWS
|
||||
Bool(), // DFT_SCALE
|
||||
Bool()) // DFT_INVERSE
|
||||
Bool() // hint
|
||||
)
|
||||
);
|
||||
|
||||
} } // namespace cvtest::ocl
|
||||
|
||||
#endif // HAVE_OPENCL
|
||||
#endif // HAVE_OPENCL
|
||||
Reference in New Issue
Block a user