From 66ac46214d4bb7685b78d0b3aed4c67b5cdb76a7 Mon Sep 17 00:00:00 2001 From: Alexander Karsakov Date: Wed, 23 Jul 2014 12:13:09 +0400 Subject: [PATCH] Final refactoring, fixes --- modules/core/perf/opencl/perf_arithm.cpp | 2 +- modules/core/perf/opencl/perf_dxt.cpp | 35 +- modules/core/src/dxt.cpp | 747 +++++++++++------------ modules/core/src/ocl.cpp | 8 +- modules/core/src/opencl/fft.cl | 105 ++-- modules/core/test/ocl/test_dft.cpp | 75 +-- 6 files changed, 468 insertions(+), 504 deletions(-) diff --git a/modules/core/perf/opencl/perf_arithm.cpp b/modules/core/perf/opencl/perf_arithm.cpp index ba808b494..17badca76 100644 --- a/modules/core/perf/opencl/perf_arithm.cpp +++ b/modules/core/perf/opencl/perf_arithm.cpp @@ -292,7 +292,7 @@ OCL_PERF_TEST_P(MagnitudeFixture, Magnitude, ::testing::Combine( typedef Size_MatType TransposeFixture; OCL_PERF_TEST_P(TransposeFixture, Transpose, ::testing::Combine( - OCL_TEST_SIZES, Values(CV_8UC1, CV_32FC1, CV_8UC2, CV_32FC2, CV_8UC4, CV_32FC4))) + OCL_TEST_SIZES, OCL_TEST_TYPES_134)) { const Size_MatType_t params = GetParam(); const Size srcSize = get<0>(params); diff --git a/modules/core/perf/opencl/perf_dxt.cpp b/modules/core/perf/opencl/perf_dxt.cpp index 797b2c533..d0219913b 100644 --- a/modules/core/perf/opencl/perf_dxt.cpp +++ b/modules/core/perf/opencl/perf_dxt.cpp @@ -54,40 +54,21 @@ namespace ocl { ///////////// dft //////////////////////// -enum OCL_FFT_TYPE -{ - R2R = 0, // real to real (CCS) - C2R = 1, // complex to real - R2C = 2, // real to complex - C2C = 3 // complex to complex -}; - -typedef tuple DftParams; +typedef tuple DftParams; typedef TestBaseWithParam DftFixture; -OCL_PERF_TEST_P(DftFixture, Dft, ::testing::Combine(Values(C2C, R2R, C2R, R2C), - Values(OCL_SIZE_1, OCL_SIZE_2, OCL_SIZE_3, Size(1024, 1024), Size(512, 512), Size(2048, 2048)), - Values((int) 0, (int)DFT_ROWS, (int)DFT_SCALE/*, (int)DFT_INVERSE, - (int)DFT_INVERSE | DFT_SCALE, (int)DFT_ROWS | DFT_INVERSE*/))) +OCL_PERF_TEST_P(DftFixture, Dft, ::testing::Combine(Values(OCL_SIZE_1, OCL_SIZE_2, OCL_SIZE_3), + Values((int)DFT_ROWS, (int)DFT_SCALE, (int)DFT_INVERSE, + (int)DFT_INVERSE | DFT_SCALE, (int)DFT_ROWS | DFT_INVERSE))) { const DftParams params = GetParam(); - const int dft_type = get<0>(params); - const Size srcSize = get<1>(params); - int flags = get<2>(params); - - int in_cn, out_cn; - switch (dft_type) - { - case R2R: flags |= cv::DFT_REAL_OUTPUT; in_cn = 1; out_cn = 1; break; - case C2R: flags |= cv::DFT_REAL_OUTPUT; in_cn = 2; out_cn = 2; break; - case R2C: flags |= cv::DFT_COMPLEX_OUTPUT; in_cn = 1; out_cn = 2; break; - case C2C: flags |= cv::DFT_COMPLEX_OUTPUT; in_cn = 2; out_cn = 2; break; - } + const Size srcSize = get<0>(params); + const int flags = get<1>(params); - UMat src(srcSize, CV_MAKE_TYPE(CV_32F, in_cn)), dst(srcSize, CV_MAKE_TYPE(CV_32F, out_cn)); + UMat src(srcSize, CV_32FC2), dst(srcSize, CV_32FC2); declare.in(src, WARMUP_RNG).out(dst); - OCL_TEST_CYCLE() cv::dft(src, dst, flags); + OCL_TEST_CYCLE() cv::dft(src, dst, flags | DFT_COMPLEX_OUTPUT); SANITY_CHECK(dst, 1e-3); } diff --git a/modules/core/src/dxt.cpp b/modules/core/src/dxt.cpp index 869409f50..cb0b118bc 100644 --- a/modules/core/src/dxt.cpp +++ b/modules/core/src/dxt.cpp @@ -1781,6 +1781,377 @@ static bool ippi_DFT_R_32F(const Mat& src, Mat& dst, bool inv, int norm_flag) #endif } +#ifdef HAVE_OPENCL + +namespace cv +{ + +enum FftType +{ + R2R = 0, + C2R = 1, + R2C = 2, + C2C = 3 +}; + +static void ocl_getRadixes(int cols, std::vector& radixes, std::vector& blocks, int& min_radix) +{ + int factors[34]; + int nf = DFTFactorize(cols, factors); + + int n = 1; + int factor_index = 0; + min_radix = INT_MAX; + + // 2^n transforms + if ((factors[factor_index] & 1) == 0) + { + for( ; n < factors[factor_index];) + { + int radix = 2, block = 1; + if (8*n <= factors[0]) + radix = 8; + else if (4*n <= factors[0]) + { + radix = 4; + if (cols % 12 == 0) + block = 3; + else if (cols % 8 == 0) + block = 2; + } + else + { + if (cols % 10 == 0) + block = 5; + else if (cols % 8 == 0) + block = 4; + else if (cols % 6 == 0) + block = 3; + else if (cols % 4 == 0) + block = 2; + } + + radixes.push_back(radix); + blocks.push_back(block); + min_radix = min(min_radix, block*radix); + n *= radix; + } + factor_index++; + } + + // all the other transforms + for( ; factor_index < nf; factor_index++) + { + int radix = factors[factor_index], block = 1; + if (radix == 3) + { + if (cols % 12 == 0) + block = 4; + else if (cols % 9 == 0) + block = 3; + else if (cols % 6 == 0) + block = 2; + } + else if (radix == 5) + { + if (cols % 10 == 0) + block = 2; + } + radixes.push_back(radix); + blocks.push_back(block); + min_radix = min(min_radix, block*radix); + } +} + +struct OCL_FftPlan +{ + UMat twiddles; + String buildOptions; + int thread_count; + + int dft_size; + bool status; + OCL_FftPlan(int _size): dft_size(_size), status(true) + { + int min_radix; + std::vector radixes, blocks; + ocl_getRadixes(dft_size, radixes, blocks, min_radix); + thread_count = dft_size / min_radix; + + if (thread_count > (int) ocl::Device::getDefault().maxWorkGroupSize()) + { + status = false; + return; + } + + // generate string with radix calls + String radix_processing; + int n = 1, twiddle_size = 0; + for (size_t i=0; i 1) + radix_processing += format("fft_radix%d_B%d(smem,twiddles+%d,ind,%d,%d);", radix, block, twiddle_size, n, dft_size/radix); + else + radix_processing += format("fft_radix%d(smem,twiddles+%d,ind,%d,%d);", radix, twiddle_size, n, dft_size/radix); + twiddle_size += (radix-1)*n; + n *= radix; + } + + Mat tw(1, twiddle_size, CV_32FC2); + float* ptr = tw.ptr(); + int ptr_index = 0; + + n = 1; + for (size_t i=0; idft_size == dft_size) + { + return plan; + } + } + + OCL_FftPlan * newPlan = new OCL_FftPlan(dft_size); + planStorage.push_back(newPlan); + return newPlan; + } + + ~OCL_FftPlanCache() + { + for (std::vector::iterator i = planStorage.begin(), end = planStorage.end(); i != end; ++i) + delete (*i); + planStorage.clear(); + } + +protected: + OCL_FftPlanCache() : + planStorage() + { + } + + std::vector planStorage; +}; + +static bool ocl_dft_C2C_rows(InputArray _src, OutputArray _dst, int nonzero_rows, int flags, int fftType) +{ + const OCL_FftPlan* plan = OCL_FftPlanCache::getInstance().getFftPlan(_src.cols()); + return plan->enqueueTransform(_src, _dst, nonzero_rows, flags, fftType, true); +} + +static bool ocl_dft_C2C_cols(InputArray _src, OutputArray _dst, int nonzero_cols, int flags, int fftType) +{ + const OCL_FftPlan* plan = OCL_FftPlanCache::getInstance().getFftPlan(_src.rows()); + return plan->enqueueTransform(_src, _dst, nonzero_cols, flags, fftType, false); +} + +static bool ocl_dft(InputArray _src, OutputArray _dst, int flags, int nonzero_rows) +{ + int type = _src.type(), cn = CV_MAT_CN(type); + Size ssize = _src.size(); + if ( !(type == CV_32FC1 || type == CV_32FC2) ) + return false; + + // if is not a multiplication of prime numbers { 2, 3, 5 } + if (ssize.area() != getOptimalDFTSize(ssize.area())) + return false; + + UMat src = _src.getUMat(); + int complex_input = cn == 2 ? 1 : 0; + int complex_output = (flags & DFT_COMPLEX_OUTPUT) != 0; + int real_input = cn == 1 ? 1 : 0; + int real_output = (flags & DFT_REAL_OUTPUT) != 0; + bool inv = (flags & DFT_INVERSE) != 0 ? 1 : 0; + + if( nonzero_rows <= 0 || nonzero_rows > _src.rows() ) + nonzero_rows = _src.rows(); + bool is1d = (flags & DFT_ROWS) != 0 || nonzero_rows == 1; + + // if output format is not specified + if (complex_output + real_output == 0) + { + if (real_input) + real_output = 1; + else + complex_output = 1; + } + + FftType fftType = (FftType)(complex_input << 0 | complex_output << 1); + + // Forward Complex to CCS not supported + if (fftType == C2R && !inv) + fftType = C2C; + + // Inverse CCS to Complex not supported + if (fftType == R2C && inv) + fftType = R2R; + + UMat output; + if (fftType == C2C || fftType == R2C) + { + // complex output + _dst.create(src.size(), CV_32FC2); + output = _dst.getUMat(); + } + else + { + // real output + if (is1d) + { + _dst.create(src.size(), CV_32FC1); + output = _dst.getUMat(); + } + else + { + _dst.create(src.size(), CV_32FC1); + output.create(src.size(), CV_32FC2); + } + } + + if (!inv) + { + if (!ocl_dft_C2C_rows(src, output, nonzero_rows, flags, fftType)) + return false; + + if (!is1d) + { + int nonzero_cols = fftType == R2R ? output.cols/2 + 1 : output.cols; + if (!ocl_dft_C2C_cols(output, _dst, nonzero_cols, flags, fftType)) + return false; + } + } + else + { + if (fftType == C2C) + { + // complex output + if (!ocl_dft_C2C_rows(src, output, nonzero_rows, flags, fftType)) + return false; + + if (!is1d) + { + if (!ocl_dft_C2C_cols(output, output, output.cols, flags, fftType)) + return false; + } + } + else + { + if (is1d) + { + if (!ocl_dft_C2C_rows(src, output, nonzero_rows, flags, fftType)) + return false; + } + else + { + int nonzero_cols = src.cols/2 + 1; + if (!ocl_dft_C2C_cols(src, output, nonzero_cols, flags, fftType)) + return false; + + if (!ocl_dft_C2C_rows(output, _dst, nonzero_rows, flags, fftType)) + return false; + } + } + } + return true; +} + +} // namespace cv; + +#endif + #ifdef HAVE_CLAMDFFT namespace cv { @@ -2011,7 +2382,6 @@ static bool ocl_dft_amdfft(InputArray _src, OutputArray _dst, int flags) tmpBuffer.addref(); clSetEventCallback(e, CL_COMPLETE, oclCleanupCallback, tmpBuffer.u); - return true; } @@ -2021,381 +2391,6 @@ static bool ocl_dft_amdfft(InputArray _src, OutputArray _dst, int flags) #endif // HAVE_CLAMDFFT -namespace cv -{ - -#ifdef HAVE_OPENCL - -enum FftType -{ - R2R = 0, - C2R = 1, - R2C = 2, - C2C = 3 -}; - -static void ocl_getRadixes(int cols, std::vector& radixes, std::vector& blocks, int& min_radix) -{ - int factors[34]; - int nf = DFTFactorize(cols, factors); - - int n = 1; - int factor_index = 0; - min_radix = INT_MAX; - - // 2^n transforms - if ((factors[factor_index] & 1) == 0) - { - for( ; n < factors[factor_index];) - { - int radix = 2, block = 1; - if (8*n <= factors[0]) - radix = 8; - else if (4*n <= factors[0]) - { - radix = 4; - if (cols % 12 == 0) - block = 3; - else if (cols % 8 == 0) - block = 2; - } - else - { - if (cols % 10 == 0) - block = 5; - else if (cols % 8 == 0) - block = 4; - else if (cols % 6 == 0) - block = 3; - else if (cols % 4 == 0) - block = 2; - } - - radixes.push_back(radix); - blocks.push_back(block); - min_radix = min(min_radix, block*radix); - n *= radix; - } - factor_index++; - } - - // all the other transforms - for( ; factor_index < nf; factor_index++) - { - int radix = factors[factor_index], block = 1; - if (radix == 3) - { - if (cols % 12 == 0) - block = 4; - else if (cols % 9 == 0) - block = 3; - else if (cols % 6 == 0) - block = 2; - } - else if (radix == 5) - { - if (cols % 10 == 0) - block = 2; - } - radixes.push_back(radix); - blocks.push_back(block); - min_radix = min(min_radix, block*radix); - } -} - -struct OCL_FftPlan -{ - UMat twiddles; - String buildOptions; - int thread_count; - - int dft_size; - bool status; - OCL_FftPlan(int _size): dft_size(_size), status(true) - { - int min_radix; - std::vector radixes, blocks; - ocl_getRadixes(dft_size, radixes, blocks, min_radix); - thread_count = dft_size / min_radix; - - if (thread_count > ocl::Device::getDefault().maxWorkGroupSize()) - { - status = false; - return; - } - - // generate string with radix calls - String radix_processing; - int n = 1, twiddle_size = 0; - for (size_t i=0; i 1) - radix_processing += format("fft_radix%d_B%d(smem,twiddles+%d,ind,%d,%d);", radix, block, twiddle_size, n, dft_size/radix); - else - radix_processing += format("fft_radix%d(smem,twiddles+%d,ind,%d,%d);", radix, twiddle_size, n, dft_size/radix); - twiddle_size += (radix-1)*n; - n *= radix; - } - - Mat tw(1, twiddle_size, CV_32FC2); - float* ptr = tw.ptr(); - int ptr_index = 0; - - n = 1; - for (size_t i=0; idft_size == dft_size) - { - return plan; - } - } - - OCL_FftPlan * newPlan = new OCL_FftPlan(dft_size); - planStorage.push_back(newPlan); - return newPlan; - } - - ~OCL_FftPlanCache() - { - for (std::vector::iterator i = planStorage.begin(), end = planStorage.end(); i != end; ++i) - delete (*i); - planStorage.clear(); - } - -protected: - OCL_FftPlanCache() : - planStorage() - { - } - - std::vector planStorage; -}; - -static bool ocl_dft_C2C_rows(InputArray _src, OutputArray _dst, int nonzero_rows, int flags, int fftType) -{ - const OCL_FftPlan* plan = OCL_FftPlanCache::getInstance().getFftPlan(_src.cols()); - return plan->enqueueTransform(_src, _dst, nonzero_rows, flags, fftType, true); -} - -static bool ocl_dft_C2C_cols(InputArray _src, OutputArray _dst, int nonzero_cols, int flags, int fftType) -{ - const OCL_FftPlan* plan = OCL_FftPlanCache::getInstance().getFftPlan(_src.rows()); - return plan->enqueueTransform(_src, _dst, nonzero_cols, flags, fftType, false); -} - -static bool ocl_dft(InputArray _src, OutputArray _dst, int flags, int nonzero_rows) -{ - int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type); - Size ssize = _src.size(); - bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0; - if ( (!doubleSupport && depth == CV_64F) || - !(type == CV_32FC1 || type == CV_32FC2 || type == CV_64FC1 || type == CV_64FC2)) - return false; - - // if is not a multiplication of prime numbers { 2, 3, 5 } - if (ssize.area() != getOptimalDFTSize(ssize.area())) - return false; - - UMat src = _src.getUMat(); - int complex_input = cn == 2 ? 1 : 0; - int complex_output = (flags & DFT_COMPLEX_OUTPUT) != 0; - int real_input = cn == 1 ? 1 : 0; - int real_output = (flags & DFT_REAL_OUTPUT) != 0; - bool inv = (flags & DFT_INVERSE) != 0 ? 1 : 0; - - if( nonzero_rows <= 0 || nonzero_rows > _src.rows() ) - nonzero_rows = _src.rows(); - bool is1d = (flags & DFT_ROWS) != 0 || nonzero_rows == 1; - - // if output format is not specified - if (complex_output + real_output == 0) - { - if (real_input) - real_output = 1; - else - complex_output = 1; - } - - FftType fftType = (FftType)(complex_input << 0 | complex_output << 1); - - // Forward Complex to CCS not supported - if (fftType == C2R && !inv) - fftType = C2C; - - // Inverse CCS to Complex not supported - if (fftType == R2C && inv) - fftType = R2R; - - UMat output; - if (fftType == C2C || fftType == R2C) - { - // complex output - _dst.create(src.size(), CV_32FC2); - output = _dst.getUMat(); - } - else - { - // real output - if (is1d) - { - _dst.create(src.size(), CV_32FC1); - output = _dst.getUMat(); - } - else - { - _dst.create(src.size(), CV_32FC1); - output.create(src.size(), CV_32FC2); - } - } - - if (!inv) - { - if (!ocl_dft_C2C_rows(src, output, nonzero_rows, flags, fftType)) - return false; - - if (!is1d) - { - int nonzero_cols = fftType == R2R ? output.cols/2 + 1 : output.cols; - if (!ocl_dft_C2C_cols(output, _dst, nonzero_cols, flags, fftType)) - return false; - } - } - else - { - if (fftType == C2C) - { - // complex output - if (!ocl_dft_C2C_rows(src, output, nonzero_rows, flags, fftType)) - return false; - - if (!is1d) - { - if (!ocl_dft_C2C_cols(output, output, output.cols, flags, fftType)) - return false; - } - } - else - { - if (is1d) - { - if (!ocl_dft_C2C_rows(src, output, nonzero_rows, flags, fftType)) - return false; - } - else - { - int nonzero_cols = src.cols/2 + 1; - if (!ocl_dft_C2C_cols(src, output, nonzero_cols, flags, fftType)) - return false; - - if (!ocl_dft_C2C_rows(output, _dst, nonzero_rows, flags, fftType)) - return false; - } - } - } - return true; -} - -#endif - -} // namespace cv; - - - void cv::dft( InputArray _src0, OutputArray _dst, int flags, int nonzero_rows ) { #ifdef HAVE_CLAMDFFT diff --git a/modules/core/src/ocl.cpp b/modules/core/src/ocl.cpp index a2110f6cc..32db8c91b 100644 --- a/modules/core/src/ocl.cpp +++ b/modules/core/src/ocl.cpp @@ -3002,8 +3002,7 @@ bool Kernel::run(int dims, size_t _globalsize[], size_t _localsize[], sync ? 0 : &p->e); if( sync || retval != CL_SUCCESS ) { - int a = clFinish(qq); - CV_OclDbgAssert(a == CL_SUCCESS); + CV_OclDbgAssert(clFinish(qq) == CL_SUCCESS); p->cleanupUMats(); } else @@ -3899,9 +3898,8 @@ public: if( (accessFlags & ACCESS_READ) != 0 && u->hostCopyObsolete() ) { AlignedDataPtr alignedPtr(u->data, u->size, CV_OPENCL_DATA_PTR_ALIGNMENT); - int a = clEnqueueReadBuffer(q, (cl_mem)u->handle, CL_TRUE, 0, - u->size, alignedPtr.getAlignedPtr(), 0, 0, 0); - CV_Assert( a == CL_SUCCESS ); + CV_Assert( clEnqueueReadBuffer(q, (cl_mem)u->handle, CL_TRUE, 0, + u->size, alignedPtr.getAlignedPtr(), 0, 0, 0) == CL_SUCCESS ); u->markHostCopyObsolete(false); } } diff --git a/modules/core/src/opencl/fft.cl b/modules/core/src/opencl/fft.cl index b8d2c6716..1cb2278c0 100644 --- a/modules/core/src/opencl/fft.cl +++ b/modules/core/src/opencl/fft.cl @@ -6,36 +6,36 @@ #define fft5_5 0.363271264002f __attribute__((always_inline)) -float2 mul_float2(float2 a, float2 b) { - return (float2)(fma(a.x, b.x, -a.y * b.y), fma(a.x, b.y, a.y * b.x)); +float2 mul_float2(float2 a, float2 b) { + return (float2)(fma(a.x, b.x, -a.y * b.y), fma(a.x, b.y, a.y * b.x)); } __attribute__((always_inline)) -float2 twiddle(float2 a) { - return (float2)(a.y, -a.x); +float2 twiddle(float2 a) { + return (float2)(a.y, -a.x); } __attribute__((always_inline)) -void butterfly2(float2 a0, float2 a1, __local float2* smem, __global const float2* twiddles, - const int x, const int block_size) -{ +void butterfly2(float2 a0, float2 a1, __local float2* smem, __global const float2* twiddles, + const int x, const int block_size) +{ const int k = x & (block_size - 1); a1 = mul_float2(twiddles[k], a1); const int dst_ind = (x << 1) - k; - + smem[dst_ind] = a0 + a1; smem[dst_ind+block_size] = a0 - a1; } __attribute__((always_inline)) -void butterfly4(float2 a0, float2 a1, float2 a2, float2 a3, __local float2* smem, __global const float2* twiddles, - const int x, const int block_size) +void butterfly4(float2 a0, float2 a1, float2 a2, float2 a3, __local float2* smem, __global const float2* twiddles, + const int x, const int block_size) { const int k = x & (block_size - 1); a1 = mul_float2(twiddles[k], a1); a2 = mul_float2(twiddles[k + block_size], a2); a3 = mul_float2(twiddles[k + 2*block_size], a3); - + const int dst_ind = ((x - k) << 2) + k; float2 b0 = a0 + a2; @@ -50,9 +50,9 @@ void butterfly4(float2 a0, float2 a1, float2 a2, float2 a3, __local float2* smem } __attribute__((always_inline)) -void butterfly3(float2 a0, float2 a1, float2 a2, __local float2* smem, __global const float2* twiddles, - const int x, const int block_size) -{ +void butterfly3(float2 a0, float2 a1, float2 a2, __local float2* smem, __global const float2* twiddles, + const int x, const int block_size) +{ const int k = x % block_size; a1 = mul_float2(twiddles[k], a1); a2 = mul_float2(twiddles[k+block_size], a2); @@ -69,8 +69,8 @@ void butterfly3(float2 a0, float2 a1, float2 a2, __local float2* smem, __global __attribute__((always_inline)) void butterfly5(float2 a0, float2 a1, float2 a2, float2 a3, float2 a4, __local float2* smem, __global const float2* twiddles, - const int x, const int block_size) -{ + const int x, const int block_size) +{ const int k = x % block_size; a1 = mul_float2(twiddles[k], a1); a2 = mul_float2(twiddles[k + block_size], a2); @@ -95,7 +95,7 @@ void butterfly5(float2 a0, float2 a1, float2 a2, float2 a3, float2 a4, __local f a4 = fft5_3 * (float2)(-a1.y - a3.y, a1.x + a3.x); b5 = (float2)(a4.x - fft5_5 * a1.y, a4.y + fft5_5 * a1.x); - a4.x += fft5_4 * a3.y; + a4.x += fft5_4 * a3.y; a4.y -= fft5_4 * a3.x; a1 = b0 + b1; @@ -109,7 +109,7 @@ void butterfly5(float2 a0, float2 a1, float2 a2, float2 a3, float2 a4, __local f } __attribute__((always_inline)) -void fft_radix2(__local float2* smem, __global const float2* twiddles, const int x, const int block_size, const int t) +void fft_radix2(__local float2* smem, __global const float2* twiddles, const int x, const int block_size, const int t) { float2 a0, a1; @@ -122,13 +122,13 @@ void fft_radix2(__local float2* smem, __global const float2* twiddles, const int barrier(CLK_LOCAL_MEM_FENCE); if (x < t) - butterfly2(a0, a1, smem, twiddles, x, block_size); + butterfly2(a0, a1, smem, twiddles, x, block_size); barrier(CLK_LOCAL_MEM_FENCE); } __attribute__((always_inline)) -void fft_radix2_B2(__local float2* smem, __global const float2* twiddles, const int x1, const int block_size, const int t) +void fft_radix2_B2(__local float2* smem, __global const float2* twiddles, const int x1, const int block_size, const int t) { const int x2 = x1 + t/2; float2 a0, a1, a2, a3; @@ -151,7 +151,7 @@ void fft_radix2_B2(__local float2* smem, __global const float2* twiddles, const } __attribute__((always_inline)) -void fft_radix2_B3(__local float2* smem, __global const float2* twiddles, const int x1, const int block_size, const int t) +void fft_radix2_B3(__local float2* smem, __global const float2* twiddles, const int x1, const int block_size, const int t) { const int x2 = x1 + t/3; const int x3 = x1 + 2*t/3; @@ -177,7 +177,7 @@ void fft_radix2_B3(__local float2* smem, __global const float2* twiddles, const } __attribute__((always_inline)) -void fft_radix2_B4(__local float2* smem, __global const float2* twiddles, const int x1, const int block_size, const int t) +void fft_radix2_B4(__local float2* smem, __global const float2* twiddles, const int x1, const int block_size, const int t) { const int thread_block = t/4; const int x2 = x1 + thread_block; @@ -207,7 +207,7 @@ void fft_radix2_B4(__local float2* smem, __global const float2* twiddles, const } __attribute__((always_inline)) -void fft_radix2_B5(__local float2* smem, __global const float2* twiddles, const int x1, const int block_size, const int t) +void fft_radix2_B5(__local float2* smem, __global const float2* twiddles, const int x1, const int block_size, const int t) { const int thread_block = t/5; const int x2 = x1 + thread_block; @@ -326,7 +326,7 @@ void fft_radix8(__local float2* smem, __global const float2* twiddles, const int a7 = mul_float2(twiddles[k+6*block_size],smem[x+7*t]); float2 b0, b1, b6, b7; - + b0 = a0 + a4; a4 = a0 - a4; b1 = a1 + a5; @@ -335,7 +335,7 @@ void fft_radix8(__local float2* smem, __global const float2* twiddles, const int b6 = twiddle(a2 - a6); a2 = a2 + a6; b7 = a3 - a7; - b7 = (float2)(SQRT_2) * (float2)(-b7.x + b7.y, -b7.x - b7.y); + b7 = (float2)(SQRT_2) * (float2)(-b7.x + b7.y, -b7.x - b7.y); a3 = a3 + a7; a0 = b0 + a2; @@ -571,10 +571,15 @@ __kernel void fft_multi_radix_rows(__global const uchar* src_ptr, int src_step, } else { + // fill with zero other rows +#ifdef COMPLEX_OUTPUT __global float2* dst = (__global float2*)(dst_ptr + mad24(y, dst_step, dst_offset)); +#else + __global float* dst = (__global float*)(dst_ptr + mad24(y, dst_step, dst_offset)); +#endif #pragma unroll for (int i=x; i(df) << std::endl; + OCL_OFF(cv::dft(src, dst, dft_flags | cv::DFT_COMPLEX_OUTPUT)); + OCL_ON(cv::dft(usrc, udst, dft_flags | cv::DFT_COMPLEX_OUTPUT)); double eps = src.size().area() * 1e-4; EXPECT_MAT_NEAR(dst, udst, eps); @@ -185,15 +150,15 @@ OCL_TEST_P(MulSpectrums, Mat) OCL_INSTANTIATE_TEST_CASE_P(OCL_ImgProc, MulSpectrums, testing::Combine(Bool(), Bool())); -OCL_INSTANTIATE_TEST_CASE_P(Core, Dft, Combine(Values(cv::Size(10, 10), cv::Size(36, 36), cv::Size(512, 1), cv::Size(1280, 768)), - Values((OCL_FFT_TYPE) R2C, (OCL_FFT_TYPE) C2C, (OCL_FFT_TYPE) R2R, (OCL_FFT_TYPE) C2R), - Bool(), // DFT_INVERSE +OCL_INSTANTIATE_TEST_CASE_P(Core, Dft, Combine(Values(cv::Size(2, 3), cv::Size(5, 4), cv::Size(25, 20), + cv::Size(512, 1), cv::Size(1024, 768)), + Values(CV_32F, CV_64F), + Bool(), // inplace Bool(), // DFT_ROWS Bool(), // DFT_SCALE - Bool() // hint - ) + Bool()) // DFT_INVERSE ); } } // namespace cvtest::ocl -#endif // HAVE_OPENCL \ No newline at end of file +#endif // HAVE_OPENCL