Final refactoring, fixes
This commit is contained in:
@@ -292,7 +292,7 @@ OCL_PERF_TEST_P(MagnitudeFixture, Magnitude, ::testing::Combine(
|
||||
typedef Size_MatType TransposeFixture;
|
||||
|
||||
OCL_PERF_TEST_P(TransposeFixture, Transpose, ::testing::Combine(
|
||||
OCL_TEST_SIZES, Values(CV_8UC1, CV_32FC1, CV_8UC2, CV_32FC2, CV_8UC4, CV_32FC4)))
|
||||
OCL_TEST_SIZES, OCL_TEST_TYPES_134))
|
||||
{
|
||||
const Size_MatType_t params = GetParam();
|
||||
const Size srcSize = get<0>(params);
|
||||
|
||||
@@ -54,40 +54,21 @@ namespace ocl {
|
||||
|
||||
///////////// dft ////////////////////////
|
||||
|
||||
enum OCL_FFT_TYPE
|
||||
{
|
||||
R2R = 0, // real to real (CCS)
|
||||
C2R = 1, // complex to real
|
||||
R2C = 2, // real to complex
|
||||
C2C = 3 // complex to complex
|
||||
};
|
||||
|
||||
typedef tuple<OCL_FFT_TYPE, Size, int> DftParams;
|
||||
typedef tuple<Size, int> DftParams;
|
||||
typedef TestBaseWithParam<DftParams> DftFixture;
|
||||
|
||||
OCL_PERF_TEST_P(DftFixture, Dft, ::testing::Combine(Values(C2C, R2R, C2R, R2C),
|
||||
Values(OCL_SIZE_1, OCL_SIZE_2, OCL_SIZE_3, Size(1024, 1024), Size(512, 512), Size(2048, 2048)),
|
||||
Values((int) 0, (int)DFT_ROWS, (int)DFT_SCALE/*, (int)DFT_INVERSE,
|
||||
(int)DFT_INVERSE | DFT_SCALE, (int)DFT_ROWS | DFT_INVERSE*/)))
|
||||
OCL_PERF_TEST_P(DftFixture, Dft, ::testing::Combine(Values(OCL_SIZE_1, OCL_SIZE_2, OCL_SIZE_3),
|
||||
Values((int)DFT_ROWS, (int)DFT_SCALE, (int)DFT_INVERSE,
|
||||
(int)DFT_INVERSE | DFT_SCALE, (int)DFT_ROWS | DFT_INVERSE)))
|
||||
{
|
||||
const DftParams params = GetParam();
|
||||
const int dft_type = get<0>(params);
|
||||
const Size srcSize = get<1>(params);
|
||||
int flags = get<2>(params);
|
||||
const Size srcSize = get<0>(params);
|
||||
const int flags = get<1>(params);
|
||||
|
||||
int in_cn, out_cn;
|
||||
switch (dft_type)
|
||||
{
|
||||
case R2R: flags |= cv::DFT_REAL_OUTPUT; in_cn = 1; out_cn = 1; break;
|
||||
case C2R: flags |= cv::DFT_REAL_OUTPUT; in_cn = 2; out_cn = 2; break;
|
||||
case R2C: flags |= cv::DFT_COMPLEX_OUTPUT; in_cn = 1; out_cn = 2; break;
|
||||
case C2C: flags |= cv::DFT_COMPLEX_OUTPUT; in_cn = 2; out_cn = 2; break;
|
||||
}
|
||||
|
||||
UMat src(srcSize, CV_MAKE_TYPE(CV_32F, in_cn)), dst(srcSize, CV_MAKE_TYPE(CV_32F, out_cn));
|
||||
UMat src(srcSize, CV_32FC2), dst(srcSize, CV_32FC2);
|
||||
declare.in(src, WARMUP_RNG).out(dst);
|
||||
|
||||
OCL_TEST_CYCLE() cv::dft(src, dst, flags);
|
||||
OCL_TEST_CYCLE() cv::dft(src, dst, flags | DFT_COMPLEX_OUTPUT);
|
||||
|
||||
SANITY_CHECK(dst, 1e-3);
|
||||
}
|
||||
|
||||
@@ -1781,6 +1781,377 @@ static bool ippi_DFT_R_32F(const Mat& src, Mat& dst, bool inv, int norm_flag)
|
||||
#endif
|
||||
}
|
||||
|
||||
#ifdef HAVE_OPENCL
|
||||
|
||||
namespace cv
|
||||
{
|
||||
|
||||
// Kind of transform, stored as two bits: bit 0 is set for complex input and
// bit 1 for complex output (ocl_dft composes the value exactly this way).
enum FftType
{
    R2R = 0,                    // real input,    real output
    C2R = 1 << 0,               // complex input, real output
    R2C = 1 << 1,               // real input,    complex output
    C2C = (1 << 0) | (1 << 1)   // complex input, complex output
};
|
||||
|
||||
static void ocl_getRadixes(int cols, std::vector<int>& radixes, std::vector<int>& blocks, int& min_radix)
|
||||
{
|
||||
int factors[34];
|
||||
int nf = DFTFactorize(cols, factors);
|
||||
|
||||
int n = 1;
|
||||
int factor_index = 0;
|
||||
min_radix = INT_MAX;
|
||||
|
||||
// 2^n transforms
|
||||
if ((factors[factor_index] & 1) == 0)
|
||||
{
|
||||
for( ; n < factors[factor_index];)
|
||||
{
|
||||
int radix = 2, block = 1;
|
||||
if (8*n <= factors[0])
|
||||
radix = 8;
|
||||
else if (4*n <= factors[0])
|
||||
{
|
||||
radix = 4;
|
||||
if (cols % 12 == 0)
|
||||
block = 3;
|
||||
else if (cols % 8 == 0)
|
||||
block = 2;
|
||||
}
|
||||
else
|
||||
{
|
||||
if (cols % 10 == 0)
|
||||
block = 5;
|
||||
else if (cols % 8 == 0)
|
||||
block = 4;
|
||||
else if (cols % 6 == 0)
|
||||
block = 3;
|
||||
else if (cols % 4 == 0)
|
||||
block = 2;
|
||||
}
|
||||
|
||||
radixes.push_back(radix);
|
||||
blocks.push_back(block);
|
||||
min_radix = min(min_radix, block*radix);
|
||||
n *= radix;
|
||||
}
|
||||
factor_index++;
|
||||
}
|
||||
|
||||
// all the other transforms
|
||||
for( ; factor_index < nf; factor_index++)
|
||||
{
|
||||
int radix = factors[factor_index], block = 1;
|
||||
if (radix == 3)
|
||||
{
|
||||
if (cols % 12 == 0)
|
||||
block = 4;
|
||||
else if (cols % 9 == 0)
|
||||
block = 3;
|
||||
else if (cols % 6 == 0)
|
||||
block = 2;
|
||||
}
|
||||
else if (radix == 5)
|
||||
{
|
||||
if (cols % 10 == 0)
|
||||
block = 2;
|
||||
}
|
||||
radixes.push_back(radix);
|
||||
blocks.push_back(block);
|
||||
min_radix = min(min_radix, block*radix);
|
||||
}
|
||||
}
|
||||
|
||||
struct OCL_FftPlan
|
||||
{
|
||||
UMat twiddles;
|
||||
String buildOptions;
|
||||
int thread_count;
|
||||
|
||||
int dft_size;
|
||||
bool status;
|
||||
OCL_FftPlan(int _size): dft_size(_size), status(true)
|
||||
{
|
||||
int min_radix;
|
||||
std::vector<int> radixes, blocks;
|
||||
ocl_getRadixes(dft_size, radixes, blocks, min_radix);
|
||||
thread_count = dft_size / min_radix;
|
||||
|
||||
if (thread_count > (int) ocl::Device::getDefault().maxWorkGroupSize())
|
||||
{
|
||||
status = false;
|
||||
return;
|
||||
}
|
||||
|
||||
// generate string with radix calls
|
||||
String radix_processing;
|
||||
int n = 1, twiddle_size = 0;
|
||||
for (size_t i=0; i<radixes.size(); i++)
|
||||
{
|
||||
int radix = radixes[i], block = blocks[i];
|
||||
if (block > 1)
|
||||
radix_processing += format("fft_radix%d_B%d(smem,twiddles+%d,ind,%d,%d);", radix, block, twiddle_size, n, dft_size/radix);
|
||||
else
|
||||
radix_processing += format("fft_radix%d(smem,twiddles+%d,ind,%d,%d);", radix, twiddle_size, n, dft_size/radix);
|
||||
twiddle_size += (radix-1)*n;
|
||||
n *= radix;
|
||||
}
|
||||
|
||||
Mat tw(1, twiddle_size, CV_32FC2);
|
||||
float* ptr = tw.ptr<float>();
|
||||
int ptr_index = 0;
|
||||
|
||||
n = 1;
|
||||
for (size_t i=0; i<radixes.size(); i++)
|
||||
{
|
||||
int radix = radixes[i];
|
||||
n *= radix;
|
||||
|
||||
for (int j=1; j<radix; j++)
|
||||
{
|
||||
double theta = -CV_TWO_PI*j/n;
|
||||
|
||||
for (int k=0; k<(n/radix); k++)
|
||||
{
|
||||
ptr[ptr_index++] = (float) cos(k*theta);
|
||||
ptr[ptr_index++] = (float) sin(k*theta);
|
||||
}
|
||||
}
|
||||
}
|
||||
twiddles = tw.getUMat(ACCESS_READ);
|
||||
|
||||
buildOptions = format("-D LOCAL_SIZE=%d -D kercn=%d -D RADIX_PROCESS=%s",
|
||||
dft_size, dft_size/thread_count, radix_processing.c_str());
|
||||
}
|
||||
|
||||
bool enqueueTransform(InputArray _src, OutputArray _dst, int num_dfts, int flags, int fftType, bool rows = true) const
|
||||
{
|
||||
if (!status)
|
||||
return false;
|
||||
|
||||
UMat src = _src.getUMat();
|
||||
UMat dst = _dst.getUMat();
|
||||
|
||||
size_t globalsize[2];
|
||||
size_t localsize[2];
|
||||
String kernel_name;
|
||||
|
||||
bool is1d = (flags & DFT_ROWS) != 0 || num_dfts == 1;
|
||||
bool inv = (flags & DFT_INVERSE) != 0;
|
||||
String options = buildOptions;
|
||||
|
||||
if (rows)
|
||||
{
|
||||
globalsize[0] = thread_count; globalsize[1] = src.rows;
|
||||
localsize[0] = thread_count; localsize[1] = 1;
|
||||
kernel_name = !inv ? "fft_multi_radix_rows" : "ifft_multi_radix_rows";
|
||||
if ((is1d || inv) && (flags & DFT_SCALE))
|
||||
options += " -D DFT_SCALE";
|
||||
}
|
||||
else
|
||||
{
|
||||
globalsize[0] = num_dfts; globalsize[1] = thread_count;
|
||||
localsize[0] = 1; localsize[1] = thread_count;
|
||||
kernel_name = !inv ? "fft_multi_radix_cols" : "ifft_multi_radix_cols";
|
||||
if (flags & DFT_SCALE)
|
||||
options += " -D DFT_SCALE";
|
||||
}
|
||||
|
||||
options += src.channels() == 1 ? " -D REAL_INPUT" : " -D COMPLEX_INPUT";
|
||||
options += dst.channels() == 1 ? " -D REAL_OUTPUT" : " -D COMPLEX_OUTPUT";
|
||||
options += is1d ? " -D IS_1D" : "";
|
||||
|
||||
if (!inv)
|
||||
{
|
||||
if ((is1d && src.channels() == 1) || (rows && (fftType == R2R)))
|
||||
options += " -D NO_CONJUGATE";
|
||||
}
|
||||
else
|
||||
{
|
||||
if (rows && (fftType == C2R || fftType == R2R))
|
||||
options += " -D NO_CONJUGATE";
|
||||
if (dst.cols % 2 == 0)
|
||||
options += " -D EVEN";
|
||||
}
|
||||
|
||||
ocl::Kernel k(kernel_name.c_str(), ocl::core::fft_oclsrc, options);
|
||||
if (k.empty())
|
||||
return false;
|
||||
|
||||
k.args(ocl::KernelArg::ReadOnly(src), ocl::KernelArg::WriteOnly(dst), ocl::KernelArg::PtrReadOnly(twiddles), thread_count, num_dfts);
|
||||
return k.run(2, globalsize, localsize, false);
|
||||
}
|
||||
};
|
||||
|
||||
class OCL_FftPlanCache
|
||||
{
|
||||
public:
|
||||
static OCL_FftPlanCache & getInstance()
|
||||
{
|
||||
static OCL_FftPlanCache planCache;
|
||||
return planCache;
|
||||
}
|
||||
|
||||
OCL_FftPlan* getFftPlan(int dft_size)
|
||||
{
|
||||
for (size_t i = 0, size = planStorage.size(); i < size; ++i)
|
||||
{
|
||||
OCL_FftPlan * const plan = planStorage[i];
|
||||
|
||||
if (plan->dft_size == dft_size)
|
||||
{
|
||||
return plan;
|
||||
}
|
||||
}
|
||||
|
||||
OCL_FftPlan * newPlan = new OCL_FftPlan(dft_size);
|
||||
planStorage.push_back(newPlan);
|
||||
return newPlan;
|
||||
}
|
||||
|
||||
~OCL_FftPlanCache()
|
||||
{
|
||||
for (std::vector<OCL_FftPlan *>::iterator i = planStorage.begin(), end = planStorage.end(); i != end; ++i)
|
||||
delete (*i);
|
||||
planStorage.clear();
|
||||
}
|
||||
|
||||
protected:
|
||||
OCL_FftPlanCache() :
|
||||
planStorage()
|
||||
{
|
||||
}
|
||||
|
||||
std::vector<OCL_FftPlan*> planStorage;
|
||||
};
|
||||
|
||||
// Row pass: fetches the cached plan for the row length (_src.cols()) and runs
// the row kernel with it. Returns the result of enqueueTransform().
static bool ocl_dft_C2C_rows(InputArray _src, OutputArray _dst, int nonzero_rows, int flags, int fftType)
{
    OCL_FftPlanCache& cache = OCL_FftPlanCache::getInstance();
    const OCL_FftPlan* const rowPlan = cache.getFftPlan(_src.cols());
    return rowPlan->enqueueTransform(_src, _dst, nonzero_rows, flags, fftType, true);
}
|
||||
|
||||
// Column pass: fetches the cached plan for the column length (_src.rows()) and
// runs the column kernel with it. Returns the result of enqueueTransform().
static bool ocl_dft_C2C_cols(InputArray _src, OutputArray _dst, int nonzero_cols, int flags, int fftType)
{
    OCL_FftPlanCache& cache = OCL_FftPlanCache::getInstance();
    const OCL_FftPlan* const colPlan = cache.getFftPlan(_src.rows());
    return colPlan->enqueueTransform(_src, _dst, nonzero_cols, flags, fftType, false);
}
|
||||
|
||||
// OpenCL path of cv::dft for CV_32FC1 / CV_32FC2 input.
//   _src         - input array
//   _dst         - output array, (re)created here as CV_32FC1 or CV_32FC2 with
//                  the size of _src
//   flags        - cv::dft flags
//   nonzero_rows - forwarded to the row pass; values <= 0 or larger than the
//                  row count are replaced by the row count
// Returns false when the input is not handled here (type or size) or when one
// of the passes fails, true otherwise.
static bool ocl_dft(InputArray _src, OutputArray _dst, int flags, int nonzero_rows)
{
    const int type = _src.type();
    if (type != CV_32FC1 && type != CV_32FC2)
        return false;

    // only handle sizes that are a product of the primes 2, 3 and 5, i.e. an
    // area that getOptimalDFTSize() leaves unchanged
    const int area = _src.size().area();
    if (area != getOptimalDFTSize(area))
        return false;

    UMat src = _src.getUMat();

    const bool inv = (flags & DFT_INVERSE) != 0;
    const bool complex_input = CV_MAT_CN(type) == 2;
    bool complex_output = (flags & DFT_COMPLEX_OUTPUT) != 0;

    // output format not specified: real input gives real output, complex input
    // gives complex output
    if (!complex_output && (flags & DFT_REAL_OUTPUT) == 0)
        complex_output = complex_input;

    if (nonzero_rows <= 0 || nonzero_rows > _src.rows())
        nonzero_rows = _src.rows();
    const bool is1d = (flags & DFT_ROWS) != 0 || nonzero_rows == 1;

    // bit 0: complex input, bit 1: complex output
    FftType fftType = (FftType)((complex_input ? 1 : 0) | (complex_output ? 2 : 0));

    if (fftType == C2R && !inv)
        fftType = C2C;      // Forward Complex to CCS not supported
    else if (fftType == R2C && inv)
        fftType = R2R;      // Inverse CCS to Complex not supported

    // `output` receives the first pass. It is _dst itself unless a real-valued
    // 2-D result is requested, where a separate CV_32FC2 buffer is used instead.
    const bool complex_result = (fftType == C2C || fftType == R2C);
    _dst.create(src.size(), complex_result ? CV_32FC2 : CV_32FC1);

    UMat output;
    if (complex_result || is1d)
        output = _dst.getUMat();
    else
        output.create(src.size(), CV_32FC2);

    if (!inv)
    {
        // forward: rows first, then columns for 2-D transforms
        if (!ocl_dft_C2C_rows(src, output, nonzero_rows, flags, fftType))
            return false;
        if (is1d)
            return true;

        const int nonzero_cols = (fftType == R2R) ? output.cols/2 + 1 : output.cols;
        return ocl_dft_C2C_cols(output, _dst, nonzero_cols, flags, fftType);
    }

    if (fftType == C2C || is1d)
    {
        // inverse, complex output or 1-D: rows first
        if (!ocl_dft_C2C_rows(src, output, nonzero_rows, flags, fftType))
            return false;

        // only the 2-D complex case needs the (in-place) column pass
        if (fftType == C2C && !is1d)
            return ocl_dft_C2C_cols(output, output, output.cols, flags, fftType);
        return true;
    }

    // inverse 2-D with real output: columns first, then rows into _dst
    const int nonzero_cols = src.cols/2 + 1;
    return ocl_dft_C2C_cols(src, output, nonzero_cols, flags, fftType)
        && ocl_dft_C2C_rows(output, _dst, nonzero_rows, flags, fftType);
}
|
||||
|
||||
} // namespace cv;
|
||||
|
||||
#endif
|
||||
|
||||
#ifdef HAVE_CLAMDFFT
|
||||
|
||||
namespace cv {
|
||||
@@ -2011,7 +2382,6 @@ static bool ocl_dft_amdfft(InputArray _src, OutputArray _dst, int flags)
|
||||
|
||||
tmpBuffer.addref();
|
||||
clSetEventCallback(e, CL_COMPLETE, oclCleanupCallback, tmpBuffer.u);
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
@@ -2021,381 +2391,6 @@ static bool ocl_dft_amdfft(InputArray _src, OutputArray _dst, int flags)
|
||||
|
||||
#endif // HAVE_CLAMDFFT
|
||||
|
||||
namespace cv
|
||||
{
|
||||
|
||||
#ifdef HAVE_OPENCL
|
||||
|
||||
// Kind of transform, stored as two bits: bit 0 is set for complex input and
// bit 1 for complex output (ocl_dft composes the value exactly this way).
enum FftType
{
    R2R = 0,                    // real input,    real output
    C2R = 1 << 0,               // complex input, real output
    R2C = 1 << 1,               // real input,    complex output
    C2C = (1 << 0) | (1 << 1)   // complex input, complex output
};
|
||||
|
||||
static void ocl_getRadixes(int cols, std::vector<int>& radixes, std::vector<int>& blocks, int& min_radix)
|
||||
{
|
||||
int factors[34];
|
||||
int nf = DFTFactorize(cols, factors);
|
||||
|
||||
int n = 1;
|
||||
int factor_index = 0;
|
||||
min_radix = INT_MAX;
|
||||
|
||||
// 2^n transforms
|
||||
if ((factors[factor_index] & 1) == 0)
|
||||
{
|
||||
for( ; n < factors[factor_index];)
|
||||
{
|
||||
int radix = 2, block = 1;
|
||||
if (8*n <= factors[0])
|
||||
radix = 8;
|
||||
else if (4*n <= factors[0])
|
||||
{
|
||||
radix = 4;
|
||||
if (cols % 12 == 0)
|
||||
block = 3;
|
||||
else if (cols % 8 == 0)
|
||||
block = 2;
|
||||
}
|
||||
else
|
||||
{
|
||||
if (cols % 10 == 0)
|
||||
block = 5;
|
||||
else if (cols % 8 == 0)
|
||||
block = 4;
|
||||
else if (cols % 6 == 0)
|
||||
block = 3;
|
||||
else if (cols % 4 == 0)
|
||||
block = 2;
|
||||
}
|
||||
|
||||
radixes.push_back(radix);
|
||||
blocks.push_back(block);
|
||||
min_radix = min(min_radix, block*radix);
|
||||
n *= radix;
|
||||
}
|
||||
factor_index++;
|
||||
}
|
||||
|
||||
// all the other transforms
|
||||
for( ; factor_index < nf; factor_index++)
|
||||
{
|
||||
int radix = factors[factor_index], block = 1;
|
||||
if (radix == 3)
|
||||
{
|
||||
if (cols % 12 == 0)
|
||||
block = 4;
|
||||
else if (cols % 9 == 0)
|
||||
block = 3;
|
||||
else if (cols % 6 == 0)
|
||||
block = 2;
|
||||
}
|
||||
else if (radix == 5)
|
||||
{
|
||||
if (cols % 10 == 0)
|
||||
block = 2;
|
||||
}
|
||||
radixes.push_back(radix);
|
||||
blocks.push_back(block);
|
||||
min_radix = min(min_radix, block*radix);
|
||||
}
|
||||
}
|
||||
|
||||
struct OCL_FftPlan
|
||||
{
|
||||
UMat twiddles;
|
||||
String buildOptions;
|
||||
int thread_count;
|
||||
|
||||
int dft_size;
|
||||
bool status;
|
||||
OCL_FftPlan(int _size): dft_size(_size), status(true)
|
||||
{
|
||||
int min_radix;
|
||||
std::vector<int> radixes, blocks;
|
||||
ocl_getRadixes(dft_size, radixes, blocks, min_radix);
|
||||
thread_count = dft_size / min_radix;
|
||||
|
||||
if (thread_count > ocl::Device::getDefault().maxWorkGroupSize())
|
||||
{
|
||||
status = false;
|
||||
return;
|
||||
}
|
||||
|
||||
// generate string with radix calls
|
||||
String radix_processing;
|
||||
int n = 1, twiddle_size = 0;
|
||||
for (size_t i=0; i<radixes.size(); i++)
|
||||
{
|
||||
int radix = radixes[i], block = blocks[i];
|
||||
if (block > 1)
|
||||
radix_processing += format("fft_radix%d_B%d(smem,twiddles+%d,ind,%d,%d);", radix, block, twiddle_size, n, dft_size/radix);
|
||||
else
|
||||
radix_processing += format("fft_radix%d(smem,twiddles+%d,ind,%d,%d);", radix, twiddle_size, n, dft_size/radix);
|
||||
twiddle_size += (radix-1)*n;
|
||||
n *= radix;
|
||||
}
|
||||
|
||||
Mat tw(1, twiddle_size, CV_32FC2);
|
||||
float* ptr = tw.ptr<float>();
|
||||
int ptr_index = 0;
|
||||
|
||||
n = 1;
|
||||
for (size_t i=0; i<radixes.size(); i++)
|
||||
{
|
||||
int radix = radixes[i];
|
||||
n *= radix;
|
||||
|
||||
for (int j=1; j<radix; j++)
|
||||
{
|
||||
double theta = -CV_TWO_PI*j/n;
|
||||
|
||||
for (int k=0; k<(n/radix); k++)
|
||||
{
|
||||
ptr[ptr_index++] = (float) cos(k*theta);
|
||||
ptr[ptr_index++] = (float) sin(k*theta);
|
||||
}
|
||||
}
|
||||
}
|
||||
twiddles = tw.getUMat(ACCESS_READ);
|
||||
|
||||
buildOptions = format("-D LOCAL_SIZE=%d -D kercn=%d -D RADIX_PROCESS=%s",
|
||||
dft_size, dft_size/thread_count, radix_processing.c_str());
|
||||
}
|
||||
|
||||
bool enqueueTransform(InputArray _src, OutputArray _dst, int dft_size, int flags, int fftType, bool rows = true) const
|
||||
{
|
||||
if (!status)
|
||||
return false;
|
||||
|
||||
UMat src = _src.getUMat();
|
||||
UMat dst = _dst.getUMat();
|
||||
|
||||
size_t globalsize[2];
|
||||
size_t localsize[2];
|
||||
String kernel_name;
|
||||
|
||||
bool is1d = (flags & DFT_ROWS) != 0 || dft_size == 1;
|
||||
bool inv = (flags & DFT_INVERSE) != 0;
|
||||
String options = buildOptions;
|
||||
|
||||
if (rows)
|
||||
{
|
||||
globalsize[0] = thread_count; globalsize[1] = src.rows;
|
||||
localsize[0] = thread_count; localsize[1] = 1;
|
||||
kernel_name = !inv ? "fft_multi_radix_rows" : "ifft_multi_radix_rows";
|
||||
if ((is1d || inv) && (flags & DFT_SCALE))
|
||||
options += " -D DFT_SCALE";
|
||||
}
|
||||
else
|
||||
{
|
||||
globalsize[0] = dft_size; globalsize[1] = thread_count;
|
||||
localsize[0] = 1; localsize[1] = thread_count;
|
||||
kernel_name = !inv ? "fft_multi_radix_cols" : "ifft_multi_radix_cols";
|
||||
if (flags & DFT_SCALE)
|
||||
options += " -D DFT_SCALE";
|
||||
}
|
||||
|
||||
options += src.channels() == 1 ? " -D REAL_INPUT" : " -D COMPLEX_INPUT";
|
||||
options += dst.channels() == 1 ? " -D REAL_OUTPUT" : " -D COMPLEX_OUTPUT";
|
||||
options += is1d ? " -D IS_1D" : "";
|
||||
|
||||
if (!inv)
|
||||
{
|
||||
if ((is1d && src.channels() == 1) || (rows && (fftType == R2R)))
|
||||
options += " -D NO_CONJUGATE";
|
||||
}
|
||||
else
|
||||
{
|
||||
if (rows && (fftType == C2R || fftType == R2R))
|
||||
options += " -D NO_CONJUGATE";
|
||||
if (dst.cols % 2 == 0)
|
||||
options += " -D EVEN";
|
||||
}
|
||||
|
||||
ocl::Kernel k(kernel_name.c_str(), ocl::core::fft_oclsrc, options);
|
||||
if (k.empty())
|
||||
return false;
|
||||
|
||||
k.args(ocl::KernelArg::ReadOnly(src), ocl::KernelArg::WriteOnly(dst), ocl::KernelArg::PtrReadOnly(twiddles), thread_count, dft_size);
|
||||
return k.run(2, globalsize, localsize, false);
|
||||
}
|
||||
};
|
||||
|
||||
class OCL_FftPlanCache
|
||||
{
|
||||
public:
|
||||
static OCL_FftPlanCache & getInstance()
|
||||
{
|
||||
static OCL_FftPlanCache planCache;
|
||||
return planCache;
|
||||
}
|
||||
|
||||
OCL_FftPlan* getFftPlan(int dft_size)
|
||||
{
|
||||
for (size_t i = 0, size = planStorage.size(); i < size; ++i)
|
||||
{
|
||||
OCL_FftPlan * const plan = planStorage[i];
|
||||
|
||||
if (plan->dft_size == dft_size)
|
||||
{
|
||||
return plan;
|
||||
}
|
||||
}
|
||||
|
||||
OCL_FftPlan * newPlan = new OCL_FftPlan(dft_size);
|
||||
planStorage.push_back(newPlan);
|
||||
return newPlan;
|
||||
}
|
||||
|
||||
~OCL_FftPlanCache()
|
||||
{
|
||||
for (std::vector<OCL_FftPlan *>::iterator i = planStorage.begin(), end = planStorage.end(); i != end; ++i)
|
||||
delete (*i);
|
||||
planStorage.clear();
|
||||
}
|
||||
|
||||
protected:
|
||||
OCL_FftPlanCache() :
|
||||
planStorage()
|
||||
{
|
||||
}
|
||||
|
||||
std::vector<OCL_FftPlan*> planStorage;
|
||||
};
|
||||
|
||||
// Row pass: fetches the cached plan for the row length (_src.cols()) and runs
// the row kernel with it. Returns the result of enqueueTransform().
static bool ocl_dft_C2C_rows(InputArray _src, OutputArray _dst, int nonzero_rows, int flags, int fftType)
{
    OCL_FftPlanCache& cache = OCL_FftPlanCache::getInstance();
    const OCL_FftPlan* const rowPlan = cache.getFftPlan(_src.cols());
    return rowPlan->enqueueTransform(_src, _dst, nonzero_rows, flags, fftType, true);
}
|
||||
|
||||
// Column pass: fetches the cached plan for the column length (_src.rows()) and
// runs the column kernel with it. Returns the result of enqueueTransform().
static bool ocl_dft_C2C_cols(InputArray _src, OutputArray _dst, int nonzero_cols, int flags, int fftType)
{
    OCL_FftPlanCache& cache = OCL_FftPlanCache::getInstance();
    const OCL_FftPlan* const colPlan = cache.getFftPlan(_src.rows());
    return colPlan->enqueueTransform(_src, _dst, nonzero_cols, flags, fftType, false);
}
|
||||
|
||||
// OpenCL path of cv::dft for CV_32FC1 / CV_32FC2 input.
//   _src         - input array
//   _dst         - output array, (re)created here as CV_32FC1 or CV_32FC2 with
//                  the size of _src
//   flags        - cv::dft flags
//   nonzero_rows - forwarded to the row pass; values <= 0 or larger than the
//                  row count are replaced by the row count
// Returns false when the input is not handled here (type or size) or when one
// of the passes fails, true otherwise.
static bool ocl_dft(InputArray _src, OutputArray _dst, int flags, int nonzero_rows)
{
    // CV_64F input is rejected even on devices with double support: every buffer
    // this path creates (the outputs below, the plan's twiddle table) is single
    // precision, so double data would be misread.
    const int type = _src.type();
    if (type != CV_32FC1 && type != CV_32FC2)
        return false;

    // only handle sizes that are a product of the primes 2, 3 and 5, i.e. an
    // area that getOptimalDFTSize() leaves unchanged
    const int area = _src.size().area();
    if (area != getOptimalDFTSize(area))
        return false;

    UMat src = _src.getUMat();

    const bool inv = (flags & DFT_INVERSE) != 0;
    const bool complex_input = CV_MAT_CN(type) == 2;
    bool complex_output = (flags & DFT_COMPLEX_OUTPUT) != 0;

    // output format not specified: real input gives real output, complex input
    // gives complex output
    if (!complex_output && (flags & DFT_REAL_OUTPUT) == 0)
        complex_output = complex_input;

    if (nonzero_rows <= 0 || nonzero_rows > _src.rows())
        nonzero_rows = _src.rows();
    const bool is1d = (flags & DFT_ROWS) != 0 || nonzero_rows == 1;

    // bit 0: complex input, bit 1: complex output
    FftType fftType = (FftType)((complex_input ? 1 : 0) | (complex_output ? 2 : 0));

    if (fftType == C2R && !inv)
        fftType = C2C;      // Forward Complex to CCS not supported
    else if (fftType == R2C && inv)
        fftType = R2R;      // Inverse CCS to Complex not supported

    // `output` receives the first pass. It is _dst itself unless a real-valued
    // 2-D result is requested, where a separate CV_32FC2 buffer is used instead.
    const bool complex_result = (fftType == C2C || fftType == R2C);
    _dst.create(src.size(), complex_result ? CV_32FC2 : CV_32FC1);

    UMat output;
    if (complex_result || is1d)
        output = _dst.getUMat();
    else
        output.create(src.size(), CV_32FC2);

    if (!inv)
    {
        // forward: rows first, then columns for 2-D transforms
        if (!ocl_dft_C2C_rows(src, output, nonzero_rows, flags, fftType))
            return false;
        if (is1d)
            return true;

        const int nonzero_cols = (fftType == R2R) ? output.cols/2 + 1 : output.cols;
        return ocl_dft_C2C_cols(output, _dst, nonzero_cols, flags, fftType);
    }

    if (fftType == C2C || is1d)
    {
        // inverse, complex output or 1-D: rows first
        if (!ocl_dft_C2C_rows(src, output, nonzero_rows, flags, fftType))
            return false;

        // only the 2-D complex case needs the (in-place) column pass
        if (fftType == C2C && !is1d)
            return ocl_dft_C2C_cols(output, output, output.cols, flags, fftType);
        return true;
    }

    // inverse 2-D with real output: columns first, then rows into _dst
    const int nonzero_cols = src.cols/2 + 1;
    return ocl_dft_C2C_cols(src, output, nonzero_cols, flags, fftType)
        && ocl_dft_C2C_rows(output, _dst, nonzero_rows, flags, fftType);
}
|
||||
|
||||
#endif
|
||||
|
||||
} // namespace cv;
|
||||
|
||||
|
||||
|
||||
void cv::dft( InputArray _src0, OutputArray _dst, int flags, int nonzero_rows )
|
||||
{
|
||||
#ifdef HAVE_CLAMDFFT
|
||||
|
||||
@@ -3002,8 +3002,7 @@ bool Kernel::run(int dims, size_t _globalsize[], size_t _localsize[],
|
||||
sync ? 0 : &p->e);
|
||||
if( sync || retval != CL_SUCCESS )
|
||||
{
|
||||
int a = clFinish(qq);
|
||||
CV_OclDbgAssert(a == CL_SUCCESS);
|
||||
CV_OclDbgAssert(clFinish(qq) == CL_SUCCESS);
|
||||
p->cleanupUMats();
|
||||
}
|
||||
else
|
||||
@@ -3899,9 +3898,8 @@ public:
|
||||
if( (accessFlags & ACCESS_READ) != 0 && u->hostCopyObsolete() )
|
||||
{
|
||||
AlignedDataPtr<false, true> alignedPtr(u->data, u->size, CV_OPENCL_DATA_PTR_ALIGNMENT);
|
||||
int a = clEnqueueReadBuffer(q, (cl_mem)u->handle, CL_TRUE, 0,
|
||||
u->size, alignedPtr.getAlignedPtr(), 0, 0, 0);
|
||||
CV_Assert( a == CL_SUCCESS );
|
||||
CV_Assert( clEnqueueReadBuffer(q, (cl_mem)u->handle, CL_TRUE, 0,
|
||||
u->size, alignedPtr.getAlignedPtr(), 0, 0, 0) == CL_SUCCESS );
|
||||
u->markHostCopyObsolete(false);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -571,10 +571,15 @@ __kernel void fft_multi_radix_rows(__global const uchar* src_ptr, int src_step,
|
||||
}
|
||||
else
|
||||
{
|
||||
// fill with zero other rows
|
||||
#ifdef COMPLEX_OUTPUT
|
||||
__global float2* dst = (__global float2*)(dst_ptr + mad24(y, dst_step, dst_offset));
|
||||
#else
|
||||
__global float* dst = (__global float*)(dst_ptr + mad24(y, dst_step, dst_offset));
|
||||
#endif
|
||||
#pragma unroll
|
||||
for (int i=x; i<dst_cols; i+=block_size)
|
||||
dst[i] = (float2) 0.f;
|
||||
dst[i] = 0.f;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -667,12 +672,9 @@ __kernel void ifft_multi_radix_rows(__global const uchar* src_ptr, int src_step,
|
||||
smem[x+i*block_size].y = -src[i*block_size].y;
|
||||
}
|
||||
#else
|
||||
__global const float2* src;
|
||||
|
||||
#if !defined(REAL_INPUT) && defined(NO_CONJUGATE)
|
||||
src = (__global const float2*)(src_ptr + mad24(y, src_step, mad24(2, (int)sizeof(float), src_offset)));
|
||||
#else
|
||||
src = (__global const float2*)(src_ptr + mad24(y, src_step, mad24(1, (int)sizeof(float), src_offset)));
|
||||
#endif
|
||||
__global const float2* src = (__global const float2*)(src_ptr + mad24(y, src_step, mad24(2, (int)sizeof(float), src_offset)));
|
||||
|
||||
#pragma unroll
|
||||
for (int i=x; i<(LOCAL_SIZE-1)/2; i+=block_size)
|
||||
@@ -681,6 +683,20 @@ __kernel void ifft_multi_radix_rows(__global const uchar* src_ptr, int src_step,
|
||||
smem[i+1].y = -src[i].y;
|
||||
smem[LOCAL_SIZE-i-1] = src[i];
|
||||
}
|
||||
#else
|
||||
|
||||
#pragma unroll
|
||||
for (int i=x; i<(LOCAL_SIZE-1)/2; i+=block_size)
|
||||
{
|
||||
float2 src = vload2(0, (__global const float*)(src_ptr + mad24(y, src_step, mad24(2*i+1, (int)sizeof(float), src_offset))));
|
||||
|
||||
smem[i+1].x = src.x;
|
||||
smem[i+1].y = -src.y;
|
||||
smem[LOCAL_SIZE-i-1] = src;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
if (x==0)
|
||||
{
|
||||
smem[0].x = *(__global const float*)(src_ptr + mad24(y, src_step, src_offset));
|
||||
@@ -688,7 +704,11 @@ __kernel void ifft_multi_radix_rows(__global const uchar* src_ptr, int src_step,
|
||||
|
||||
if(LOCAL_SIZE % 2 ==0)
|
||||
{
|
||||
#if !defined(REAL_INPUT) && defined(NO_CONJUGATE)
|
||||
smem[LOCAL_SIZE/2].x = src[LOCAL_SIZE/2-1].x;
|
||||
#else
|
||||
smem[LOCAL_SIZE/2].x = *(__global const float*)(src_ptr + mad24(y, src_step, mad24(LOCAL_SIZE-1, (int)sizeof(float), src_offset)));
|
||||
#endif
|
||||
smem[LOCAL_SIZE/2].y = 0.f;
|
||||
}
|
||||
}
|
||||
@@ -718,10 +738,15 @@ __kernel void ifft_multi_radix_rows(__global const uchar* src_ptr, int src_step,
|
||||
}
|
||||
else
|
||||
{
|
||||
__global float2* dst = (__global float*)(dst_ptr + mad24(y, dst_step, mad24(x, (int)(sizeof(float)*2), dst_offset)));
|
||||
// fill with zero other rows
|
||||
#ifdef COMPLEX_OUTPUT
|
||||
__global float2* dst = (__global float2*)(dst_ptr + mad24(y, dst_step, dst_offset));
|
||||
#else
|
||||
__global float* dst = (__global float*)(dst_ptr + mad24(y, dst_step, dst_offset));
|
||||
#endif
|
||||
#pragma unroll
|
||||
for (int i=0; i<kercn; i++)
|
||||
dst[i*block_size] = (float2) 0.f;
|
||||
for (int i=x; i<dst_cols; i+=block_size)
|
||||
dst[i] = 0.f;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -781,7 +806,7 @@ __kernel void ifft_multi_radix_cols(__global const uchar* src_ptr, int src_step,
|
||||
#pragma unroll
|
||||
for (int i=0; i<kercn; i++)
|
||||
{
|
||||
float2 temp = *((__global const float2*)(src + i*block_size*src_step));
|
||||
float2 temp = vload2(0, (__global const float*)(src + i*block_size*src_step));
|
||||
smem[y+i*block_size].x = temp.x;
|
||||
smem[y+i*block_size].y = -temp.y;
|
||||
}
|
||||
|
||||
@@ -48,26 +48,17 @@
|
||||
|
||||
#ifdef HAVE_OPENCL
|
||||
|
||||
enum OCL_FFT_TYPE
|
||||
{
|
||||
R2R = 0,
|
||||
C2R = 1,
|
||||
R2C = 2,
|
||||
C2C = 3
|
||||
};
|
||||
|
||||
namespace cvtest {
|
||||
namespace ocl {
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
// Dft
|
||||
|
||||
PARAM_TEST_CASE(Dft, cv::Size, OCL_FFT_TYPE, bool, bool, bool, bool)
|
||||
PARAM_TEST_CASE(Dft, cv::Size, MatDepth, bool, bool, bool, bool)
|
||||
{
|
||||
cv::Size dft_size;
|
||||
int dft_flags, depth, cn, dft_type;
|
||||
bool hint;
|
||||
bool is1d;
|
||||
int dft_flags, depth;
|
||||
bool inplace;
|
||||
|
||||
TEST_DECLARE_INPUT_PARAMETER(src);
|
||||
TEST_DECLARE_OUTPUT_PARAMETER(dst);
|
||||
@@ -75,60 +66,34 @@ PARAM_TEST_CASE(Dft, cv::Size, OCL_FFT_TYPE, bool, bool, bool, bool)
|
||||
virtual void SetUp()
|
||||
{
|
||||
dft_size = GET_PARAM(0);
|
||||
dft_type = GET_PARAM(1);
|
||||
depth = CV_32F;
|
||||
depth = GET_PARAM(1);
|
||||
inplace = GET_PARAM(2);
|
||||
|
||||
dft_flags = 0;
|
||||
switch (dft_type)
|
||||
{
|
||||
case R2R: dft_flags |= cv::DFT_REAL_OUTPUT; cn = 1; break;
|
||||
case C2R: dft_flags |= cv::DFT_REAL_OUTPUT; cn = 2; break;
|
||||
case R2C: dft_flags |= cv::DFT_COMPLEX_OUTPUT; cn = 1; break;
|
||||
case C2C: dft_flags |= cv::DFT_COMPLEX_OUTPUT; cn = 2; break;
|
||||
}
|
||||
|
||||
if (GET_PARAM(2))
|
||||
dft_flags |= cv::DFT_INVERSE;
|
||||
if (GET_PARAM(3))
|
||||
dft_flags |= cv::DFT_ROWS;
|
||||
if (GET_PARAM(4))
|
||||
dft_flags |= cv::DFT_SCALE;
|
||||
hint = GET_PARAM(5);
|
||||
is1d = (dft_flags & DFT_ROWS) != 0 || dft_size.height == 1;
|
||||
if (GET_PARAM(5))
|
||||
dft_flags |= cv::DFT_INVERSE;
|
||||
}
|
||||
|
||||
void generateTestData()
|
||||
void generateTestData(int cn = 2)
|
||||
{
|
||||
src = randomMat(dft_size, CV_MAKE_TYPE(depth, cn), 0.0, 100.0);
|
||||
usrc = src.getUMat(ACCESS_READ);
|
||||
|
||||
if (inplace)
|
||||
dst = src, udst = usrc;
|
||||
}
|
||||
};
|
||||
|
||||
OCL_TEST_P(Dft, Mat)
|
||||
OCL_TEST_P(Dft, C2C)
|
||||
{
|
||||
generateTestData();
|
||||
|
||||
int nonzero_rows = hint ? src.cols - randomInt(1, src.rows-1) : 0;
|
||||
OCL_OFF(cv::dft(src, dst, dft_flags, nonzero_rows));
|
||||
OCL_ON(cv::dft(usrc, udst, dft_flags, nonzero_rows));
|
||||
|
||||
if (dft_type == R2C && is1d && (dft_flags & cv::DFT_INVERSE) == 0)
|
||||
{
|
||||
dst = dst(cv::Range(0, dst.rows), cv::Range(0, dst.cols/2 + 1));
|
||||
udst = udst(cv::Range(0, udst.rows), cv::Range(0, udst.cols/2 + 1));
|
||||
}
|
||||
|
||||
//Mat gpu = udst.getMat(ACCESS_READ);
|
||||
//std::cout << dst << std::endl;
|
||||
//std::cout << gpu << std::endl;
|
||||
|
||||
//int cn = udst.channels();
|
||||
//
|
||||
//Mat dst1ch = dst.reshape(1);
|
||||
//Mat gpu1ch = gpu.reshape(1);
|
||||
//Mat df;
|
||||
//absdiff(dst1ch, gpu1ch, df);
|
||||
//std::cout << Mat_<int>(df) << std::endl;
|
||||
OCL_OFF(cv::dft(src, dst, dft_flags | cv::DFT_COMPLEX_OUTPUT));
|
||||
OCL_ON(cv::dft(usrc, udst, dft_flags | cv::DFT_COMPLEX_OUTPUT));
|
||||
|
||||
double eps = src.size().area() * 1e-4;
|
||||
EXPECT_MAT_NEAR(dst, udst, eps);
|
||||
@@ -185,13 +150,13 @@ OCL_TEST_P(MulSpectrums, Mat)
|
||||
|
||||
OCL_INSTANTIATE_TEST_CASE_P(OCL_ImgProc, MulSpectrums, testing::Combine(Bool(), Bool()));
|
||||
|
||||
OCL_INSTANTIATE_TEST_CASE_P(Core, Dft, Combine(Values(cv::Size(10, 10), cv::Size(36, 36), cv::Size(512, 1), cv::Size(1280, 768)),
|
||||
Values((OCL_FFT_TYPE) R2C, (OCL_FFT_TYPE) C2C, (OCL_FFT_TYPE) R2R, (OCL_FFT_TYPE) C2R),
|
||||
Bool(), // DFT_INVERSE
|
||||
OCL_INSTANTIATE_TEST_CASE_P(Core, Dft, Combine(Values(cv::Size(2, 3), cv::Size(5, 4), cv::Size(25, 20),
|
||||
cv::Size(512, 1), cv::Size(1024, 768)),
|
||||
Values(CV_32F, CV_64F),
|
||||
Bool(), // inplace
|
||||
Bool(), // DFT_ROWS
|
||||
Bool(), // DFT_SCALE
|
||||
Bool() // hint
|
||||
)
|
||||
Bool()) // DFT_INVERSE
|
||||
);
|
||||
|
||||
} } // namespace cvtest::ocl
|
||||
|
||||
Reference in New Issue
Block a user