From e05112a36482ffef44243dc69ed9a047162a2d80 Mon Sep 17 00:00:00 2001 From: yao Date: Thu, 17 Jan 2013 09:34:43 +0800 Subject: [PATCH] some host side optimizations to ocl::GaussianBlur --- modules/ocl/src/filtering.cpp | 345 +++++++++++++++++++++------------- 1 file changed, 215 insertions(+), 130 deletions(-) diff --git a/modules/ocl/src/filtering.cpp b/modules/ocl/src/filtering.cpp index d0500059f..18fed6e8c 100644 --- a/modules/ocl/src/filtering.cpp +++ b/modules/ocl/src/filtering.cpp @@ -47,6 +47,7 @@ //M*/ #include "precomp.hpp" +#include "mcwutil.hpp" #include using namespace std; using namespace cv; @@ -109,7 +110,7 @@ Ptr cv::ocl::createLinearFilter_GPU(int, int, const Mat &, con return Ptr(0); } -Ptr cv::ocl::createDerivFilter_GPU( int srcType, int dstType, int dx, int dy, int ksize, int borderType ) +Ptr cv::ocl::createDerivFilter_GPU(int srcType, int dstType, int dx, int dy, int ksize, int borderType) { throw_nogpu(); return Ptr(0); @@ -150,17 +151,17 @@ void cv::ocl::Laplacian(const oclMat &, oclMat &, int, int, double) throw_nogpu(); } -void cv::ocl::erode( const oclMat &, oclMat &, const Mat &, Point, int) +void cv::ocl::erode(const oclMat &, oclMat &, const Mat &, Point, int) { throw_nogpu(); } -void cv::ocl::dilate( const oclMat &, oclMat &, const Mat &, Point, int) +void cv::ocl::dilate(const oclMat &, oclMat &, const Mat &, Point, int) { throw_nogpu(); } -void cv::ocl::morphologyEx( const oclMat &, oclMat &, int, const Mat &, Point, int) +void cv::ocl::morphologyEx(const oclMat &, oclMat &, int, const Mat &, Point, int) { throw_nogpu(); } @@ -194,7 +195,9 @@ namespace inline void normalizeAnchor(int &anchor, int ksize) { if (anchor < 0) + { anchor = ksize >> 1; + } CV_Assert(0 <= anchor && anchor < ksize); } @@ -208,7 +211,10 @@ inline void normalizeAnchor(Point &anchor, const Size &ksize) inline void normalizeROI(Rect &roi, const Size &ksize, const Point &anchor, const Size &src_size) { if (roi == Rect(0, 0, -1, -1)) + { roi = Rect(0, 0, src_size.width, src_size.height); + } + CV_Assert(ksize.height > 0 && ksize.width > 0 && ((ksize.height & 1) == 1) && ((ksize.width & 1) == 1)); CV_Assert((anchor.x == -1 && anchor.y == -1) || (anchor.x == ksize.width >> 1 && anchor.y == ksize.height >> 1)); CV_Assert(roi.x >= 0 && roi.y >= 0 && roi.width <= src_size.width && roi.height <= src_size.height); @@ -218,7 +224,11 @@ inline void normalizeROI(Rect &roi, const Size &ksize, const Point &anchor, cons inline void normalizeKernel(const Mat &kernel, oclMat &gpu_krnl, int type = CV_8U, int *nDivisor = 0, bool reverse = false) { int scale = nDivisor && (kernel.depth() == CV_32F || kernel.depth() == CV_64F) ? 256 : 1; - if (nDivisor) *nDivisor = scale; + + if (nDivisor) + { + *nDivisor = scale; + } Mat temp(kernel.size(), type); kernel.convertTo(temp, type, scale); @@ -227,6 +237,7 @@ inline void normalizeKernel(const Mat &kernel, oclMat &gpu_krnl, int type = CV_8 if (reverse) { int count = cont_krnl.cols >> 1; + for (int i = 0; i < count; ++i) { std::swap(cont_krnl.at(0, i), cont_krnl.at(0, cont_krnl.cols - 1 - i)); @@ -326,9 +337,9 @@ void GPUErode(const oclMat &src, oclMat &dst, oclMat &mat_kernel, Size &ksize, c //Normalize the result by default //float alpha = ksize.height * ksize.width; CV_Assert(src.clCxt == dst.clCxt); - CV_Assert( (src.cols == dst.cols) && - (src.rows == dst.rows) ); - CV_Assert( (src.oclchannels() == dst.oclchannels()) ); + CV_Assert((src.cols == dst.cols) && + (src.rows == dst.rows)); + CV_Assert((src.oclchannels() == dst.oclchannels())); int srcStep = src.step1() / src.oclchannels(); int dstStep = dst.step1() / dst.oclchannels(); @@ -342,19 +353,21 @@ void GPUErode(const oclMat &src, oclMat &dst, oclMat &mat_kernel, Size &ksize, c size_t localThreads[3] = {16, 16, 1}; size_t globalThreads[3] = {(src.cols + localThreads[0] - 1) / localThreads[0] *localThreads[0], (src.rows + localThreads[1] - 1) / localThreads[1] *localThreads[1], 1}; - if(src.type() == CV_8UC1) + if (src.type() == CV_8UC1) { kernelName = "morph_C1_D0"; globalThreads[0] = ((src.cols + 3) / 4 + localThreads[0] - 1) / localThreads[0] * localThreads[0]; - CV_Assert( localThreads[0]*localThreads[1] * 8 >= (localThreads[0] * 4 + ksize.width - 1) * (localThreads[1] + ksize.height - 1) ); + CV_Assert(localThreads[0]*localThreads[1] * 8 >= (localThreads[0] * 4 + ksize.width - 1) * (localThreads[1] + ksize.height - 1)); } else { kernelName = "morph"; - CV_Assert( localThreads[0]*localThreads[1] * 2 >= (localThreads[0] + ksize.width - 1) * (localThreads[1] + ksize.height - 1) ); + CV_Assert(localThreads[0]*localThreads[1] * 2 >= (localThreads[0] + ksize.width - 1) * (localThreads[1] + ksize.height - 1)); } + char s[64]; - switch(src.type()) + + switch (src.type()) { case CV_8UC1: sprintf(s, "-D VAL=255"); @@ -373,21 +386,22 @@ void GPUErode(const oclMat &src, oclMat &dst, oclMat &mat_kernel, Size &ksize, c default: CV_Error(CV_StsUnsupportedFormat, "unsupported type"); } + char compile_option[128]; sprintf(compile_option, "-D RADIUSX=%d -D RADIUSY=%d -D LSIZE0=%d -D LSIZE1=%d -D ERODE %s", anchor.x, anchor.y, localThreads[0], localThreads[1], s); vector< pair > args; - args.push_back( make_pair( sizeof(cl_mem), (void *)&src.data)); - args.push_back( make_pair( sizeof(cl_mem), (void *)&dst.data)); - args.push_back( make_pair( sizeof(cl_int), (void *)&srcOffset_x)); - args.push_back( make_pair( sizeof(cl_int), (void *)&srcOffset_y)); - args.push_back( make_pair( sizeof(cl_int), (void *)&src.cols)); - args.push_back( make_pair( sizeof(cl_int), (void *)&src.rows)); - args.push_back( make_pair( sizeof(cl_int), (void *)&srcStep)); - args.push_back( make_pair( sizeof(cl_int), (void *)&dstStep)); - args.push_back( make_pair( sizeof(cl_mem), (void *)&mat_kernel.data)); - args.push_back( make_pair( sizeof(cl_int), (void *)&src.wholecols)); - args.push_back( make_pair( sizeof(cl_int), (void *)&src.wholerows)); - args.push_back( make_pair( sizeof(cl_int), (void *)&dstOffset)); + args.push_back(make_pair(sizeof(cl_mem), (void *)&src.data)); + args.push_back(make_pair(sizeof(cl_mem), (void *)&dst.data)); + args.push_back(make_pair(sizeof(cl_int), (void *)&srcOffset_x)); + args.push_back(make_pair(sizeof(cl_int), (void *)&srcOffset_y)); + args.push_back(make_pair(sizeof(cl_int), (void *)&src.cols)); + args.push_back(make_pair(sizeof(cl_int), (void *)&src.rows)); + args.push_back(make_pair(sizeof(cl_int), (void *)&srcStep)); + args.push_back(make_pair(sizeof(cl_int), (void *)&dstStep)); + args.push_back(make_pair(sizeof(cl_mem), (void *)&mat_kernel.data)); + args.push_back(make_pair(sizeof(cl_int), (void *)&src.wholecols)); + args.push_back(make_pair(sizeof(cl_int), (void *)&src.wholerows)); + args.push_back(make_pair(sizeof(cl_int), (void *)&dstOffset)); openCLExecuteKernel(clCxt, &filtering_morph, kernelName, globalThreads, localThreads, args, -1, -1, compile_option); } @@ -398,9 +412,9 @@ void GPUDilate(const oclMat &src, oclMat &dst, oclMat &mat_kernel, Size &ksize, //Normalize the result by default //float alpha = ksize.height * ksize.width; CV_Assert(src.clCxt == dst.clCxt); - CV_Assert( (src.cols == dst.cols) && - (src.rows == dst.rows) ); - CV_Assert( (src.oclchannels() == dst.oclchannels()) ); + CV_Assert((src.cols == dst.cols) && + (src.rows == dst.rows)); + CV_Assert((src.oclchannels() == dst.oclchannels())); int srcStep = src.step1() / src.oclchannels(); int dstStep = dst.step1() / dst.oclchannels(); @@ -414,19 +428,21 @@ void GPUDilate(const oclMat &src, oclMat &dst, oclMat &mat_kernel, Size &ksize, size_t localThreads[3] = {16, 16, 1}; size_t globalThreads[3] = {(src.cols + localThreads[0]) / localThreads[0] *localThreads[0], (src.rows + localThreads[1]) / localThreads[1] *localThreads[1], 1}; - if(src.type() == CV_8UC1) + if (src.type() == CV_8UC1) { kernelName = "morph_C1_D0"; globalThreads[0] = ((src.cols + 3) / 4 + localThreads[0]) / localThreads[0] * localThreads[0]; - CV_Assert( localThreads[0]*localThreads[1] * 8 >= (localThreads[0] * 4 + ksize.width - 1) * (localThreads[1] + ksize.height - 1) ); + CV_Assert(localThreads[0]*localThreads[1] * 8 >= (localThreads[0] * 4 + ksize.width - 1) * (localThreads[1] + ksize.height - 1)); } else { kernelName = "morph"; - CV_Assert( localThreads[0]*localThreads[1] * 2 >= (localThreads[0] + ksize.width - 1) * (localThreads[1] + ksize.height - 1) ); + CV_Assert(localThreads[0]*localThreads[1] * 2 >= (localThreads[0] + ksize.width - 1) * (localThreads[1] + ksize.height - 1)); } + char s[64]; - switch(src.type()) + + switch (src.type()) { case CV_8UC1: sprintf(s, "-D VAL=0"); @@ -445,21 +461,22 @@ void GPUDilate(const oclMat &src, oclMat &dst, oclMat &mat_kernel, Size &ksize, default: CV_Error(CV_StsUnsupportedFormat, "unsupported type"); } + char compile_option[128]; sprintf(compile_option, "-D RADIUSX=%d -D RADIUSY=%d -D LSIZE0=%d -D LSIZE1=%d -D DILATE %s", anchor.x, anchor.y, localThreads[0], localThreads[1], s); vector< pair > args; - args.push_back( make_pair( sizeof(cl_mem), (void *)&src.data)); - args.push_back( make_pair( sizeof(cl_mem), (void *)&dst.data)); - args.push_back( make_pair( sizeof(cl_int), (void *)&srcOffset_x)); - args.push_back( make_pair( sizeof(cl_int), (void *)&srcOffset_y)); - args.push_back( make_pair( sizeof(cl_int), (void *)&src.cols)); - args.push_back( make_pair( sizeof(cl_int), (void *)&src.rows)); - args.push_back( make_pair( sizeof(cl_int), (void *)&srcStep)); - args.push_back( make_pair( sizeof(cl_int), (void *)&dstStep)); - args.push_back( make_pair( sizeof(cl_mem), (void *)&mat_kernel.data)); - args.push_back( make_pair( sizeof(cl_int), (void *)&src.wholecols)); - args.push_back( make_pair( sizeof(cl_int), (void *)&src.wholerows)); - args.push_back( make_pair( sizeof(cl_int), (void *)&dstOffset)); + args.push_back(make_pair(sizeof(cl_mem), (void *)&src.data)); + args.push_back(make_pair(sizeof(cl_mem), (void *)&dst.data)); + args.push_back(make_pair(sizeof(cl_int), (void *)&srcOffset_x)); + args.push_back(make_pair(sizeof(cl_int), (void *)&srcOffset_y)); + args.push_back(make_pair(sizeof(cl_int), (void *)&src.cols)); + args.push_back(make_pair(sizeof(cl_int), (void *)&src.rows)); + args.push_back(make_pair(sizeof(cl_int), (void *)&srcStep)); + args.push_back(make_pair(sizeof(cl_int), (void *)&dstStep)); + args.push_back(make_pair(sizeof(cl_mem), (void *)&mat_kernel.data)); + args.push_back(make_pair(sizeof(cl_int), (void *)&src.wholecols)); + args.push_back(make_pair(sizeof(cl_int), (void *)&src.wholerows)); + args.push_back(make_pair(sizeof(cl_int), (void *)&dstOffset)); openCLExecuteKernel(clCxt, &filtering_morph, kernelName, globalThreads, localThreads, args, -1, -1, compile_option); } @@ -492,6 +509,7 @@ public: virtual void apply(const oclMat &src, oclMat &dst) { Filter2DEngine_GPU::apply(src, dst); + //if (iters > 1) //{ // Size wholesize; @@ -506,7 +524,7 @@ public: // //Filter2DEngine_GPU::apply(dst, morfBuf); // //morfBuf.copyTo(dst); //} - for(int i = 1; i < iters; ++i) + for (int i = 1; i < iters; ++i) { //dst.swap(morfBuf); Size wholesize; @@ -541,16 +559,17 @@ namespace { void morphOp(int op, const oclMat &src, oclMat &dst, const Mat &_kernel, Point anchor, int iterations, int borderType, const Scalar &borderValue) { - if((borderType != cv::BORDER_CONSTANT) || (borderValue != morphologyDefaultBorderValue())) + if ((borderType != cv::BORDER_CONSTANT) || (borderValue != morphologyDefaultBorderValue())) { CV_Error(CV_StsBadArg, "unsupported border type"); } + Mat kernel; Size ksize = _kernel.data ? _kernel.size() : Size(3, 3); normalizeAnchor(anchor, ksize); - if (iterations == 0 || _kernel.rows * _kernel.cols == 1) + if (iterations == 0 || _kernel.rows *_kernel.cols == 1) { src.copyTo(dst); return; @@ -572,7 +591,9 @@ void morphOp(int op, const oclMat &src, oclMat &dst, const Mat &_kernel, Point a iterations = 1; } else + { kernel = _kernel; + } Ptr f = createMorphologyFilter_GPU(op, src.type(), kernel, anchor, iterations); @@ -580,63 +601,69 @@ void morphOp(int op, const oclMat &src, oclMat &dst, const Mat &_kernel, Point a } } -void cv::ocl::erode( const oclMat &src, oclMat &dst, const Mat &kernel, Point anchor, int iterations, - int borderType, const Scalar &borderValue) +void cv::ocl::erode(const oclMat &src, oclMat &dst, const Mat &kernel, Point anchor, int iterations, + int borderType, const Scalar &borderValue) { bool allZero = true; - for(int i = 0; i < kernel.rows * kernel.cols; ++i) - if(kernel.data[i] != 0) + + for (int i = 0; i < kernel.rows * kernel.cols; ++i) + if (kernel.data[i] != 0) + { allZero = false; - if(allZero) + } + + if (allZero) { kernel.data[0] = 1; } + morphOp(MORPH_ERODE, src, dst, kernel, anchor, iterations, borderType, borderValue); } -void cv::ocl::dilate( const oclMat &src, oclMat &dst, const Mat &kernel, Point anchor, int iterations, - int borderType, const Scalar &borderValue) +void cv::ocl::dilate(const oclMat &src, oclMat &dst, const Mat &kernel, Point anchor, int iterations, + int borderType, const Scalar &borderValue) { morphOp(MORPH_DILATE, src, dst, kernel, anchor, iterations, borderType, borderValue); } -void cv::ocl::morphologyEx( const oclMat &src, oclMat &dst, int op, const Mat &kernel, Point anchor, int iterations, - int borderType, const Scalar &borderValue) +void cv::ocl::morphologyEx(const oclMat &src, oclMat &dst, int op, const Mat &kernel, Point anchor, int iterations, + int borderType, const Scalar &borderValue) { oclMat temp; - switch( op ) + + switch (op) { case MORPH_ERODE: - erode( src, dst, kernel, anchor, iterations, borderType, borderValue); + erode(src, dst, kernel, anchor, iterations, borderType, borderValue); break; case MORPH_DILATE: - dilate( src, dst, kernel, anchor, iterations, borderType, borderValue); + dilate(src, dst, kernel, anchor, iterations, borderType, borderValue); break; case MORPH_OPEN: - erode( src, temp, kernel, anchor, iterations, borderType, borderValue); - dilate( temp, dst, kernel, anchor, iterations, borderType, borderValue); + erode(src, temp, kernel, anchor, iterations, borderType, borderValue); + dilate(temp, dst, kernel, anchor, iterations, borderType, borderValue); break; case CV_MOP_CLOSE: - dilate( src, temp, kernel, anchor, iterations, borderType, borderValue); - erode( temp, dst, kernel, anchor, iterations, borderType, borderValue); + dilate(src, temp, kernel, anchor, iterations, borderType, borderValue); + erode(temp, dst, kernel, anchor, iterations, borderType, borderValue); break; case CV_MOP_GRADIENT: - erode( src, temp, kernel, anchor, iterations, borderType, borderValue); - dilate( src, dst, kernel, anchor, iterations, borderType, borderValue); + erode(src, temp, kernel, anchor, iterations, borderType, borderValue); + dilate(src, dst, kernel, anchor, iterations, borderType, borderValue); subtract(dst, temp, dst); break; case CV_MOP_TOPHAT: - erode( src, dst, kernel, anchor, iterations, borderType, borderValue); - dilate( dst, temp, kernel, anchor, iterations, borderType, borderValue); + erode(src, dst, kernel, anchor, iterations, borderType, borderValue); + dilate(dst, temp, kernel, anchor, iterations, borderType, borderValue); subtract(src, temp, dst); break; case CV_MOP_BLACKHAT: - dilate( src, dst, kernel, anchor, iterations, borderType, borderValue); - erode( dst, temp, kernel, anchor, iterations, borderType, borderValue); + dilate(src, dst, kernel, anchor, iterations, borderType, borderValue); + erode(dst, temp, kernel, anchor, iterations, borderType, borderValue); subtract(temp, src, dst); break; default: - CV_Error( CV_StsBadArg, "unknown morphological operation" ); + CV_Error(CV_StsBadArg, "unknown morphological operation"); } } @@ -668,10 +695,10 @@ void GPUFilter2D(const oclMat &src, oclMat &dst, oclMat &mat_kernel, Size &ksize, const Point anchor, const int borderType) { CV_Assert(src.clCxt == dst.clCxt); - CV_Assert( (src.cols == dst.cols) && - (src.rows == dst.rows) ); - CV_Assert( (src.oclchannels() == dst.oclchannels()) ); - CV_Assert( (borderType != 0) ); + CV_Assert((src.cols == dst.cols) && + (src.rows == dst.rows)); + CV_Assert((src.oclchannels() == dst.oclchannels())); + CV_Assert((borderType != 0)); CV_Assert(ksize.height > 0 && ksize.width > 0 && ((ksize.height & 1) == 1) && ((ksize.width & 1) == 1)); CV_Assert((anchor.x == -1 && anchor.y == -1) || (anchor.x == ksize.width >> 1 && anchor.y == ksize.height >> 1)); Context *clCxt = src.clCxt; @@ -703,20 +730,20 @@ void GPUFilter2D(const oclMat &src, oclMat &dst, oclMat &mat_kernel, }; vector< pair > args; - args.push_back( make_pair( sizeof(cl_mem), (void *)&src.data)); - args.push_back( make_pair( sizeof(cl_int), (void *)&src.step)); - args.push_back( make_pair( sizeof(cl_int), (void *)&src_offset_x)); - args.push_back( make_pair( sizeof(cl_int), (void *)&src_offset_y)); - args.push_back( make_pair( sizeof(cl_mem), (void *)&dst.data)); - args.push_back( make_pair( sizeof(cl_int), (void *)&dst.step)); - args.push_back( make_pair( sizeof(cl_int), (void *)&dst_offset_x)); - args.push_back( make_pair( sizeof(cl_int), (void *)&dst_offset_y)); - args.push_back( make_pair( sizeof(cl_mem), (void *)&mat_kernel.data)); - args.push_back( make_pair( sizeof(cl_int), (void *)&src.cols)); - args.push_back( make_pair( sizeof(cl_int), (void *)&src.rows)); - args.push_back( make_pair( sizeof(cl_int), (void *)&cols)); - args.push_back( make_pair( sizeof(cl_int), (void *)&src.wholecols)); - args.push_back( make_pair( sizeof(cl_int), (void *)&src.wholerows)); + args.push_back(make_pair(sizeof(cl_mem), (void *)&src.data)); + args.push_back(make_pair(sizeof(cl_int), (void *)&src.step)); + args.push_back(make_pair(sizeof(cl_int), (void *)&src_offset_x)); + args.push_back(make_pair(sizeof(cl_int), (void *)&src_offset_y)); + args.push_back(make_pair(sizeof(cl_mem), (void *)&dst.data)); + args.push_back(make_pair(sizeof(cl_int), (void *)&dst.step)); + args.push_back(make_pair(sizeof(cl_int), (void *)&dst_offset_x)); + args.push_back(make_pair(sizeof(cl_int), (void *)&dst_offset_y)); + args.push_back(make_pair(sizeof(cl_mem), (void *)&mat_kernel.data)); + args.push_back(make_pair(sizeof(cl_int), (void *)&src.cols)); + args.push_back(make_pair(sizeof(cl_int), (void *)&src.rows)); + args.push_back(make_pair(sizeof(cl_int), (void *)&cols)); + args.push_back(make_pair(sizeof(cl_int), (void *)&src.wholecols)); + args.push_back(make_pair(sizeof(cl_int), (void *)&src.wholerows)); openCLExecuteKernel(clCxt, &filtering_laplacian, kernelName, globalThreads, localThreads, args, cn, depth); } @@ -750,8 +777,10 @@ Ptr cv::ocl::createLinearFilter_GPU(int srcType, int dstType, void cv::ocl::filter2D(const oclMat &src, oclMat &dst, int ddepth, const Mat &kernel, Point anchor, int borderType) { - if( ddepth < 0 ) + if (ddepth < 0) + { ddepth = src.depth(); + } dst.create(src.size(), CV_MAKETYPE(ddepth, src.channels())); @@ -782,10 +811,10 @@ public: int cn = src.oclchannels(); //dst.create(src_size, src_type); - dst = Scalar(0.0); + //dst = Scalar(0.0); //dstBuf.create(src_size, src_type); dstBuf.create(src_size.height + ksize.height - 1, src_size.width, CV_MAKETYPE(CV_32F, cn)); - dstBuf = Scalar(0.0); + //dstBuf = Scalar(0.0); normalizeROI(roi, ksize, anchor, src_size); @@ -829,13 +858,14 @@ void GPUFilterBox_8u_C1R(const oclMat &src, oclMat &dst, CV_Assert(src.clCxt == dst.clCxt); CV_Assert((src.cols == dst.cols) && - (src.rows == dst.rows) ); + (src.rows == dst.rows)); Context *clCxt = src.clCxt; string kernelName = "boxFilter_C1_D0"; char btype[30]; - switch(borderType) + + switch (borderType) { case 0: sprintf(btype, "BORDER_CONSTANT"); @@ -890,13 +920,14 @@ void GPUFilterBox_8u_C4R(const oclMat &src, oclMat &dst, CV_Assert(src.clCxt == dst.clCxt); CV_Assert((src.cols == dst.cols) && - (src.rows == dst.rows) ); + (src.rows == dst.rows)); Context *clCxt = src.clCxt; string kernelName = "boxFilter_C4_D0"; char btype[30]; - switch(borderType) + + switch (borderType) { case 0: sprintf(btype, "BORDER_CONSTANT"); @@ -951,13 +982,14 @@ void GPUFilterBox_32F_C1R(const oclMat &src, oclMat &dst, CV_Assert(src.clCxt == dst.clCxt); CV_Assert((src.cols == dst.cols) && - (src.rows == dst.rows) ); + (src.rows == dst.rows)); Context *clCxt = src.clCxt; string kernelName = "boxFilter_C1_D5"; char btype[30]; - switch(borderType) + + switch (borderType) { case 0: sprintf(btype, "BORDER_CONSTANT"); @@ -1013,13 +1045,14 @@ void GPUFilterBox_32F_C4R(const oclMat &src, oclMat &dst, CV_Assert(src.clCxt == dst.clCxt); CV_Assert((src.cols == dst.cols) && - (src.rows == dst.rows) ); + (src.rows == dst.rows)); Context *clCxt = src.clCxt; string kernelName = "boxFilter_C4_D5"; char btype[30]; - switch(borderType) + + switch (borderType) { case 0: sprintf(btype, "BORDER_CONSTANT"); @@ -1095,8 +1128,11 @@ void cv::ocl::boxFilter(const oclMat &src, oclMat &dst, int ddepth, Size ksize, Point anchor, int borderType) { int sdepth = src.depth(), cn = src.channels(); - if( ddepth < 0 ) + + if (ddepth < 0) + { ddepth = sdepth; + } dst.create(src.size(), CV_MAKETYPE(ddepth, cn)); @@ -1161,7 +1197,8 @@ void linearRowFilter_gpu(const oclMat &src, const oclMat &dst, oclMat mat_kernel string kernelName = "row_filter"; char btype[30]; - switch(bordertype) + + switch (bordertype) { case 0: sprintf(btype, "BORDER_CONSTANT"); @@ -1179,15 +1216,17 @@ void linearRowFilter_gpu(const oclMat &src, const oclMat &dst, oclMat mat_kernel sprintf(btype, "BORDER_REFLECT_101"); break; } + char compile_option[128]; sprintf(compile_option, "-D RADIUSX=%d -D LSIZE0=%d -D LSIZE1=%d -D CN=%d -D %s", anchor, localThreads[0], localThreads[1], channels, btype); size_t globalThreads[3]; globalThreads[1] = (dst.rows + localThreads[1] - 1) / localThreads[1] * localThreads[1]; globalThreads[2] = (1 + localThreads[2] - 1) / localThreads[2] * localThreads[2]; - if(src.depth() == CV_8U) + + if (src.depth() == CV_8U) { - switch(channels) + switch (channels) { case 1: case 3: @@ -1205,6 +1244,7 @@ void linearRowFilter_gpu(const oclMat &src, const oclMat &dst, oclMat mat_kernel { globalThreads[0] = (dst.cols + localThreads[0] - 1) / localThreads[0] * localThreads[0]; } + //sanity checks CV_Assert(clCxt == dst.clCxt); CV_Assert(src.cols == dst.cols); @@ -1232,7 +1272,7 @@ void linearRowFilter_gpu(const oclMat &src, const oclMat &dst, oclMat mat_kernel args.push_back(make_pair(sizeof(cl_int), (void *)&ridusy)); args.push_back(make_pair(sizeof(cl_mem), (void *)&mat_kernel.data)); - openCLExecuteKernel(clCxt, &filter_sep_row, kernelName, globalThreads, localThreads, args, channels, src.depth(), compile_option); + openCLExecuteKernel2(clCxt, &filter_sep_row, kernelName, globalThreads, localThreads, args, channels, src.depth(), compile_option, CLFLUSH); } Ptr cv::ocl::getLinearRowFilter_GPU(int srcType, int /*bufType*/, const Mat &rowKernel, int anchor, int bordertype) @@ -1289,7 +1329,8 @@ void linearColumnFilter_gpu(const oclMat &src, const oclMat &dst, oclMat mat_ker string kernelName = "col_filter"; char btype[30]; - switch(bordertype) + + switch (bordertype) { case 0: sprintf(btype, "BORDER_CONSTANT"); @@ -1307,15 +1348,17 @@ void linearColumnFilter_gpu(const oclMat &src, const oclMat &dst, oclMat mat_ker sprintf(btype, "BORDER_REFLECT_101"); break; } + char compile_option[256]; size_t globalThreads[3]; globalThreads[1] = (dst.rows + localThreads[1] - 1) / localThreads[1] * localThreads[1]; globalThreads[2] = (1 + localThreads[2] - 1) / localThreads[2] * localThreads[2]; - if(dst.depth() == CV_8U) + + if (dst.depth() == CV_8U) { - switch(channels) + switch (channels) { case 1: globalThreads[0] = (dst.cols + localThreads[0] - 1) / localThreads[0] * localThreads[0]; @@ -1338,7 +1381,8 @@ void linearColumnFilter_gpu(const oclMat &src, const oclMat &dst, oclMat mat_ker else { globalThreads[0] = (dst.cols + localThreads[0] - 1) / localThreads[0] * localThreads[0]; - switch(dst.type()) + + switch (dst.type()) { case CV_32SC1: sprintf(compile_option, "-D RADIUSY=%d -D LSIZE0=%d -D LSIZE1=%d -D CN=%d -D %s -D GENTYPE_SRC=%s -D GENTYPE_DST=%s -D convert_to_DST=%s", @@ -1441,20 +1485,25 @@ Ptr cv::ocl::createSeparableLinearFilter_GPU(int srcType, int void cv::ocl::sepFilter2D(const oclMat &src, oclMat &dst, int ddepth, const Mat &kernelX, const Mat &kernelY, Point anchor, double delta, int bordertype) { - if((dst.cols != dst.wholecols) || (dst.rows != dst.wholerows)) //has roi + if ((dst.cols != dst.wholecols) || (dst.rows != dst.wholerows)) //has roi { - if((bordertype & cv::BORDER_ISOLATED) != 0) + if ((bordertype & cv::BORDER_ISOLATED) != 0) { bordertype &= ~cv::BORDER_ISOLATED; - if((bordertype != cv::BORDER_CONSTANT) && + + if ((bordertype != cv::BORDER_CONSTANT) && (bordertype != cv::BORDER_REPLICATE)) { CV_Error(CV_StsBadArg, "unsupported border type"); } } } - if( ddepth < 0 ) + + if (ddepth < 0) + { ddepth = src.depth(); + } + //CV_Assert(ddepth == src.depth()); dst.create(src.size(), CV_MAKETYPE(ddepth, src.channels())); @@ -1462,12 +1511,12 @@ void cv::ocl::sepFilter2D(const oclMat &src, oclMat &dst, int ddepth, const Mat f->apply(src, dst); } -Ptr cv::ocl::createDerivFilter_GPU( int srcType, int dstType, int dx, int dy, int ksize, int borderType ) +Ptr cv::ocl::createDerivFilter_GPU(int srcType, int dstType, int dx, int dy, int ksize, int borderType) { Mat kx, ky; - getDerivKernels( kx, ky, dx, dy, ksize, false, CV_32F ); + getDerivKernels(kx, ky, dx, dy, ksize, false, CV_32F); return createSeparableLinearFilter_GPU(srcType, dstType, - kx, ky, Point(-1, -1), 0, borderType ); + kx, ky, Point(-1, -1), 0, borderType); } //////////////////////////////////////////////////////////////////////////////////////////////////// @@ -1482,15 +1531,20 @@ void cv::ocl::Sobel(const oclMat &src, oclMat &dst, int ddepth, int dx, int dy, // usually the smoothing part is the slowest to compute, // so try to scale it instead of the faster differenciating part if (dx == 0) + { kx *= scale; + } else + { ky *= scale; + } } + // Mat kx_, ky_; //ky.convertTo(ky_,CV_32S,1<<8); //kx.convertTo(kx_,CV_32S,1<<8); - sepFilter2D(src, dst, ddepth, kx, ky, Point(-1, -1), delta, borderType ); + sepFilter2D(src, dst, ddepth, kx, ky, Point(-1, -1), delta, borderType); } void cv::ocl::Scharr(const oclMat &src, oclMat &dst, int ddepth, int dx, int dy, double scale, double delta , int bordertype) @@ -1498,14 +1552,18 @@ void cv::ocl::Scharr(const oclMat &src, oclMat &dst, int ddepth, int dx, int dy, Mat kx, ky; getDerivKernels(kx, ky, dx, dy, -1, false, CV_32F); - if( scale != 1 ) + if (scale != 1) { // usually the smoothing part is the slowest to compute, // so try to scale it instead of the faster differenciating part - if( dx == 0 ) + if (dx == 0) + { kx *= scale; + } else + { ky *= scale; + } } // Mat kx_, ky_; @@ -1517,7 +1575,7 @@ void cv::ocl::Scharr(const oclMat &src, oclMat &dst, int ddepth, int dx, int dy, void cv::ocl::Laplacian(const oclMat &src, oclMat &dst, int ddepth, int ksize, double scale) { - if(src.clCxt -> impl -> double_support == 0 && src.type() == CV_64F) + if (src.clCxt -> impl -> double_support == 0 && src.type() == CV_64F) { CV_Error(CV_GpuNotSupported, "Selected device don't support double\r\n"); return; @@ -1531,8 +1589,12 @@ void cv::ocl::Laplacian(const oclMat &src, oclMat &dst, int ddepth, int ksize, d {2, 0, 2, 0, -8, 0, 2, 0, 2} }; Mat kernel(3, 3, CV_32S, (void *)K[ksize == 3]); + if (scale != 1) + { kernel *= scale; + } + filter2D(src, dst, ddepth, kernel, Point(-1, -1)); } @@ -1544,25 +1606,38 @@ Ptr cv::ocl::createGaussianFilter_GPU(int type, Size ksize, do int depth = CV_MAT_DEPTH(type); if (sigma2 <= 0) + { sigma2 = sigma1; + } // automatic detection of kernel size from sigma if (ksize.width <= 0 && sigma1 > 0) + { ksize.width = cvRound(sigma1 * (depth == CV_8U ? 3 : 4) * 2 + 1) | 1; - if (ksize.height <= 0 && sigma2 > 0) - ksize.height = cvRound(sigma2 * (depth == CV_8U ? 3 : 4) * 2 + 1) | 1; + } - CV_Assert( ksize.width > 0 && ksize.width % 2 == 1 && ksize.height > 0 && ksize.height % 2 == 1 ); + if (ksize.height <= 0 && sigma2 > 0) + { + ksize.height = cvRound(sigma2 * (depth == CV_8U ? 3 : 4) * 2 + 1) | 1; + } + + CV_Assert(ksize.width > 0 && ksize.width % 2 == 1 && ksize.height > 0 && ksize.height % 2 == 1); sigma1 = std::max(sigma1, 0.0); sigma2 = std::max(sigma2, 0.0); - Mat kx = getGaussianKernel( ksize.width, sigma1, std::max(depth, CV_32F) ); + Mat kx = getGaussianKernel(ksize.width, sigma1, std::max(depth, CV_32F)); Mat ky; - if( ksize.height == ksize.width && std::abs(sigma1 - sigma2) < DBL_EPSILON ) + + if (ksize.height == ksize.width && std::abs(sigma1 - sigma2) < DBL_EPSILON) + { ky = kx; + } else - ky = getGaussianKernel( ksize.height, sigma2, std::max(depth, CV_32F) ); + { + ky = getGaussianKernel(ksize.height, sigma2, std::max(depth, CV_32F)); + } + //Mat kx_, ky_; //kx.convertTo(kx_,CV_32S,1<<8); //ky.convertTo(ky_,CV_32S,1<<8); @@ -1576,26 +1651,36 @@ void cv::ocl::GaussianBlur(const oclMat &src, oclMat &dst, Size ksize, double si src.copyTo(dst); return; } - if((dst.cols != dst.wholecols) || (dst.rows != dst.wholerows)) //has roi + + if ((dst.cols != dst.wholecols) || (dst.rows != dst.wholerows)) //has roi { - if((bordertype & cv::BORDER_ISOLATED) != 0) + if ((bordertype & cv::BORDER_ISOLATED) != 0) { bordertype &= ~cv::BORDER_ISOLATED; - if((bordertype != cv::BORDER_CONSTANT) && + + if ((bordertype != cv::BORDER_CONSTANT) && (bordertype != cv::BORDER_REPLICATE)) { CV_Error(CV_StsBadArg, "unsupported border type"); } } } + dst.create(src.size(), src.type()); - if( bordertype != BORDER_CONSTANT ) + + if (bordertype != BORDER_CONSTANT) { - if( src.rows == 1 ) + if (src.rows == 1) + { ksize.height = 1; - if( src.cols == 1 ) + } + + if (src.cols == 1) + { ksize.width = 1; + } } + Ptr f = createGaussianFilter_GPU(src.type(), ksize, sigma1, sigma2, bordertype); f->apply(src, dst); }