Merge pull request #1717 from alalek:ocl_adjust_worksize

This commit is contained in:
Andrey Pavlenko 2013-10-31 00:20:34 +04:00 committed by OpenCV Buildbot
commit 089cf423a0
3 changed files with 227 additions and 171 deletions

View File

@ -103,7 +103,11 @@ CV_EXPORTS cl_kernel openCLGetKernelFromSource(const Context *clCxt,
const cv::ocl::ProgramEntry* source, std::string kernelName); const cv::ocl::ProgramEntry* source, std::string kernelName);
CV_EXPORTS cl_kernel openCLGetKernelFromSource(const Context *clCxt, CV_EXPORTS cl_kernel openCLGetKernelFromSource(const Context *clCxt,
const cv::ocl::ProgramEntry* source, std::string kernelName, const char *build_options); const cv::ocl::ProgramEntry* source, std::string kernelName, const char *build_options);
// Returns the kernel `kernelName` from the program `source`, built with `build_options`.
// The definition derives the final kernel name from `depth` (a "_D<depth>" suffix is appended)
// and normalizes whitespace in `build_options` before delegating to the 4-argument overload.
// NOTE(review): `channels` is presumably folded into the kernel name the same way - confirm
// against the definition. Callers in this module release the returned handle themselves
// (clReleaseKernel) or hand it to the kernel-handle overload of openCLExecuteKernel.
CV_EXPORTS cl_kernel openCLGetKernelFromSource(Context *ctx, const cv::ocl::ProgramEntry* source,
string kernelName, int channels, int depth, const char *build_options);
CV_EXPORTS void openCLVerifyKernel(const Context *clCxt, cl_kernel kernel, size_t *localThreads); CV_EXPORTS void openCLVerifyKernel(const Context *clCxt, cl_kernel kernel, size_t *localThreads);
// Runs an already-created `kernel` on `ctx` with the given work sizes.
// `args` is a list of (size in bytes, pointer to value) pairs describing the kernel arguments.
// When `localThreads` is non-NULL, globalThreads[0] is rounded up to a multiple of
// localThreads[0], so `globalThreads` is modified in place.
// NOTE(review): callers treat the kernel as released by this call - confirm against the definition.
CV_EXPORTS void openCLExecuteKernel(Context *ctx, cl_kernel kernel, size_t globalThreads[3],
size_t localThreads[3], std::vector< std::pair<size_t, const void *> > &args);
CV_EXPORTS void openCLExecuteKernel(Context *clCxt , const cv::ocl::ProgramEntry* source, string kernelName, std::vector< std::pair<size_t, const void *> > &args, CV_EXPORTS void openCLExecuteKernel(Context *clCxt , const cv::ocl::ProgramEntry* source, string kernelName, std::vector< std::pair<size_t, const void *> > &args,
int globalcols , int globalrows, size_t blockSize = 16, int kernel_expand_depth = -1, int kernel_expand_channel = -1); int globalcols , int globalrows, size_t blockSize = 16, int kernel_expand_depth = -1, int kernel_expand_channel = -1);
CV_EXPORTS void openCLExecuteKernel_(Context *clCxt, const cv::ocl::ProgramEntry* source, std::string kernelName, CV_EXPORTS void openCLExecuteKernel_(Context *clCxt, const cv::ocl::ProgramEntry* source, std::string kernelName,

View File

@ -336,8 +336,7 @@ static std::string removeDuplicatedWhiteSpaces(const char * buildOptions)
return opt; return opt;
} }
void openCLExecuteKernel_(Context *ctx, const cv::ocl::ProgramEntry* source, string kernelName, size_t globalThreads[3], cl_kernel openCLGetKernelFromSource(Context *ctx, const cv::ocl::ProgramEntry* source, string kernelName, int channels,
size_t localThreads[3], vector< pair<size_t, const void *> > &args, int channels,
int depth, const char *build_options) int depth, const char *build_options)
{ {
//construct kernel name //construct kernel name
@ -350,10 +349,14 @@ void openCLExecuteKernel_(Context *ctx, const cv::ocl::ProgramEntry* source, str
idxStr << "_D" << depth; idxStr << "_D" << depth;
kernelName += idxStr.str(); kernelName += idxStr.str();
cl_kernel kernel;
std::string fixedOptions = removeDuplicatedWhiteSpaces(build_options); std::string fixedOptions = removeDuplicatedWhiteSpaces(build_options);
kernel = openCLGetKernelFromSource(ctx, source, kernelName, fixedOptions.c_str()); cl_kernel kernel = openCLGetKernelFromSource(ctx, source, kernelName, fixedOptions.c_str());
return kernel;
}
void openCLExecuteKernel(Context *ctx, cl_kernel kernel, size_t globalThreads[3],
size_t localThreads[3], vector< pair<size_t, const void *> > &args)
{
if ( localThreads != NULL) if ( localThreads != NULL)
{ {
globalThreads[0] = roundUp(globalThreads[0], localThreads[0]); globalThreads[0] = roundUp(globalThreads[0], localThreads[0]);
@ -399,6 +402,15 @@ void openCLExecuteKernel_(Context *ctx, const cv::ocl::ProgramEntry* source, str
openCLSafeCall(clReleaseKernel(kernel)); openCLSafeCall(clReleaseKernel(kernel));
} }
// Convenience wrapper: resolves the kernel for (`source`, `kernelName`, `channels`, `depth`,
// `build_options`) and immediately runs it with the supplied work sizes and arguments.
// NOTE(review): the kernel handle is not released here; the kernel-handle overload of
// openCLExecuteKernel is expected to do that - confirm against its definition.
void openCLExecuteKernel_(Context *ctx, const cv::ocl::ProgramEntry* source, string kernelName, size_t globalThreads[3],
                          size_t localThreads[3], vector< pair<size_t, const void *> > &args, int channels,
                          int depth, const char *build_options)
{
    // Step 1: obtain the kernel handle for the requested variant.
    cl_kernel compiledKernel =
        openCLGetKernelFromSource(ctx, source, kernelName, channels, depth, build_options);

    // Step 2: hand the handle over for execution.
    openCLExecuteKernel(ctx, compiledKernel, globalThreads, localThreads, args);
}
void openCLExecuteKernel(Context *ctx, const cv::ocl::ProgramEntry* source, string kernelName, void openCLExecuteKernel(Context *ctx, const cv::ocl::ProgramEntry* source, string kernelName,
size_t globalThreads[3], size_t localThreads[3], size_t globalThreads[3], size_t localThreads[3],
vector< pair<size_t, const void *> > &args, int channels, int depth) vector< pair<size_t, const void *> > &args, int channels, int depth)

View File

@ -578,104 +578,124 @@ static void GPUFilter2D(const oclMat &src, oclMat &dst, const Mat &kernel,
kernelDataFloat.size()*sizeof(float), 1, clMemcpyHostToDevice); kernelDataFloat.size()*sizeof(float), 1, clMemcpyHostToDevice);
} }
size_t BLOCK_SIZE = src.clCxt->getDeviceInfo().maxWorkItemSizes[0]; size_t tryWorkItems = src.clCxt->getDeviceInfo().maxWorkItemSizes[0];
do {
size_t BLOCK_SIZE = tryWorkItems;
while (BLOCK_SIZE > 32 && BLOCK_SIZE >= (size_t)ksize.width * 2 && BLOCK_SIZE > (size_t)src.cols * 2)
BLOCK_SIZE /= 2;
#if 1 // TODO Mode with several blocks requires a much more VGPRs, so this optimization is not actual for the current devices #if 1 // TODO Mode with several blocks requires a much more VGPRs, so this optimization is not actual for the current devices
size_t BLOCK_SIZE_Y = 1; size_t BLOCK_SIZE_Y = 1;
#else #else
size_t BLOCK_SIZE_Y = 8; // TODO Check heuristic value on devices size_t BLOCK_SIZE_Y = 8; // TODO Check heuristic value on devices
while (BLOCK_SIZE_Y < BLOCK_SIZE / 8 && BLOCK_SIZE_Y * src.clCxt->getDeviceInfo().maxComputeUnits * 32 < (size_t)src.rows) while (BLOCK_SIZE_Y < BLOCK_SIZE / 8 && BLOCK_SIZE_Y * src.clCxt->getDeviceInfo().maxComputeUnits * 32 < (size_t)src.rows)
BLOCK_SIZE_Y *= 2; BLOCK_SIZE_Y *= 2;
#endif #endif
CV_Assert((size_t)ksize.width <= BLOCK_SIZE); CV_Assert((size_t)ksize.width <= BLOCK_SIZE);
bool isIsolatedBorder = (borderType & BORDER_ISOLATED) != 0; bool isIsolatedBorder = (borderType & BORDER_ISOLATED) != 0;
vector<pair<size_t , const void *> > args; vector<pair<size_t , const void *> > args;
args.push_back( make_pair( sizeof(cl_mem), (void *)&src.data)); args.push_back( make_pair( sizeof(cl_mem), (void *)&src.data));
cl_uint stepBytes = src.step; cl_uint stepBytes = src.step;
args.push_back( make_pair( sizeof(cl_uint), (void *)&stepBytes)); args.push_back( make_pair( sizeof(cl_uint), (void *)&stepBytes));
int offsetXBytes = src.offset % src.step; int offsetXBytes = src.offset % src.step;
int offsetX = offsetXBytes / src.elemSize(); int offsetX = offsetXBytes / src.elemSize();
CV_Assert((int)(offsetX * src.elemSize()) == offsetXBytes); CV_Assert((int)(offsetX * src.elemSize()) == offsetXBytes);
int offsetY = src.offset / src.step; int offsetY = src.offset / src.step;
int endX = (offsetX + src.cols); int endX = (offsetX + src.cols);
int endY = (offsetY + src.rows); int endY = (offsetY + src.rows);
cl_int rect[4] = {offsetX, offsetY, endX, endY}; cl_int rect[4] = {offsetX, offsetY, endX, endY};
if (!isIsolatedBorder) if (!isIsolatedBorder)
{ {
rect[2] = src.wholecols; rect[2] = src.wholecols;
rect[3] = src.wholerows; rect[3] = src.wholerows;
} }
args.push_back( make_pair( sizeof(cl_int)*4, (void *)&rect[0])); args.push_back( make_pair( sizeof(cl_int)*4, (void *)&rect[0]));
args.push_back( make_pair( sizeof(cl_mem), (void *)&dst.data)); args.push_back( make_pair( sizeof(cl_mem), (void *)&dst.data));
cl_uint _stepBytes = dst.step; cl_uint _stepBytes = dst.step;
args.push_back( make_pair( sizeof(cl_uint), (void *)&_stepBytes)); args.push_back( make_pair( sizeof(cl_uint), (void *)&_stepBytes));
int _offsetXBytes = dst.offset % dst.step; int _offsetXBytes = dst.offset % dst.step;
int _offsetX = _offsetXBytes / dst.elemSize(); int _offsetX = _offsetXBytes / dst.elemSize();
CV_Assert((int)(_offsetX * dst.elemSize()) == _offsetXBytes); CV_Assert((int)(_offsetX * dst.elemSize()) == _offsetXBytes);
int _offsetY = dst.offset / dst.step; int _offsetY = dst.offset / dst.step;
int _endX = (_offsetX + dst.cols); int _endX = (_offsetX + dst.cols);
int _endY = (_offsetY + dst.rows); int _endY = (_offsetY + dst.rows);
cl_int _rect[4] = {_offsetX, _offsetY, _endX, _endY}; cl_int _rect[4] = {_offsetX, _offsetY, _endX, _endY};
args.push_back( make_pair( sizeof(cl_int)*4, (void *)&_rect[0])); args.push_back( make_pair( sizeof(cl_int)*4, (void *)&_rect[0]));
float borderValue[4] = {0, 0, 0, 0}; // DON'T move into 'if' body float borderValue[4] = {0, 0, 0, 0}; // DON'T move into 'if' body
double borderValueDouble[4] = {0, 0, 0, 0}; // DON'T move into 'if' body double borderValueDouble[4] = {0, 0, 0, 0}; // DON'T move into 'if' body
if ((borderType & ~BORDER_ISOLATED) == BORDER_CONSTANT) if ((borderType & ~BORDER_ISOLATED) == BORDER_CONSTANT)
{ {
if (useDouble) if (useDouble)
args.push_back( make_pair( sizeof(double) * src.oclchannels(), (void *)&borderValue[0])); args.push_back( make_pair( sizeof(double) * src.oclchannels(), (void *)&borderValue[0]));
else else
args.push_back( make_pair( sizeof(float) * src.oclchannels(), (void *)&borderValueDouble[0])); args.push_back( make_pair( sizeof(float) * src.oclchannels(), (void *)&borderValueDouble[0]));
} }
args.push_back( make_pair( sizeof(cl_mem), (void *)&oclKernelParameter.data)); args.push_back( make_pair( sizeof(cl_mem), (void *)&oclKernelParameter.data));
const char* btype = NULL; const char* btype = NULL;
switch (borderType & ~BORDER_ISOLATED) switch (borderType & ~BORDER_ISOLATED)
{ {
case BORDER_CONSTANT: case BORDER_CONSTANT:
btype = "BORDER_CONSTANT"; btype = "BORDER_CONSTANT";
break; break;
case BORDER_REPLICATE: case BORDER_REPLICATE:
btype = "BORDER_REPLICATE"; btype = "BORDER_REPLICATE";
break; break;
case BORDER_REFLECT: case BORDER_REFLECT:
btype = "BORDER_REFLECT"; btype = "BORDER_REFLECT";
break; break;
case BORDER_WRAP: case BORDER_WRAP:
CV_Error(CV_StsUnsupportedFormat, "BORDER_WRAP is not supported!"); CV_Error(CV_StsUnsupportedFormat, "BORDER_WRAP is not supported!");
return; return;
case BORDER_REFLECT101: case BORDER_REFLECT101:
btype = "BORDER_REFLECT_101"; btype = "BORDER_REFLECT_101";
break; break;
} }
int requiredTop = anchor.y; int requiredTop = anchor.y;
int requiredLeft = BLOCK_SIZE; // not this: anchor.x; int requiredLeft = BLOCK_SIZE; // not this: anchor.x;
int requiredBottom = ksize.height - 1 - anchor.y; int requiredBottom = ksize.height - 1 - anchor.y;
int requiredRight = BLOCK_SIZE; // not this: ksize.width - 1 - anchor.x; int requiredRight = BLOCK_SIZE; // not this: ksize.width - 1 - anchor.x;
int h = isIsolatedBorder ? src.rows : src.wholerows; int h = isIsolatedBorder ? src.rows : src.wholerows;
int w = isIsolatedBorder ? src.cols : src.wholecols; int w = isIsolatedBorder ? src.cols : src.wholecols;
bool extra_extrapolation = h < requiredTop || h < requiredBottom || w < requiredLeft || w < requiredRight; bool extra_extrapolation = h < requiredTop || h < requiredBottom || w < requiredLeft || w < requiredRight;
char build_options[1024]; char build_options[1024];
sprintf(build_options, "-D LOCAL_SIZE=%d -D BLOCK_SIZE_Y=%d -D DATA_DEPTH=%d -D DATA_CHAN=%d -D USE_DOUBLE=%d " sprintf(build_options, "-D LOCAL_SIZE=%d -D BLOCK_SIZE_Y=%d -D DATA_DEPTH=%d -D DATA_CHAN=%d -D USE_DOUBLE=%d "
"-D ANCHOR_X=%d -D ANCHOR_Y=%d -D KERNEL_SIZE_X=%d -D KERNEL_SIZE_Y=%d -D KERNEL_SIZE_Y2_ALIGNED=%d " "-D ANCHOR_X=%d -D ANCHOR_Y=%d -D KERNEL_SIZE_X=%d -D KERNEL_SIZE_Y=%d -D KERNEL_SIZE_Y2_ALIGNED=%d "
"-D %s -D %s -D %s", "-D %s -D %s -D %s",
(int)BLOCK_SIZE, (int)BLOCK_SIZE_Y, (int)BLOCK_SIZE, (int)BLOCK_SIZE_Y,
src.depth(), src.oclchannels(), useDouble ? 1 : 0, src.depth(), src.oclchannels(), useDouble ? 1 : 0,
anchor.x, anchor.y, ksize.width, ksize.height, kernel_size_y2_aligned, anchor.x, anchor.y, ksize.width, ksize.height, kernel_size_y2_aligned,
btype, btype,
extra_extrapolation ? "EXTRA_EXTRAPOLATION" : "NO_EXTRA_EXTRAPOLATION", extra_extrapolation ? "EXTRA_EXTRAPOLATION" : "NO_EXTRA_EXTRAPOLATION",
isIsolatedBorder ? "BORDER_ISOLATED" : "NO_BORDER_ISOLATED"); isIsolatedBorder ? "BORDER_ISOLATED" : "NO_BORDER_ISOLATED");
size_t gt[3] = {divUp(dst.cols, BLOCK_SIZE - (ksize.width - 1)) * BLOCK_SIZE, divUp(dst.rows, BLOCK_SIZE_Y), 1}, lt[3] = {BLOCK_SIZE, 1, 1}; size_t lt[3] = {BLOCK_SIZE, 1, 1};
openCLExecuteKernel(src.clCxt, &filtering_filter2D, "filter2D", gt, lt, args, -1, -1, build_options); size_t gt[3] = {divUp(dst.cols, BLOCK_SIZE - (ksize.width - 1)) * BLOCK_SIZE, divUp(dst.rows, BLOCK_SIZE_Y), 1};
cl_kernel kernel = openCLGetKernelFromSource(src.clCxt, &filtering_filter2D, "filter2D", -1, -1, build_options);
size_t kernelWorkGroupSize;
openCLSafeCall(clGetKernelWorkGroupInfo(kernel, getClDeviceID(src.clCxt),
CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &kernelWorkGroupSize, 0));
if (lt[0] > kernelWorkGroupSize)
{
clReleaseKernel(kernel);
CV_Assert(BLOCK_SIZE > kernelWorkGroupSize);
tryWorkItems = kernelWorkGroupSize;
continue;
}
openCLExecuteKernel(src.clCxt, kernel, gt, lt, args); // kernel will be released here
} while (false);
} }
Ptr<BaseFilter_GPU> cv::ocl::getLinearFilter_GPU(int /*srcType*/, int /*dstType*/, const Mat &kernel, const Size &ksize, Ptr<BaseFilter_GPU> cv::ocl::getLinearFilter_GPU(int /*srcType*/, int /*dstType*/, const Mat &kernel, const Size &ksize,
@ -770,106 +790,126 @@ static void GPUFilterBox(const oclMat &src, oclMat &dst,
(src.rows == dst.rows)); (src.rows == dst.rows));
CV_Assert(src.oclchannels() == dst.oclchannels()); CV_Assert(src.oclchannels() == dst.oclchannels());
size_t BLOCK_SIZE = src.clCxt->getDeviceInfo().maxWorkItemSizes[0]; size_t tryWorkItems = src.clCxt->getDeviceInfo().maxWorkItemSizes[0];
size_t BLOCK_SIZE_Y = 8; // TODO Check heuristic value on devices do {
while (BLOCK_SIZE_Y < BLOCK_SIZE / 8 && BLOCK_SIZE_Y * src.clCxt->getDeviceInfo().maxComputeUnits * 32 < (size_t)src.rows) size_t BLOCK_SIZE = tryWorkItems;
BLOCK_SIZE_Y *= 2; while (BLOCK_SIZE > 32 && BLOCK_SIZE >= (size_t)ksize.width * 2 && BLOCK_SIZE > (size_t)src.cols * 2)
BLOCK_SIZE /= 2;
size_t BLOCK_SIZE_Y = 8; // TODO Check heuristic value on devices
while (BLOCK_SIZE_Y < BLOCK_SIZE / 8 && BLOCK_SIZE_Y * src.clCxt->getDeviceInfo().maxComputeUnits * 32 < (size_t)src.rows)
BLOCK_SIZE_Y *= 2;
CV_Assert((size_t)ksize.width <= BLOCK_SIZE); CV_Assert((size_t)ksize.width <= BLOCK_SIZE);
bool isIsolatedBorder = (borderType & BORDER_ISOLATED) != 0; bool isIsolatedBorder = (borderType & BORDER_ISOLATED) != 0;
vector<pair<size_t , const void *> > args; vector<pair<size_t , const void *> > args;
args.push_back( make_pair( sizeof(cl_mem), (void *)&src.data)); args.push_back( make_pair( sizeof(cl_mem), (void *)&src.data));
cl_uint stepBytes = src.step; cl_uint stepBytes = src.step;
args.push_back( make_pair( sizeof(cl_uint), (void *)&stepBytes)); args.push_back( make_pair( sizeof(cl_uint), (void *)&stepBytes));
int offsetXBytes = src.offset % src.step; int offsetXBytes = src.offset % src.step;
int offsetX = offsetXBytes / src.elemSize(); int offsetX = offsetXBytes / src.elemSize();
CV_Assert((int)(offsetX * src.elemSize()) == offsetXBytes); CV_Assert((int)(offsetX * src.elemSize()) == offsetXBytes);
int offsetY = src.offset / src.step; int offsetY = src.offset / src.step;
int endX = (offsetX + src.cols); int endX = (offsetX + src.cols);
int endY = (offsetY + src.rows); int endY = (offsetY + src.rows);
cl_int rect[4] = {offsetX, offsetY, endX, endY}; cl_int rect[4] = {offsetX, offsetY, endX, endY};
if (!isIsolatedBorder) if (!isIsolatedBorder)
{ {
rect[2] = src.wholecols; rect[2] = src.wholecols;
rect[3] = src.wholerows; rect[3] = src.wholerows;
} }
args.push_back( make_pair( sizeof(cl_int)*4, (void *)&rect[0])); args.push_back( make_pair( sizeof(cl_int)*4, (void *)&rect[0]));
args.push_back( make_pair( sizeof(cl_mem), (void *)&dst.data)); args.push_back( make_pair( sizeof(cl_mem), (void *)&dst.data));
cl_uint _stepBytes = dst.step; cl_uint _stepBytes = dst.step;
args.push_back( make_pair( sizeof(cl_uint), (void *)&_stepBytes)); args.push_back( make_pair( sizeof(cl_uint), (void *)&_stepBytes));
int _offsetXBytes = dst.offset % dst.step; int _offsetXBytes = dst.offset % dst.step;
int _offsetX = _offsetXBytes / dst.elemSize(); int _offsetX = _offsetXBytes / dst.elemSize();
CV_Assert((int)(_offsetX * dst.elemSize()) == _offsetXBytes); CV_Assert((int)(_offsetX * dst.elemSize()) == _offsetXBytes);
int _offsetY = dst.offset / dst.step; int _offsetY = dst.offset / dst.step;
int _endX = (_offsetX + dst.cols); int _endX = (_offsetX + dst.cols);
int _endY = (_offsetY + dst.rows); int _endY = (_offsetY + dst.rows);
cl_int _rect[4] = {_offsetX, _offsetY, _endX, _endY}; cl_int _rect[4] = {_offsetX, _offsetY, _endX, _endY};
args.push_back( make_pair( sizeof(cl_int)*4, (void *)&_rect[0])); args.push_back( make_pair( sizeof(cl_int)*4, (void *)&_rect[0]));
bool useDouble = src.depth() == CV_64F; bool useDouble = src.depth() == CV_64F;
float borderValue[4] = {0, 0, 0, 0}; // DON'T move into 'if' body float borderValue[4] = {0, 0, 0, 0}; // DON'T move into 'if' body
double borderValueDouble[4] = {0, 0, 0, 0}; // DON'T move into 'if' body double borderValueDouble[4] = {0, 0, 0, 0}; // DON'T move into 'if' body
if ((borderType & ~BORDER_ISOLATED) == BORDER_CONSTANT) if ((borderType & ~BORDER_ISOLATED) == BORDER_CONSTANT)
{ {
if (useDouble)
args.push_back( make_pair( sizeof(double) * src.oclchannels(), (void *)&borderValue[0]));
else
args.push_back( make_pair( sizeof(float) * src.oclchannels(), (void *)&borderValueDouble[0]));
}
double alphaDouble = alpha; // DON'T move into 'if' body
if (useDouble) if (useDouble)
args.push_back( make_pair( sizeof(double) * src.oclchannels(), (void *)&borderValue[0])); args.push_back( make_pair( sizeof(double), (void *)&alphaDouble));
else else
args.push_back( make_pair( sizeof(float) * src.oclchannels(), (void *)&borderValueDouble[0])); args.push_back( make_pair( sizeof(float), (void *)&alpha));
}
double alphaDouble = alpha; // DON'T move into 'if' body const char* btype = NULL;
if (useDouble)
args.push_back( make_pair( sizeof(double), (void *)&alphaDouble));
else
args.push_back( make_pair( sizeof(float), (void *)&alpha));
const char* btype = NULL; switch (borderType & ~BORDER_ISOLATED)
{
case BORDER_CONSTANT:
btype = "BORDER_CONSTANT";
break;
case BORDER_REPLICATE:
btype = "BORDER_REPLICATE";
break;
case BORDER_REFLECT:
btype = "BORDER_REFLECT";
break;
case BORDER_WRAP:
CV_Error(CV_StsUnsupportedFormat, "BORDER_WRAP is not supported!");
return;
case BORDER_REFLECT101:
btype = "BORDER_REFLECT_101";
break;
}
switch (borderType & ~BORDER_ISOLATED) int requiredTop = anchor.y;
{ int requiredLeft = BLOCK_SIZE; // not this: anchor.x;
case BORDER_CONSTANT: int requiredBottom = ksize.height - 1 - anchor.y;
btype = "BORDER_CONSTANT"; int requiredRight = BLOCK_SIZE; // not this: ksize.width - 1 - anchor.x;
break; int h = isIsolatedBorder ? src.rows : src.wholerows;
case BORDER_REPLICATE: int w = isIsolatedBorder ? src.cols : src.wholecols;
btype = "BORDER_REPLICATE"; bool extra_extrapolation = h < requiredTop || h < requiredBottom || w < requiredLeft || w < requiredRight;
break;
case BORDER_REFLECT:
btype = "BORDER_REFLECT";
break;
case BORDER_WRAP:
CV_Error(CV_StsUnsupportedFormat, "BORDER_WRAP is not supported!");
return;
case BORDER_REFLECT101:
btype = "BORDER_REFLECT_101";
break;
}
int requiredTop = anchor.y; CV_Assert(w >= ksize.width && h >= ksize.height); // TODO Other cases are not tested well
int requiredLeft = BLOCK_SIZE; // not this: anchor.x;
int requiredBottom = ksize.height - 1 - anchor.y;
int requiredRight = BLOCK_SIZE; // not this: ksize.width - 1 - anchor.x;
int h = isIsolatedBorder ? src.rows : src.wholerows;
int w = isIsolatedBorder ? src.cols : src.wholecols;
bool extra_extrapolation = h < requiredTop || h < requiredBottom || w < requiredLeft || w < requiredRight;
CV_Assert(w >= ksize.width && h >= ksize.height); // TODO Other cases are not tested well char build_options[1024];
sprintf(build_options, "-D LOCAL_SIZE=%d -D BLOCK_SIZE_Y=%d -D DATA_DEPTH=%d -D DATA_CHAN=%d -D USE_DOUBLE=%d -D ANCHOR_X=%d -D ANCHOR_Y=%d -D KERNEL_SIZE_X=%d -D KERNEL_SIZE_Y=%d -D %s -D %s -D %s",
(int)BLOCK_SIZE, (int)BLOCK_SIZE_Y,
src.depth(), src.oclchannels(), useDouble ? 1 : 0,
anchor.x, anchor.y, ksize.width, ksize.height,
btype,
extra_extrapolation ? "EXTRA_EXTRAPOLATION" : "NO_EXTRA_EXTRAPOLATION",
isIsolatedBorder ? "BORDER_ISOLATED" : "NO_BORDER_ISOLATED");
char build_options[1024]; size_t lt[3] = {BLOCK_SIZE, 1, 1};
sprintf(build_options, "-D LOCAL_SIZE=%d -D BLOCK_SIZE_Y=%d -D DATA_DEPTH=%d -D DATA_CHAN=%d -D USE_DOUBLE=%d -D ANCHOR_X=%d -D ANCHOR_Y=%d -D KERNEL_SIZE_X=%d -D KERNEL_SIZE_Y=%d -D %s -D %s -D %s", size_t gt[3] = {divUp(dst.cols, BLOCK_SIZE - (ksize.width - 1)) * BLOCK_SIZE, divUp(dst.rows, BLOCK_SIZE_Y), 1};
(int)BLOCK_SIZE, (int)BLOCK_SIZE_Y,
src.depth(), src.oclchannels(), useDouble ? 1 : 0,
anchor.x, anchor.y, ksize.width, ksize.height,
btype,
extra_extrapolation ? "EXTRA_EXTRAPOLATION" : "NO_EXTRA_EXTRAPOLATION",
isIsolatedBorder ? "BORDER_ISOLATED" : "NO_BORDER_ISOLATED");
size_t gt[3] = {divUp(dst.cols, BLOCK_SIZE - (ksize.width - 1)) * BLOCK_SIZE, divUp(dst.rows, BLOCK_SIZE_Y), 1}, lt[3] = {BLOCK_SIZE, 1, 1}; cl_kernel kernel = openCLGetKernelFromSource(src.clCxt, &filtering_boxFilter, "boxFilter", -1, -1, build_options);
openCLExecuteKernel(src.clCxt, &filtering_boxFilter, "boxFilter", gt, lt, args, -1, -1, build_options);
size_t kernelWorkGroupSize;
openCLSafeCall(clGetKernelWorkGroupInfo(kernel, getClDeviceID(src.clCxt),
CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &kernelWorkGroupSize, 0));
if (lt[0] > kernelWorkGroupSize)
{
clReleaseKernel(kernel);
CV_Assert(BLOCK_SIZE > kernelWorkGroupSize);
tryWorkItems = kernelWorkGroupSize;
continue;
}
openCLExecuteKernel(src.clCxt, kernel, gt, lt, args); // kernel will be released here
} while (false);
} }
Ptr<BaseFilter_GPU> cv::ocl::getBoxFilter_GPU(int /*srcType*/, int /*dstType*/, Ptr<BaseFilter_GPU> cv::ocl::getBoxFilter_GPU(int /*srcType*/, int /*dstType*/,