ocl: adjust worksize for filter2D and boxFilter
This commit is contained in:
parent
8a4f1bbbdf
commit
7b0f018a74
@ -103,7 +103,11 @@ CV_EXPORTS cl_kernel openCLGetKernelFromSource(const Context *clCxt,
|
|||||||
const cv::ocl::ProgramEntry* source, std::string kernelName);
|
const cv::ocl::ProgramEntry* source, std::string kernelName);
|
||||||
CV_EXPORTS cl_kernel openCLGetKernelFromSource(const Context *clCxt,
|
CV_EXPORTS cl_kernel openCLGetKernelFromSource(const Context *clCxt,
|
||||||
const cv::ocl::ProgramEntry* source, std::string kernelName, const char *build_options);
|
const cv::ocl::ProgramEntry* source, std::string kernelName, const char *build_options);
|
||||||
|
CV_EXPORTS cl_kernel openCLGetKernelFromSource(Context *ctx, const cv::ocl::ProgramEntry* source,
|
||||||
|
string kernelName, int channels, int depth, const char *build_options);
|
||||||
CV_EXPORTS void openCLVerifyKernel(const Context *clCxt, cl_kernel kernel, size_t *localThreads);
|
CV_EXPORTS void openCLVerifyKernel(const Context *clCxt, cl_kernel kernel, size_t *localThreads);
|
||||||
|
CV_EXPORTS void openCLExecuteKernel(Context *ctx, cl_kernel kernel, size_t globalThreads[3],
|
||||||
|
size_t localThreads[3], std::vector< std::pair<size_t, const void *> > &args);
|
||||||
CV_EXPORTS void openCLExecuteKernel(Context *clCxt , const cv::ocl::ProgramEntry* source, string kernelName, std::vector< std::pair<size_t, const void *> > &args,
|
CV_EXPORTS void openCLExecuteKernel(Context *clCxt , const cv::ocl::ProgramEntry* source, string kernelName, std::vector< std::pair<size_t, const void *> > &args,
|
||||||
int globalcols , int globalrows, size_t blockSize = 16, int kernel_expand_depth = -1, int kernel_expand_channel = -1);
|
int globalcols , int globalrows, size_t blockSize = 16, int kernel_expand_depth = -1, int kernel_expand_channel = -1);
|
||||||
CV_EXPORTS void openCLExecuteKernel_(Context *clCxt, const cv::ocl::ProgramEntry* source, std::string kernelName,
|
CV_EXPORTS void openCLExecuteKernel_(Context *clCxt, const cv::ocl::ProgramEntry* source, std::string kernelName,
|
||||||
|
@ -336,8 +336,7 @@ static std::string removeDuplicatedWhiteSpaces(const char * buildOptions)
|
|||||||
return opt;
|
return opt;
|
||||||
}
|
}
|
||||||
|
|
||||||
void openCLExecuteKernel_(Context *ctx, const cv::ocl::ProgramEntry* source, string kernelName, size_t globalThreads[3],
|
cl_kernel openCLGetKernelFromSource(Context *ctx, const cv::ocl::ProgramEntry* source, string kernelName, int channels,
|
||||||
size_t localThreads[3], vector< pair<size_t, const void *> > &args, int channels,
|
|
||||||
int depth, const char *build_options)
|
int depth, const char *build_options)
|
||||||
{
|
{
|
||||||
//construct kernel name
|
//construct kernel name
|
||||||
@ -350,10 +349,14 @@ void openCLExecuteKernel_(Context *ctx, const cv::ocl::ProgramEntry* source, str
|
|||||||
idxStr << "_D" << depth;
|
idxStr << "_D" << depth;
|
||||||
kernelName += idxStr.str();
|
kernelName += idxStr.str();
|
||||||
|
|
||||||
cl_kernel kernel;
|
|
||||||
std::string fixedOptions = removeDuplicatedWhiteSpaces(build_options);
|
std::string fixedOptions = removeDuplicatedWhiteSpaces(build_options);
|
||||||
kernel = openCLGetKernelFromSource(ctx, source, kernelName, fixedOptions.c_str());
|
cl_kernel kernel = openCLGetKernelFromSource(ctx, source, kernelName, fixedOptions.c_str());
|
||||||
|
return kernel;
|
||||||
|
}
|
||||||
|
|
||||||
|
void openCLExecuteKernel(Context *ctx, cl_kernel kernel, size_t globalThreads[3],
|
||||||
|
size_t localThreads[3], vector< pair<size_t, const void *> > &args)
|
||||||
|
{
|
||||||
if ( localThreads != NULL)
|
if ( localThreads != NULL)
|
||||||
{
|
{
|
||||||
globalThreads[0] = roundUp(globalThreads[0], localThreads[0]);
|
globalThreads[0] = roundUp(globalThreads[0], localThreads[0]);
|
||||||
@ -399,6 +402,15 @@ void openCLExecuteKernel_(Context *ctx, const cv::ocl::ProgramEntry* source, str
|
|||||||
openCLSafeCall(clReleaseKernel(kernel));
|
openCLSafeCall(clReleaseKernel(kernel));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void openCLExecuteKernel_(Context *ctx, const cv::ocl::ProgramEntry* source, string kernelName, size_t globalThreads[3],
|
||||||
|
size_t localThreads[3], vector< pair<size_t, const void *> > &args, int channels,
|
||||||
|
int depth, const char *build_options)
|
||||||
|
{
|
||||||
|
cl_kernel kernel = openCLGetKernelFromSource(ctx, source, kernelName, channels, depth, build_options);
|
||||||
|
|
||||||
|
openCLExecuteKernel(ctx, kernel, globalThreads, localThreads, args);
|
||||||
|
}
|
||||||
|
|
||||||
void openCLExecuteKernel(Context *ctx, const cv::ocl::ProgramEntry* source, string kernelName,
|
void openCLExecuteKernel(Context *ctx, const cv::ocl::ProgramEntry* source, string kernelName,
|
||||||
size_t globalThreads[3], size_t localThreads[3],
|
size_t globalThreads[3], size_t localThreads[3],
|
||||||
vector< pair<size_t, const void *> > &args, int channels, int depth)
|
vector< pair<size_t, const void *> > &args, int channels, int depth)
|
||||||
|
@ -578,7 +578,11 @@ static void GPUFilter2D(const oclMat &src, oclMat &dst, const Mat &kernel,
|
|||||||
kernelDataFloat.size()*sizeof(float), 1, clMemcpyHostToDevice);
|
kernelDataFloat.size()*sizeof(float), 1, clMemcpyHostToDevice);
|
||||||
}
|
}
|
||||||
|
|
||||||
size_t BLOCK_SIZE = src.clCxt->getDeviceInfo().maxWorkItemSizes[0];
|
size_t tryWorkItems = src.clCxt->getDeviceInfo().maxWorkItemSizes[0];
|
||||||
|
do {
|
||||||
|
size_t BLOCK_SIZE = tryWorkItems;
|
||||||
|
while (BLOCK_SIZE > 32 && BLOCK_SIZE >= (size_t)ksize.width * 2 && BLOCK_SIZE > (size_t)src.cols * 2)
|
||||||
|
BLOCK_SIZE /= 2;
|
||||||
#if 1 // TODO: The multi-block mode requires many more VGPRs, so this optimization is not applicable to current devices
|
#if 1 // TODO: The multi-block mode requires many more VGPRs, so this optimization is not applicable to current devices
|
||||||
size_t BLOCK_SIZE_Y = 1;
|
size_t BLOCK_SIZE_Y = 1;
|
||||||
#else
|
#else
|
||||||
@ -674,8 +678,24 @@ static void GPUFilter2D(const oclMat &src, oclMat &dst, const Mat &kernel,
|
|||||||
extra_extrapolation ? "EXTRA_EXTRAPOLATION" : "NO_EXTRA_EXTRAPOLATION",
|
extra_extrapolation ? "EXTRA_EXTRAPOLATION" : "NO_EXTRA_EXTRAPOLATION",
|
||||||
isIsolatedBorder ? "BORDER_ISOLATED" : "NO_BORDER_ISOLATED");
|
isIsolatedBorder ? "BORDER_ISOLATED" : "NO_BORDER_ISOLATED");
|
||||||
|
|
||||||
size_t gt[3] = {divUp(dst.cols, BLOCK_SIZE - (ksize.width - 1)) * BLOCK_SIZE, divUp(dst.rows, BLOCK_SIZE_Y), 1}, lt[3] = {BLOCK_SIZE, 1, 1};
|
size_t lt[3] = {BLOCK_SIZE, 1, 1};
|
||||||
openCLExecuteKernel(src.clCxt, &filtering_filter2D, "filter2D", gt, lt, args, -1, -1, build_options);
|
size_t gt[3] = {divUp(dst.cols, BLOCK_SIZE - (ksize.width - 1)) * BLOCK_SIZE, divUp(dst.rows, BLOCK_SIZE_Y), 1};
|
||||||
|
|
||||||
|
cl_kernel kernel = openCLGetKernelFromSource(src.clCxt, &filtering_filter2D, "filter2D", -1, -1, build_options);
|
||||||
|
|
||||||
|
size_t kernelWorkGroupSize;
|
||||||
|
openCLSafeCall(clGetKernelWorkGroupInfo(kernel, getClDeviceID(src.clCxt),
|
||||||
|
CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &kernelWorkGroupSize, 0));
|
||||||
|
if (lt[0] > kernelWorkGroupSize)
|
||||||
|
{
|
||||||
|
clReleaseKernel(kernel);
|
||||||
|
CV_Assert(BLOCK_SIZE > kernelWorkGroupSize);
|
||||||
|
tryWorkItems = kernelWorkGroupSize;
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
openCLExecuteKernel(src.clCxt, kernel, gt, lt, args); // kernel will be released here
|
||||||
|
} while (false);
|
||||||
}
|
}
|
||||||
|
|
||||||
Ptr<BaseFilter_GPU> cv::ocl::getLinearFilter_GPU(int /*srcType*/, int /*dstType*/, const Mat &kernel, const Size &ksize,
|
Ptr<BaseFilter_GPU> cv::ocl::getLinearFilter_GPU(int /*srcType*/, int /*dstType*/, const Mat &kernel, const Size &ksize,
|
||||||
@ -770,7 +790,11 @@ static void GPUFilterBox(const oclMat &src, oclMat &dst,
|
|||||||
(src.rows == dst.rows));
|
(src.rows == dst.rows));
|
||||||
CV_Assert(src.oclchannels() == dst.oclchannels());
|
CV_Assert(src.oclchannels() == dst.oclchannels());
|
||||||
|
|
||||||
size_t BLOCK_SIZE = src.clCxt->getDeviceInfo().maxWorkItemSizes[0];
|
size_t tryWorkItems = src.clCxt->getDeviceInfo().maxWorkItemSizes[0];
|
||||||
|
do {
|
||||||
|
size_t BLOCK_SIZE = tryWorkItems;
|
||||||
|
while (BLOCK_SIZE > 32 && BLOCK_SIZE >= (size_t)ksize.width * 2 && BLOCK_SIZE > (size_t)src.cols * 2)
|
||||||
|
BLOCK_SIZE /= 2;
|
||||||
size_t BLOCK_SIZE_Y = 8; // TODO Check heuristic value on devices
|
size_t BLOCK_SIZE_Y = 8; // TODO Check heuristic value on devices
|
||||||
while (BLOCK_SIZE_Y < BLOCK_SIZE / 8 && BLOCK_SIZE_Y * src.clCxt->getDeviceInfo().maxComputeUnits * 32 < (size_t)src.rows)
|
while (BLOCK_SIZE_Y < BLOCK_SIZE / 8 && BLOCK_SIZE_Y * src.clCxt->getDeviceInfo().maxComputeUnits * 32 < (size_t)src.rows)
|
||||||
BLOCK_SIZE_Y *= 2;
|
BLOCK_SIZE_Y *= 2;
|
||||||
@ -868,8 +892,24 @@ static void GPUFilterBox(const oclMat &src, oclMat &dst,
|
|||||||
extra_extrapolation ? "EXTRA_EXTRAPOLATION" : "NO_EXTRA_EXTRAPOLATION",
|
extra_extrapolation ? "EXTRA_EXTRAPOLATION" : "NO_EXTRA_EXTRAPOLATION",
|
||||||
isIsolatedBorder ? "BORDER_ISOLATED" : "NO_BORDER_ISOLATED");
|
isIsolatedBorder ? "BORDER_ISOLATED" : "NO_BORDER_ISOLATED");
|
||||||
|
|
||||||
size_t gt[3] = {divUp(dst.cols, BLOCK_SIZE - (ksize.width - 1)) * BLOCK_SIZE, divUp(dst.rows, BLOCK_SIZE_Y), 1}, lt[3] = {BLOCK_SIZE, 1, 1};
|
size_t lt[3] = {BLOCK_SIZE, 1, 1};
|
||||||
openCLExecuteKernel(src.clCxt, &filtering_boxFilter, "boxFilter", gt, lt, args, -1, -1, build_options);
|
size_t gt[3] = {divUp(dst.cols, BLOCK_SIZE - (ksize.width - 1)) * BLOCK_SIZE, divUp(dst.rows, BLOCK_SIZE_Y), 1};
|
||||||
|
|
||||||
|
cl_kernel kernel = openCLGetKernelFromSource(src.clCxt, &filtering_boxFilter, "boxFilter", -1, -1, build_options);
|
||||||
|
|
||||||
|
size_t kernelWorkGroupSize;
|
||||||
|
openCLSafeCall(clGetKernelWorkGroupInfo(kernel, getClDeviceID(src.clCxt),
|
||||||
|
CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &kernelWorkGroupSize, 0));
|
||||||
|
if (lt[0] > kernelWorkGroupSize)
|
||||||
|
{
|
||||||
|
clReleaseKernel(kernel);
|
||||||
|
CV_Assert(BLOCK_SIZE > kernelWorkGroupSize);
|
||||||
|
tryWorkItems = kernelWorkGroupSize;
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
openCLExecuteKernel(src.clCxt, kernel, gt, lt, args); // kernel will be released here
|
||||||
|
} while (false);
|
||||||
}
|
}
|
||||||
|
|
||||||
Ptr<BaseFilter_GPU> cv::ocl::getBoxFilter_GPU(int /*srcType*/, int /*dstType*/,
|
Ptr<BaseFilter_GPU> cv::ocl::getBoxFilter_GPU(int /*srcType*/, int /*dstType*/,
|
||||||
|
Loading…
x
Reference in New Issue
Block a user