Merge pull request #1717 from alalek:ocl_adjust_worksize

2013-10-31 00:20:34 +04:00
parent 9751b3204d 7b0f018a74
commit 089cf423a0
3 changed files with 227 additions and 171 deletions
--- a/modules/ocl/include/opencv2/ocl/private/util.hpp
+++ b/modules/ocl/include/opencv2/ocl/private/util.hpp
@@ -103,7 +103,11 @@ CV_EXPORTS cl_kernel openCLGetKernelFromSource(const Context *clCxt,
        const cv::ocl::ProgramEntry* source, std::string kernelName);
 CV_EXPORTS cl_kernel openCLGetKernelFromSource(const Context *clCxt,
        const cv::ocl::ProgramEntry* source, std::string kernelName, const char *build_options);
 // Overload that builds the final kernel name from `kernelName` plus optional
 // suffixes and then looks the kernel up. The definition in cl_operations.cpp
 // appends "_D<depth>" (inside a branch whose condition is outside the visible
 // hunk), collapses whitespace in `build_options` via
 // removeDuplicatedWhiteSpaces(), and forwards to the
 // (ctx, source, kernelName, build_options) overload, returning its result.
 // NOTE(review): `channels` presumably contributes a similar name suffix; that
 // part of the definition is not visible here - confirm in cl_operations.cpp.
 // NOTE(review): callers appear to own the returned cl_kernel (filtering.cpp
 // calls clReleaseKernel on it when it does not execute it) - confirm.
 CV_EXPORTS cl_kernel openCLGetKernelFromSource(Context *ctx, const cv::ocl::ProgramEntry* source,
        string kernelName, int channels, int depth, const char *build_options);
 CV_EXPORTS void openCLVerifyKernel(const Context *clCxt, cl_kernel kernel, size_t *localThreads);
 // Runs an already-created kernel with the given work sizes and arguments.
 // As visible in cl_operations.cpp: when `localThreads` is non-NULL,
 // globalThreads[0] is rounded up in place to a multiple of localThreads[0]
 // (the caller's array is modified), and the definition ends by calling
 // clReleaseKernel(kernel), so the caller must not use or release `kernel`
 // afterwards.
 // NOTE(review): rounding of globalThreads[1..2] is presumably done the same
 // way; those lines fall outside the visible hunk - confirm.
 CV_EXPORTS void openCLExecuteKernel(Context *ctx, cl_kernel kernel, size_t globalThreads[3],
                          size_t localThreads[3], std::vector< std::pair<size_t, const void *> > &args);
 CV_EXPORTS void openCLExecuteKernel(Context *clCxt , const cv::ocl::ProgramEntry* source, string kernelName, std::vector< std::pair<size_t, const void *> > &args,
        int globalcols , int globalrows, size_t blockSize = 16, int kernel_expand_depth = -1, int kernel_expand_channel = -1);
 CV_EXPORTS void openCLExecuteKernel_(Context *clCxt, const cv::ocl::ProgramEntry* source, std::string kernelName,
--- a/modules/ocl/src/cl_operations.cpp
+++ b/modules/ocl/src/cl_operations.cpp
@@ -336,8 +336,7 @@ static std::string removeDuplicatedWhiteSpaces(const char * buildOptions)
    return opt;
 }
-void openCLExecuteKernel_(Context *ctx, const cv::ocl::ProgramEntry* source, string kernelName, size_t globalThreads[3],
+cl_kernel openCLGetKernelFromSource(Context *ctx, const cv::ocl::ProgramEntry* source, string kernelName, int channels,
                          size_t localThreads[3],  vector< pair<size_t, const void *> > &args, int channels,
                          int depth, const char *build_options)
 {
    //construct kernel name
@@ -350,10 +349,14 @@ void openCLExecuteKernel_(Context *ctx, const cv::ocl::ProgramEntry* source, str
        idxStr << "_D" << depth;
    kernelName += idxStr.str();
    cl_kernel kernel;
    std::string fixedOptions = removeDuplicatedWhiteSpaces(build_options);
-    kernel = openCLGetKernelFromSource(ctx, source, kernelName, fixedOptions.c_str());
+    cl_kernel kernel = openCLGetKernelFromSource(ctx, source, kernelName, fixedOptions.c_str());
    return kernel;
 }
 void openCLExecuteKernel(Context *ctx, cl_kernel kernel, size_t globalThreads[3],
                          size_t localThreads[3],  vector< pair<size_t, const void *> > &args)
 {
    if ( localThreads != NULL)
    {
        globalThreads[0] = roundUp(globalThreads[0], localThreads[0]);
@@ -399,6 +402,15 @@ void openCLExecuteKernel_(Context *ctx, const cv::ocl::ProgramEntry* source, str
    openCLSafeCall(clReleaseKernel(kernel));
 }
 // Convenience wrapper: obtains the kernel named `kernelName` from `source`
 // (name suffixing and build-option handling are delegated to
 // openCLGetKernelFromSource with `channels`, `depth` and `build_options`)
 // and immediately executes it with the supplied work sizes and arguments.
 void openCLExecuteKernel_(Context *ctx, const cv::ocl::ProgramEntry* source, string kernelName, size_t globalThreads[3],
                          size_t localThreads[3],  vector< pair<size_t, const void *> > &args, int channels,
                          int depth, const char *build_options)
 {
    cl_kernel kernel = openCLGetKernelFromSource(ctx, source, kernelName, channels, depth, build_options);
    // No clReleaseKernel here: the kernel-taking openCLExecuteKernel overload
    // releases the handle itself (see the clReleaseKernel call at its end).
    openCLExecuteKernel(ctx, kernel, globalThreads, localThreads, args);
 }
 void openCLExecuteKernel(Context *ctx, const cv::ocl::ProgramEntry* source, string kernelName,
                         size_t globalThreads[3], size_t localThreads[3],
                         vector< pair<size_t, const void *> > &args, int channels, int depth)
--- a/modules/ocl/src/filtering.cpp
+++ b/modules/ocl/src/filtering.cpp
@@ -578,104 +578,124 @@ static void GPUFilter2D(const oclMat &src, oclMat &dst, const Mat &kernel,
                kernelDataFloat.size()*sizeof(float), 1, clMemcpyHostToDevice);
    }
-    size_t BLOCK_SIZE = src.clCxt->getDeviceInfo().maxWorkItemSizes[0];
+    size_t tryWorkItems = src.clCxt->getDeviceInfo().maxWorkItemSizes[0];
    do {
        size_t BLOCK_SIZE = tryWorkItems;
        while (BLOCK_SIZE > 32 && BLOCK_SIZE >= (size_t)ksize.width * 2 && BLOCK_SIZE > (size_t)src.cols * 2)
            BLOCK_SIZE /= 2;
 #if 1 // TODO Mode with several blocks requires a much more VGPRs, so this optimization is not actual for the current devices
-    size_t BLOCK_SIZE_Y = 1;
+        size_t BLOCK_SIZE_Y = 1;
 #else
-    size_t BLOCK_SIZE_Y = 8; // TODO Check heuristic value on devices
+        size_t BLOCK_SIZE_Y = 8; // TODO Check heuristic value on devices
-    while (BLOCK_SIZE_Y < BLOCK_SIZE / 8 && BLOCK_SIZE_Y * src.clCxt->getDeviceInfo().maxComputeUnits * 32 < (size_t)src.rows)
+        while (BLOCK_SIZE_Y < BLOCK_SIZE / 8 && BLOCK_SIZE_Y * src.clCxt->getDeviceInfo().maxComputeUnits * 32 < (size_t)src.rows)
-        BLOCK_SIZE_Y *= 2;
+            BLOCK_SIZE_Y *= 2;
 #endif
-    CV_Assert((size_t)ksize.width <= BLOCK_SIZE);
+        CV_Assert((size_t)ksize.width <= BLOCK_SIZE);
-    bool isIsolatedBorder = (borderType & BORDER_ISOLATED) != 0;
+        bool isIsolatedBorder = (borderType & BORDER_ISOLATED) != 0;
-    vector<pair<size_t , const void *> > args;
+        vector<pair<size_t , const void *> > args;
-    args.push_back( make_pair( sizeof(cl_mem), (void *)&src.data));
+        args.push_back( make_pair( sizeof(cl_mem), (void *)&src.data));
-    cl_uint stepBytes = src.step;
+        cl_uint stepBytes = src.step;
-    args.push_back( make_pair( sizeof(cl_uint), (void *)&stepBytes));
+        args.push_back( make_pair( sizeof(cl_uint), (void *)&stepBytes));
-    int offsetXBytes = src.offset % src.step;
+        int offsetXBytes = src.offset % src.step;
-    int offsetX = offsetXBytes / src.elemSize();
+        int offsetX = offsetXBytes / src.elemSize();
-    CV_Assert((int)(offsetX * src.elemSize()) == offsetXBytes);
+        CV_Assert((int)(offsetX * src.elemSize()) == offsetXBytes);
-    int offsetY = src.offset / src.step;
+        int offsetY = src.offset / src.step;
-    int endX = (offsetX + src.cols);
+        int endX = (offsetX + src.cols);
-    int endY = (offsetY + src.rows);
+        int endY = (offsetY + src.rows);
-    cl_int rect[4] = {offsetX, offsetY, endX, endY};
+        cl_int rect[4] = {offsetX, offsetY, endX, endY};
-    if (!isIsolatedBorder)
+        if (!isIsolatedBorder)
-    {
+        {
-        rect[2] = src.wholecols;
+            rect[2] = src.wholecols;
-        rect[3] = src.wholerows;
+            rect[3] = src.wholerows;
-    }
+        }
-    args.push_back( make_pair( sizeof(cl_int)*4, (void *)&rect[0]));
+        args.push_back( make_pair( sizeof(cl_int)*4, (void *)&rect[0]));
-    args.push_back( make_pair( sizeof(cl_mem), (void *)&dst.data));
+        args.push_back( make_pair( sizeof(cl_mem), (void *)&dst.data));
-    cl_uint _stepBytes = dst.step;
+        cl_uint _stepBytes = dst.step;
-    args.push_back( make_pair( sizeof(cl_uint), (void *)&_stepBytes));
+        args.push_back( make_pair( sizeof(cl_uint), (void *)&_stepBytes));
-    int _offsetXBytes = dst.offset % dst.step;
+        int _offsetXBytes = dst.offset % dst.step;
-    int _offsetX = _offsetXBytes / dst.elemSize();
+        int _offsetX = _offsetXBytes / dst.elemSize();
-    CV_Assert((int)(_offsetX * dst.elemSize()) == _offsetXBytes);
+        CV_Assert((int)(_offsetX * dst.elemSize()) == _offsetXBytes);
-    int _offsetY = dst.offset / dst.step;
+        int _offsetY = dst.offset / dst.step;
-    int _endX = (_offsetX + dst.cols);
+        int _endX = (_offsetX + dst.cols);
-    int _endY = (_offsetY + dst.rows);
+        int _endY = (_offsetY + dst.rows);
-    cl_int _rect[4] = {_offsetX, _offsetY, _endX, _endY};
+        cl_int _rect[4] = {_offsetX, _offsetY, _endX, _endY};
-    args.push_back( make_pair( sizeof(cl_int)*4, (void *)&_rect[0]));
+        args.push_back( make_pair( sizeof(cl_int)*4, (void *)&_rect[0]));
-    float borderValue[4] = {0, 0, 0, 0}; // DON'T move into 'if' body
+        float borderValue[4] = {0, 0, 0, 0}; // DON'T move into 'if' body
-    double borderValueDouble[4] = {0, 0, 0, 0}; // DON'T move into 'if' body
+        double borderValueDouble[4] = {0, 0, 0, 0}; // DON'T move into 'if' body
-    if ((borderType & ~BORDER_ISOLATED) == BORDER_CONSTANT)
+        if ((borderType & ~BORDER_ISOLATED) == BORDER_CONSTANT)
-    {
+        {
-        if (useDouble)
+            if (useDouble)
-            args.push_back( make_pair( sizeof(double) * src.oclchannels(), (void *)&borderValue[0]));
+                args.push_back( make_pair( sizeof(double) * src.oclchannels(), (void *)&borderValue[0]));
-        else
+            else
-            args.push_back( make_pair( sizeof(float) * src.oclchannels(), (void *)&borderValueDouble[0]));
+                args.push_back( make_pair( sizeof(float) * src.oclchannels(), (void *)&borderValueDouble[0]));
-    }
+        }
-    args.push_back( make_pair( sizeof(cl_mem), (void *)&oclKernelParameter.data));
+        args.push_back( make_pair( sizeof(cl_mem), (void *)&oclKernelParameter.data));
-    const char* btype = NULL;
+        const char* btype = NULL;
-    switch (borderType & ~BORDER_ISOLATED)
+        switch (borderType & ~BORDER_ISOLATED)
-    {
+        {
-    case BORDER_CONSTANT:
+        case BORDER_CONSTANT:
-        btype = "BORDER_CONSTANT";
+            btype = "BORDER_CONSTANT";
-        break;
+            break;
-    case BORDER_REPLICATE:
+        case BORDER_REPLICATE:
-        btype = "BORDER_REPLICATE";
+            btype = "BORDER_REPLICATE";
-        break;
+            break;
-    case BORDER_REFLECT:
+        case BORDER_REFLECT:
-        btype = "BORDER_REFLECT";
+            btype = "BORDER_REFLECT";
-        break;
+            break;
-    case BORDER_WRAP:
+        case BORDER_WRAP:
-        CV_Error(CV_StsUnsupportedFormat, "BORDER_WRAP is not supported!");
+            CV_Error(CV_StsUnsupportedFormat, "BORDER_WRAP is not supported!");
-        return;
+            return;
-    case BORDER_REFLECT101:
+        case BORDER_REFLECT101:
-        btype = "BORDER_REFLECT_101";
+            btype = "BORDER_REFLECT_101";
-        break;
+            break;
-    }
+        }
-    int requiredTop = anchor.y;
+        int requiredTop = anchor.y;
-    int requiredLeft = BLOCK_SIZE; // not this: anchor.x;
+        int requiredLeft = BLOCK_SIZE; // not this: anchor.x;
-    int requiredBottom = ksize.height - 1 - anchor.y;
+        int requiredBottom = ksize.height - 1 - anchor.y;
-    int requiredRight = BLOCK_SIZE; // not this: ksize.width - 1 - anchor.x;
+        int requiredRight = BLOCK_SIZE; // not this: ksize.width - 1 - anchor.x;
-    int h = isIsolatedBorder ? src.rows : src.wholerows;
+        int h = isIsolatedBorder ? src.rows : src.wholerows;
-    int w = isIsolatedBorder ? src.cols : src.wholecols;
+        int w = isIsolatedBorder ? src.cols : src.wholecols;
-    bool extra_extrapolation = h < requiredTop || h < requiredBottom || w < requiredLeft || w < requiredRight;
+        bool extra_extrapolation = h < requiredTop || h < requiredBottom || w < requiredLeft || w < requiredRight;
-    char build_options[1024];
+        char build_options[1024];
-    sprintf(build_options, "-D LOCAL_SIZE=%d -D BLOCK_SIZE_Y=%d -D DATA_DEPTH=%d -D DATA_CHAN=%d -D USE_DOUBLE=%d "
+        sprintf(build_options, "-D LOCAL_SIZE=%d -D BLOCK_SIZE_Y=%d -D DATA_DEPTH=%d -D DATA_CHAN=%d -D USE_DOUBLE=%d "
-            "-D ANCHOR_X=%d -D ANCHOR_Y=%d -D KERNEL_SIZE_X=%d -D KERNEL_SIZE_Y=%d -D KERNEL_SIZE_Y2_ALIGNED=%d "
+                "-D ANCHOR_X=%d -D ANCHOR_Y=%d -D KERNEL_SIZE_X=%d -D KERNEL_SIZE_Y=%d -D KERNEL_SIZE_Y2_ALIGNED=%d "
-            "-D %s -D %s -D %s",
+                "-D %s -D %s -D %s",
-            (int)BLOCK_SIZE, (int)BLOCK_SIZE_Y,
+                (int)BLOCK_SIZE, (int)BLOCK_SIZE_Y,
-            src.depth(), src.oclchannels(), useDouble ? 1 : 0,
+                src.depth(), src.oclchannels(), useDouble ? 1 : 0,
-            anchor.x, anchor.y, ksize.width, ksize.height, kernel_size_y2_aligned,
+                anchor.x, anchor.y, ksize.width, ksize.height, kernel_size_y2_aligned,
-            btype,
+                btype,
-            extra_extrapolation ? "EXTRA_EXTRAPOLATION" : "NO_EXTRA_EXTRAPOLATION",
+                extra_extrapolation ? "EXTRA_EXTRAPOLATION" : "NO_EXTRA_EXTRAPOLATION",
-            isIsolatedBorder ? "BORDER_ISOLATED" : "NO_BORDER_ISOLATED");
+                isIsolatedBorder ? "BORDER_ISOLATED" : "NO_BORDER_ISOLATED");
-    size_t gt[3] = {divUp(dst.cols, BLOCK_SIZE - (ksize.width - 1)) * BLOCK_SIZE, divUp(dst.rows, BLOCK_SIZE_Y), 1}, lt[3] = {BLOCK_SIZE, 1, 1};
+        size_t lt[3] = {BLOCK_SIZE, 1, 1};
-    openCLExecuteKernel(src.clCxt, &filtering_filter2D, "filter2D", gt, lt, args, -1, -1, build_options);
+        size_t gt[3] = {divUp(dst.cols, BLOCK_SIZE - (ksize.width - 1)) * BLOCK_SIZE, divUp(dst.rows, BLOCK_SIZE_Y), 1};
        cl_kernel kernel = openCLGetKernelFromSource(src.clCxt, &filtering_filter2D, "filter2D", -1, -1, build_options);
        size_t kernelWorkGroupSize;
        openCLSafeCall(clGetKernelWorkGroupInfo(kernel, getClDeviceID(src.clCxt),
                                                CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &kernelWorkGroupSize, 0));
        if (lt[0] > kernelWorkGroupSize)
        {
            clReleaseKernel(kernel);
            CV_Assert(BLOCK_SIZE > kernelWorkGroupSize);
            tryWorkItems = kernelWorkGroupSize;
            continue;
        }
        openCLExecuteKernel(src.clCxt, kernel, gt, lt, args); // kernel will be released here
    } while (false);
 }
 Ptr<BaseFilter_GPU> cv::ocl::getLinearFilter_GPU(int /*srcType*/, int /*dstType*/, const Mat &kernel, const Size &ksize,
@@ -770,106 +790,126 @@ static void GPUFilterBox(const oclMat &src, oclMat &dst,
              (src.rows == dst.rows));
    CV_Assert(src.oclchannels() == dst.oclchannels());
-    size_t BLOCK_SIZE = src.clCxt->getDeviceInfo().maxWorkItemSizes[0];
+    size_t tryWorkItems = src.clCxt->getDeviceInfo().maxWorkItemSizes[0];
-    size_t BLOCK_SIZE_Y = 8; // TODO Check heuristic value on devices
+    do {
-    while (BLOCK_SIZE_Y < BLOCK_SIZE / 8 && BLOCK_SIZE_Y * src.clCxt->getDeviceInfo().maxComputeUnits * 32 < (size_t)src.rows)
+        size_t BLOCK_SIZE = tryWorkItems;
-        BLOCK_SIZE_Y *= 2;
+        while (BLOCK_SIZE > 32 && BLOCK_SIZE >= (size_t)ksize.width * 2 && BLOCK_SIZE > (size_t)src.cols * 2)
            BLOCK_SIZE /= 2;
        size_t BLOCK_SIZE_Y = 8; // TODO Check heuristic value on devices
        while (BLOCK_SIZE_Y < BLOCK_SIZE / 8 && BLOCK_SIZE_Y * src.clCxt->getDeviceInfo().maxComputeUnits * 32 < (size_t)src.rows)
            BLOCK_SIZE_Y *= 2;
-    CV_Assert((size_t)ksize.width <= BLOCK_SIZE);
+        CV_Assert((size_t)ksize.width <= BLOCK_SIZE);
-    bool isIsolatedBorder = (borderType & BORDER_ISOLATED) != 0;
+        bool isIsolatedBorder = (borderType & BORDER_ISOLATED) != 0;
-    vector<pair<size_t , const void *> > args;
+        vector<pair<size_t , const void *> > args;
-    args.push_back( make_pair( sizeof(cl_mem), (void *)&src.data));
+        args.push_back( make_pair( sizeof(cl_mem), (void *)&src.data));
-    cl_uint stepBytes = src.step;
+        cl_uint stepBytes = src.step;
-    args.push_back( make_pair( sizeof(cl_uint), (void *)&stepBytes));
+        args.push_back( make_pair( sizeof(cl_uint), (void *)&stepBytes));
-    int offsetXBytes = src.offset % src.step;
+        int offsetXBytes = src.offset % src.step;
-    int offsetX = offsetXBytes / src.elemSize();
+        int offsetX = offsetXBytes / src.elemSize();
-    CV_Assert((int)(offsetX * src.elemSize()) == offsetXBytes);
+        CV_Assert((int)(offsetX * src.elemSize()) == offsetXBytes);
-    int offsetY = src.offset / src.step;
+        int offsetY = src.offset / src.step;
-    int endX = (offsetX + src.cols);
+        int endX = (offsetX + src.cols);
-    int endY = (offsetY + src.rows);
+        int endY = (offsetY + src.rows);
-    cl_int rect[4] = {offsetX, offsetY, endX, endY};
+        cl_int rect[4] = {offsetX, offsetY, endX, endY};
-    if (!isIsolatedBorder)
+        if (!isIsolatedBorder)
-    {
+        {
-        rect[2] = src.wholecols;
+            rect[2] = src.wholecols;
-        rect[3] = src.wholerows;
+            rect[3] = src.wholerows;
-    }
+        }
-    args.push_back( make_pair( sizeof(cl_int)*4, (void *)&rect[0]));
+        args.push_back( make_pair( sizeof(cl_int)*4, (void *)&rect[0]));
-    args.push_back( make_pair( sizeof(cl_mem), (void *)&dst.data));
+        args.push_back( make_pair( sizeof(cl_mem), (void *)&dst.data));
-    cl_uint _stepBytes = dst.step;
+        cl_uint _stepBytes = dst.step;
-    args.push_back( make_pair( sizeof(cl_uint), (void *)&_stepBytes));
+        args.push_back( make_pair( sizeof(cl_uint), (void *)&_stepBytes));
-    int _offsetXBytes = dst.offset % dst.step;
+        int _offsetXBytes = dst.offset % dst.step;
-    int _offsetX = _offsetXBytes / dst.elemSize();
+        int _offsetX = _offsetXBytes / dst.elemSize();
-    CV_Assert((int)(_offsetX * dst.elemSize()) == _offsetXBytes);
+        CV_Assert((int)(_offsetX * dst.elemSize()) == _offsetXBytes);
-    int _offsetY = dst.offset / dst.step;
+        int _offsetY = dst.offset / dst.step;
-    int _endX = (_offsetX + dst.cols);
+        int _endX = (_offsetX + dst.cols);
-    int _endY = (_offsetY + dst.rows);
+        int _endY = (_offsetY + dst.rows);
-    cl_int _rect[4] = {_offsetX, _offsetY, _endX, _endY};
+        cl_int _rect[4] = {_offsetX, _offsetY, _endX, _endY};
-    args.push_back( make_pair( sizeof(cl_int)*4, (void *)&_rect[0]));
+        args.push_back( make_pair( sizeof(cl_int)*4, (void *)&_rect[0]));
-    bool useDouble = src.depth() == CV_64F;
+        bool useDouble = src.depth() == CV_64F;
-    float borderValue[4] = {0, 0, 0, 0}; // DON'T move into 'if' body
+        float borderValue[4] = {0, 0, 0, 0}; // DON'T move into 'if' body
-    double borderValueDouble[4] = {0, 0, 0, 0}; // DON'T move into 'if' body
+        double borderValueDouble[4] = {0, 0, 0, 0}; // DON'T move into 'if' body
-    if ((borderType & ~BORDER_ISOLATED) == BORDER_CONSTANT)
+        if ((borderType & ~BORDER_ISOLATED) == BORDER_CONSTANT)
-    {
+        {
            if (useDouble)
                args.push_back( make_pair( sizeof(double) * src.oclchannels(), (void *)&borderValue[0]));
            else
                args.push_back( make_pair( sizeof(float) * src.oclchannels(), (void *)&borderValueDouble[0]));
        }
        double alphaDouble = alpha; // DON'T move into 'if' body
        if (useDouble)
-            args.push_back( make_pair( sizeof(double) * src.oclchannels(), (void *)&borderValue[0]));
+            args.push_back( make_pair( sizeof(double), (void *)&alphaDouble));
        else
-            args.push_back( make_pair( sizeof(float) * src.oclchannels(), (void *)&borderValueDouble[0]));
+            args.push_back( make_pair( sizeof(float), (void *)&alpha));
    }
-    double alphaDouble = alpha; // DON'T move into 'if' body
+        const char* btype = NULL;
    if (useDouble)
        args.push_back( make_pair( sizeof(double), (void *)&alphaDouble));
    else
        args.push_back( make_pair( sizeof(float), (void *)&alpha));
-    const char* btype = NULL;
+        switch (borderType & ~BORDER_ISOLATED)
        {
        case BORDER_CONSTANT:
            btype = "BORDER_CONSTANT";
            break;
        case BORDER_REPLICATE:
            btype = "BORDER_REPLICATE";
            break;
        case BORDER_REFLECT:
            btype = "BORDER_REFLECT";
            break;
        case BORDER_WRAP:
            CV_Error(CV_StsUnsupportedFormat, "BORDER_WRAP is not supported!");
            return;
        case BORDER_REFLECT101:
            btype = "BORDER_REFLECT_101";
            break;
        }
-    switch (borderType & ~BORDER_ISOLATED)
+        int requiredTop = anchor.y;
-    {
+        int requiredLeft = BLOCK_SIZE; // not this: anchor.x;
-    case BORDER_CONSTANT:
+        int requiredBottom = ksize.height - 1 - anchor.y;
-        btype = "BORDER_CONSTANT";
+        int requiredRight = BLOCK_SIZE; // not this: ksize.width - 1 - anchor.x;
-        break;
+        int h = isIsolatedBorder ? src.rows : src.wholerows;
-    case BORDER_REPLICATE:
+        int w = isIsolatedBorder ? src.cols : src.wholecols;
-        btype = "BORDER_REPLICATE";
+        bool extra_extrapolation = h < requiredTop || h < requiredBottom || w < requiredLeft || w < requiredRight;
        break;
    case BORDER_REFLECT:
        btype = "BORDER_REFLECT";
        break;
    case BORDER_WRAP:
        CV_Error(CV_StsUnsupportedFormat, "BORDER_WRAP is not supported!");
        return;
    case BORDER_REFLECT101:
        btype = "BORDER_REFLECT_101";
        break;
    }
-    int requiredTop = anchor.y;
+        CV_Assert(w >= ksize.width && h >= ksize.height); // TODO Other cases are not tested well
    int requiredLeft = BLOCK_SIZE; // not this: anchor.x;
    int requiredBottom = ksize.height - 1 - anchor.y;
    int requiredRight = BLOCK_SIZE; // not this: ksize.width - 1 - anchor.x;
    int h = isIsolatedBorder ? src.rows : src.wholerows;
    int w = isIsolatedBorder ? src.cols : src.wholecols;
    bool extra_extrapolation = h < requiredTop || h < requiredBottom || w < requiredLeft || w < requiredRight;
-    CV_Assert(w >= ksize.width && h >= ksize.height); // TODO Other cases are not tested well
+        char build_options[1024];
        sprintf(build_options, "-D LOCAL_SIZE=%d -D BLOCK_SIZE_Y=%d -D DATA_DEPTH=%d -D DATA_CHAN=%d -D USE_DOUBLE=%d -D ANCHOR_X=%d -D ANCHOR_Y=%d -D KERNEL_SIZE_X=%d -D KERNEL_SIZE_Y=%d -D %s -D %s -D %s",
                (int)BLOCK_SIZE, (int)BLOCK_SIZE_Y,
                src.depth(), src.oclchannels(), useDouble ? 1 : 0,
                anchor.x, anchor.y, ksize.width, ksize.height,
                btype,
                extra_extrapolation ? "EXTRA_EXTRAPOLATION" : "NO_EXTRA_EXTRAPOLATION",
                isIsolatedBorder ? "BORDER_ISOLATED" : "NO_BORDER_ISOLATED");
-    char build_options[1024];
+        size_t lt[3] = {BLOCK_SIZE, 1, 1};
-    sprintf(build_options, "-D LOCAL_SIZE=%d -D BLOCK_SIZE_Y=%d -D DATA_DEPTH=%d -D DATA_CHAN=%d -D USE_DOUBLE=%d -D ANCHOR_X=%d -D ANCHOR_Y=%d -D KERNEL_SIZE_X=%d -D KERNEL_SIZE_Y=%d -D %s -D %s -D %s",
+        size_t gt[3] = {divUp(dst.cols, BLOCK_SIZE - (ksize.width - 1)) * BLOCK_SIZE, divUp(dst.rows, BLOCK_SIZE_Y), 1};
            (int)BLOCK_SIZE, (int)BLOCK_SIZE_Y,
            src.depth(), src.oclchannels(), useDouble ? 1 : 0,
            anchor.x, anchor.y, ksize.width, ksize.height,
            btype,
            extra_extrapolation ? "EXTRA_EXTRAPOLATION" : "NO_EXTRA_EXTRAPOLATION",
            isIsolatedBorder ? "BORDER_ISOLATED" : "NO_BORDER_ISOLATED");
-    size_t gt[3] = {divUp(dst.cols, BLOCK_SIZE - (ksize.width - 1)) * BLOCK_SIZE, divUp(dst.rows, BLOCK_SIZE_Y), 1}, lt[3] = {BLOCK_SIZE, 1, 1};
+        cl_kernel kernel = openCLGetKernelFromSource(src.clCxt, &filtering_boxFilter, "boxFilter", -1, -1, build_options);
-    openCLExecuteKernel(src.clCxt, &filtering_boxFilter, "boxFilter", gt, lt, args, -1, -1, build_options);
+
        size_t kernelWorkGroupSize;
        openCLSafeCall(clGetKernelWorkGroupInfo(kernel, getClDeviceID(src.clCxt),
                                                CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &kernelWorkGroupSize, 0));
        if (lt[0] > kernelWorkGroupSize)
        {
            clReleaseKernel(kernel);
            CV_Assert(BLOCK_SIZE > kernelWorkGroupSize);
            tryWorkItems = kernelWorkGroupSize;
            continue;
        }
        openCLExecuteKernel(src.clCxt, kernel, gt, lt, args); // kernel will be released here
    } while (false);
 }
 Ptr<BaseFilter_GPU> cv::ocl::getBoxFilter_GPU(int /*srcType*/, int /*dstType*/,