From e05112a36482ffef44243dc69ed9a047162a2d80 Mon Sep 17 00:00:00 2001
From: yao <bitwangyaoyao@gmail.com>
Date: Thu, 17 Jan 2013 09:34:43 +0800
Subject: [PATCH] some host-side optimizations to ocl::GaussianBlur

---
 modules/ocl/src/filtering.cpp | 345 +++++++++++++++++++++-------------
 1 file changed, 215 insertions(+), 130 deletions(-)
diff --git a/modules/ocl/src/filtering.cpp b/modules/ocl/src/filtering.cpp
index d0500059f..18fed6e8c 100644
--- a/modules/ocl/src/filtering.cpp
+++ b/modules/ocl/src/filtering.cpp
@@ -47,6 +47,7 @@
 //M*/
 
 #include "precomp.hpp"
+#include "mcwutil.hpp"
 #include <iostream>
 using namespace std;
 using namespace cv;
@@ -109,7 +110,7 @@ Ptr<FilterEngine_GPU> cv::ocl::createLinearFilter_GPU(int, int, const Mat &, con
     return Ptr<FilterEngine_GPU>(0);
 }
 
-Ptr<FilterEngine_GPU> cv::ocl::createDerivFilter_GPU( int srcType, int dstType, int dx, int dy, int ksize, int borderType )
+Ptr<FilterEngine_GPU> cv::ocl::createDerivFilter_GPU(int srcType, int dstType, int dx, int dy, int ksize, int borderType)
 {
     throw_nogpu();
     return Ptr<FilterEngine_GPU>(0);
@@ -150,17 +151,17 @@ void cv::ocl::Laplacian(const oclMat &, oclMat &, int, int, double)
     throw_nogpu();
 }
 
-void cv::ocl::erode( const oclMat &, oclMat &, const Mat &, Point, int)
+void cv::ocl::erode(const oclMat &, oclMat &, const Mat &, Point, int)
 {
     throw_nogpu();
 }
 
-void cv::ocl::dilate( const oclMat &, oclMat &, const Mat &, Point, int)
+void cv::ocl::dilate(const oclMat &, oclMat &, const Mat &, Point, int)
 {
     throw_nogpu();
 }
 
-void cv::ocl::morphologyEx( const oclMat &, oclMat &, int, const Mat &, Point, int)
+void cv::ocl::morphologyEx(const oclMat &, oclMat &, int, const Mat &, Point, int)
 {
     throw_nogpu();
 }
@@ -194,7 +195,9 @@ namespace
 inline void normalizeAnchor(int &anchor, int ksize)
 {
     if (anchor < 0)
+    {
         anchor = ksize >> 1;
+    }
 
     CV_Assert(0 <= anchor && anchor < ksize);
 }
@@ -208,7 +211,10 @@ inline void normalizeAnchor(Point &anchor, const Size &ksize)
 inline void normalizeROI(Rect &roi, const Size &ksize, const Point &anchor, const Size &src_size)
 {
     if (roi == Rect(0, 0, -1, -1))
+    {
         roi = Rect(0, 0, src_size.width, src_size.height);
+    }
+
     CV_Assert(ksize.height > 0 && ksize.width > 0 && ((ksize.height & 1) == 1) && ((ksize.width & 1) == 1));
     CV_Assert((anchor.x == -1 && anchor.y == -1) || (anchor.x == ksize.width >> 1 && anchor.y == ksize.height >> 1));
     CV_Assert(roi.x >= 0 && roi.y >= 0 && roi.width <= src_size.width && roi.height <= src_size.height);
@@ -218,7 +224,11 @@ inline void normalizeROI(Rect &roi, const Size &ksize, const Point &anchor, cons
 inline void normalizeKernel(const Mat &kernel, oclMat &gpu_krnl, int type = CV_8U, int *nDivisor = 0, bool reverse = false)
 {
     int scale = nDivisor && (kernel.depth() == CV_32F || kernel.depth() == CV_64F) ? 256 : 1;
-    if (nDivisor) *nDivisor = scale;
+
+    if (nDivisor)
+    {
+        *nDivisor = scale;
+    }
 
     Mat temp(kernel.size(), type);
     kernel.convertTo(temp, type, scale);
@@ -227,6 +237,7 @@ inline void normalizeKernel(const Mat &kernel, oclMat &gpu_krnl, int type = CV_8
     if (reverse)
     {
         int count = cont_krnl.cols >> 1;
+
         for (int i = 0; i < count; ++i)
         {
             std::swap(cont_krnl.at<int>(0, i), cont_krnl.at<int>(0, cont_krnl.cols - 1 - i));
@@ -326,9 +337,9 @@ void GPUErode(const oclMat &src, oclMat &dst, oclMat &mat_kernel, Size &ksize, c
     //Normalize the result by default
     //float alpha = ksize.height * ksize.width;
     CV_Assert(src.clCxt == dst.clCxt);
-    CV_Assert( (src.cols == dst.cols) &&
-               (src.rows == dst.rows) );
-    CV_Assert( (src.oclchannels() == dst.oclchannels()) );
+    CV_Assert((src.cols == dst.cols) &&
+              (src.rows == dst.rows));
+    CV_Assert((src.oclchannels() == dst.oclchannels()));
 
     int srcStep = src.step1() / src.oclchannels();
     int dstStep = dst.step1() / dst.oclchannels();
@@ -342,19 +353,21 @@ void GPUErode(const oclMat &src, oclMat &dst, oclMat &mat_kernel, Size &ksize, c
     size_t localThreads[3] = {16, 16, 1};
     size_t globalThreads[3] = {(src.cols + localThreads[0] - 1) / localThreads[0] *localThreads[0], (src.rows + localThreads[1] - 1) / localThreads[1] *localThreads[1], 1};
 
-    if(src.type() == CV_8UC1)
+    if (src.type() == CV_8UC1)
     {
         kernelName = "morph_C1_D0";
         globalThreads[0] = ((src.cols + 3) / 4 + localThreads[0] - 1) / localThreads[0] * localThreads[0];
-        CV_Assert( localThreads[0]*localThreads[1] * 8 >= (localThreads[0] * 4 + ksize.width - 1) * (localThreads[1] + ksize.height - 1) );
+        CV_Assert(localThreads[0]*localThreads[1] * 8 >= (localThreads[0] * 4 + ksize.width - 1) * (localThreads[1] + ksize.height - 1));
     }
     else
     {
         kernelName = "morph";
-        CV_Assert( localThreads[0]*localThreads[1] * 2 >= (localThreads[0] + ksize.width - 1) * (localThreads[1] + ksize.height - 1) );
+        CV_Assert(localThreads[0]*localThreads[1] * 2 >= (localThreads[0] + ksize.width - 1) * (localThreads[1] + ksize.height - 1));
     }
+
     char s[64];
-    switch(src.type())
+
+    switch (src.type())
     {
     case CV_8UC1:
         sprintf(s, "-D VAL=255");
@@ -373,21 +386,22 @@ void GPUErode(const oclMat &src, oclMat &dst, oclMat &mat_kernel, Size &ksize, c
     default:
         CV_Error(CV_StsUnsupportedFormat, "unsupported type");
     }
+
     char compile_option[128];
     sprintf(compile_option, "-D RADIUSX=%d -D RADIUSY=%d -D LSIZE0=%d -D LSIZE1=%d -D ERODE %s", anchor.x, anchor.y, localThreads[0], localThreads[1], s);
     vector< pair<size_t, const void *> > args;
-    args.push_back( make_pair( sizeof(cl_mem), (void *)&src.data));
-    args.push_back( make_pair( sizeof(cl_mem), (void *)&dst.data));
-    args.push_back( make_pair( sizeof(cl_int), (void *)&srcOffset_x));
-    args.push_back( make_pair( sizeof(cl_int), (void *)&srcOffset_y));
-    args.push_back( make_pair( sizeof(cl_int), (void *)&src.cols));
-    args.push_back( make_pair( sizeof(cl_int), (void *)&src.rows));
-    args.push_back( make_pair( sizeof(cl_int), (void *)&srcStep));
-    args.push_back( make_pair( sizeof(cl_int), (void *)&dstStep));
-    args.push_back( make_pair( sizeof(cl_mem), (void *)&mat_kernel.data));
-    args.push_back( make_pair( sizeof(cl_int), (void *)&src.wholecols));
-    args.push_back( make_pair( sizeof(cl_int), (void *)&src.wholerows));
-    args.push_back( make_pair( sizeof(cl_int), (void *)&dstOffset));
+    args.push_back(make_pair(sizeof(cl_mem), (void *)&src.data));
+    args.push_back(make_pair(sizeof(cl_mem), (void *)&dst.data));
+    args.push_back(make_pair(sizeof(cl_int), (void *)&srcOffset_x));
+    args.push_back(make_pair(sizeof(cl_int), (void *)&srcOffset_y));
+    args.push_back(make_pair(sizeof(cl_int), (void *)&src.cols));
+    args.push_back(make_pair(sizeof(cl_int), (void *)&src.rows));
+    args.push_back(make_pair(sizeof(cl_int), (void *)&srcStep));
+    args.push_back(make_pair(sizeof(cl_int), (void *)&dstStep));
+    args.push_back(make_pair(sizeof(cl_mem), (void *)&mat_kernel.data));
+    args.push_back(make_pair(sizeof(cl_int), (void *)&src.wholecols));
+    args.push_back(make_pair(sizeof(cl_int), (void *)&src.wholerows));
+    args.push_back(make_pair(sizeof(cl_int), (void *)&dstOffset));
     openCLExecuteKernel(clCxt, &filtering_morph, kernelName, globalThreads, localThreads, args, -1, -1, compile_option);
 }
 
@@ -398,9 +412,9 @@ void GPUDilate(const oclMat &src, oclMat &dst, oclMat &mat_kernel, Size &ksize,
     //Normalize the result by default
     //float alpha = ksize.height * ksize.width;
     CV_Assert(src.clCxt == dst.clCxt);
-    CV_Assert( (src.cols == dst.cols) &&
-               (src.rows == dst.rows) );
-    CV_Assert( (src.oclchannels() == dst.oclchannels()) );
+    CV_Assert((src.cols == dst.cols) &&
+              (src.rows == dst.rows));
+    CV_Assert((src.oclchannels() == dst.oclchannels()));
 
     int srcStep = src.step1() / src.oclchannels();
     int dstStep = dst.step1() / dst.oclchannels();
@@ -414,19 +428,21 @@ void GPUDilate(const oclMat &src, oclMat &dst, oclMat &mat_kernel, Size &ksize,
     size_t localThreads[3] = {16, 16, 1};
     size_t globalThreads[3] = {(src.cols + localThreads[0]) / localThreads[0] *localThreads[0], (src.rows + localThreads[1]) / localThreads[1] *localThreads[1], 1};
 
-    if(src.type() == CV_8UC1)
+    if (src.type() == CV_8UC1)
     {
         kernelName = "morph_C1_D0";
         globalThreads[0] = ((src.cols + 3) / 4 + localThreads[0]) / localThreads[0] * localThreads[0];
-        CV_Assert( localThreads[0]*localThreads[1] * 8 >= (localThreads[0] * 4 + ksize.width - 1) * (localThreads[1] + ksize.height - 1) );
+        CV_Assert(localThreads[0]*localThreads[1] * 8 >= (localThreads[0] * 4 + ksize.width - 1) * (localThreads[1] + ksize.height - 1));
     }
     else
     {
         kernelName = "morph";
-        CV_Assert( localThreads[0]*localThreads[1] * 2 >= (localThreads[0] + ksize.width - 1) * (localThreads[1] + ksize.height - 1) );
+        CV_Assert(localThreads[0]*localThreads[1] * 2 >= (localThreads[0] + ksize.width - 1) * (localThreads[1] + ksize.height - 1));
     }
+
     char s[64];
-    switch(src.type())
+
+    switch (src.type())
     {
     case CV_8UC1:
         sprintf(s, "-D VAL=0");
@@ -445,21 +461,22 @@ void GPUDilate(const oclMat &src, oclMat &dst, oclMat &mat_kernel, Size &ksize,
     default:
         CV_Error(CV_StsUnsupportedFormat, "unsupported type");
     }
+
     char compile_option[128];
     sprintf(compile_option, "-D RADIUSX=%d -D RADIUSY=%d -D LSIZE0=%d -D LSIZE1=%d -D DILATE %s", anchor.x, anchor.y, localThreads[0], localThreads[1], s);
     vector< pair<size_t, const void *> > args;
-    args.push_back( make_pair( sizeof(cl_mem), (void *)&src.data));
-    args.push_back( make_pair( sizeof(cl_mem), (void *)&dst.data));
-    args.push_back( make_pair( sizeof(cl_int), (void *)&srcOffset_x));
-    args.push_back( make_pair( sizeof(cl_int), (void *)&srcOffset_y));
-    args.push_back( make_pair( sizeof(cl_int), (void *)&src.cols));
-    args.push_back( make_pair( sizeof(cl_int), (void *)&src.rows));
-    args.push_back( make_pair( sizeof(cl_int), (void *)&srcStep));
-    args.push_back( make_pair( sizeof(cl_int), (void *)&dstStep));
-    args.push_back( make_pair( sizeof(cl_mem), (void *)&mat_kernel.data));
-    args.push_back( make_pair( sizeof(cl_int), (void *)&src.wholecols));
-    args.push_back( make_pair( sizeof(cl_int), (void *)&src.wholerows));
-    args.push_back( make_pair( sizeof(cl_int), (void *)&dstOffset));
+    args.push_back(make_pair(sizeof(cl_mem), (void *)&src.data));
+    args.push_back(make_pair(sizeof(cl_mem), (void *)&dst.data));
+    args.push_back(make_pair(sizeof(cl_int), (void *)&srcOffset_x));
+    args.push_back(make_pair(sizeof(cl_int), (void *)&srcOffset_y));
+    args.push_back(make_pair(sizeof(cl_int), (void *)&src.cols));
+    args.push_back(make_pair(sizeof(cl_int), (void *)&src.rows));
+    args.push_back(make_pair(sizeof(cl_int), (void *)&srcStep));
+    args.push_back(make_pair(sizeof(cl_int), (void *)&dstStep));
+    args.push_back(make_pair(sizeof(cl_mem), (void *)&mat_kernel.data));
+    args.push_back(make_pair(sizeof(cl_int), (void *)&src.wholecols));
+    args.push_back(make_pair(sizeof(cl_int), (void *)&src.wholerows));
+    args.push_back(make_pair(sizeof(cl_int), (void *)&dstOffset));
     openCLExecuteKernel(clCxt, &filtering_morph, kernelName, globalThreads, localThreads, args, -1, -1, compile_option);
 }
 
@@ -492,6 +509,7 @@ public:
     virtual void apply(const oclMat &src, oclMat &dst)
     {
         Filter2DEngine_GPU::apply(src, dst);
+
         //if (iters > 1)
         //{
         // Size wholesize;
@@ -506,7 +524,7 @@ public:
         // //Filter2DEngine_GPU::apply(dst, morfBuf);
         // //morfBuf.copyTo(dst);
         //}
-        for(int i = 1; i < iters; ++i)
+        for (int i = 1; i < iters; ++i)
         {
             //dst.swap(morfBuf);
             Size wholesize;
@@ -541,16 +559,17 @@ namespace
 {
 void morphOp(int op, const oclMat &src, oclMat &dst, const Mat &_kernel, Point anchor, int iterations, int borderType, const Scalar &borderValue)
 {
-    if((borderType != cv::BORDER_CONSTANT) || (borderValue != morphologyDefaultBorderValue()))
+    if ((borderType != cv::BORDER_CONSTANT) || (borderValue != morphologyDefaultBorderValue()))
     {
         CV_Error(CV_StsBadArg, "unsupported border type");
     }
+
     Mat kernel;
     Size ksize = _kernel.data ? _kernel.size() : Size(3, 3);
 
     normalizeAnchor(anchor, ksize);
 
-    if (iterations == 0 || _kernel.rows * _kernel.cols == 1)
+    if (iterations == 0 || _kernel.rows *_kernel.cols == 1)
     {
         src.copyTo(dst);
         return;
@@ -572,7 +591,9 @@ void morphOp(int op, const oclMat &src, oclMat &dst, const Mat &_kernel, Point a
         iterations = 1;
     }
     else
+    {
         kernel = _kernel;
+    }
 
     Ptr<FilterEngine_GPU> f = createMorphologyFilter_GPU(op, src.type(), kernel, anchor, iterations);
 
@@ -580,63 +601,69 @@ void morphOp(int op, const oclMat &src, oclMat &dst, const Mat &_kernel, Point a
 }
 }
 
-void cv::ocl::erode( const oclMat &src, oclMat &dst, const Mat &kernel, Point anchor, int iterations,
-                     int borderType, const Scalar &borderValue)
+void cv::ocl::erode(const oclMat &src, oclMat &dst, const Mat &kernel, Point anchor, int iterations,
+                    int borderType, const Scalar &borderValue)
 {
     bool allZero = true;
-    for(int i = 0; i < kernel.rows * kernel.cols; ++i)
-        if(kernel.data[i] != 0)
+
+    for (int i = 0; i < kernel.rows * kernel.cols; ++i)
+        if (kernel.data[i] != 0)
+        {
             allZero = false;
-    if(allZero)
+        }
+
+    if (allZero)
     {
         kernel.data[0] = 1;
     }
+
     morphOp(MORPH_ERODE, src, dst, kernel, anchor, iterations, borderType, borderValue);
 }
 
-void cv::ocl::dilate( const oclMat &src, oclMat &dst, const Mat &kernel, Point anchor, int iterations,
-                      int borderType, const Scalar &borderValue)
+void cv::ocl::dilate(const oclMat &src, oclMat &dst, const Mat &kernel, Point anchor, int iterations,
+                     int borderType, const Scalar &borderValue)
 {
     morphOp(MORPH_DILATE, src, dst, kernel, anchor, iterations, borderType, borderValue);
 }
 
-void cv::ocl::morphologyEx( const oclMat &src, oclMat &dst, int op, const Mat &kernel, Point anchor, int iterations,
-                            int borderType, const Scalar &borderValue)
+void cv::ocl::morphologyEx(const oclMat &src, oclMat &dst, int op, const Mat &kernel, Point anchor, int iterations,
+                           int borderType, const Scalar &borderValue)
 {
     oclMat temp;
-    switch( op )
+
+    switch (op)
     {
     case MORPH_ERODE:
-        erode( src, dst, kernel, anchor, iterations, borderType, borderValue);
+        erode(src, dst, kernel, anchor, iterations, borderType, borderValue);
         break;
     case MORPH_DILATE:
-        dilate( src, dst, kernel, anchor, iterations, borderType, borderValue);
+        dilate(src, dst, kernel, anchor, iterations, borderType, borderValue);
         break;
     case MORPH_OPEN:
-        erode( src, temp, kernel, anchor, iterations, borderType, borderValue);
-        dilate( temp, dst, kernel, anchor, iterations, borderType, borderValue);
+        erode(src, temp, kernel, anchor, iterations, borderType, borderValue);
+        dilate(temp, dst, kernel, anchor, iterations, borderType, borderValue);
         break;
     case CV_MOP_CLOSE:
-        dilate( src, temp, kernel, anchor, iterations, borderType, borderValue);
-        erode( temp, dst, kernel, anchor, iterations, borderType, borderValue);
+        dilate(src, temp, kernel, anchor, iterations, borderType, borderValue);
+        erode(temp, dst, kernel, anchor, iterations, borderType, borderValue);
         break;
     case CV_MOP_GRADIENT:
-        erode( src, temp, kernel, anchor, iterations, borderType, borderValue);
-        dilate( src, dst, kernel, anchor, iterations, borderType, borderValue);
+        erode(src, temp, kernel, anchor, iterations, borderType, borderValue);
+        dilate(src, dst, kernel, anchor, iterations, borderType, borderValue);
         subtract(dst, temp, dst);
         break;
     case CV_MOP_TOPHAT:
-        erode( src, dst, kernel, anchor, iterations, borderType, borderValue);
-        dilate( dst, temp, kernel, anchor, iterations, borderType, borderValue);
+        erode(src, dst, kernel, anchor, iterations, borderType, borderValue);
+        dilate(dst, temp, kernel, anchor, iterations, borderType, borderValue);
         subtract(src, temp, dst);
         break;
     case CV_MOP_BLACKHAT:
-        dilate( src, dst, kernel, anchor, iterations, borderType, borderValue);
-        erode( dst, temp, kernel, anchor, iterations, borderType, borderValue);
+        dilate(src, dst, kernel, anchor, iterations, borderType, borderValue);
+        erode(dst, temp, kernel, anchor, iterations, borderType, borderValue);
         subtract(temp, src, dst);
         break;
     default:
-        CV_Error( CV_StsBadArg, "unknown morphological operation" );
+        CV_Error(CV_StsBadArg, "unknown morphological operation");
     }
 }
 
@@ -668,10 +695,10 @@ void GPUFilter2D(const oclMat &src, oclMat &dst, oclMat &mat_kernel,
                  Size &ksize, const Point anchor, const int borderType)
 {
     CV_Assert(src.clCxt == dst.clCxt);
-    CV_Assert( (src.cols == dst.cols) &&
-               (src.rows == dst.rows) );
-    CV_Assert( (src.oclchannels() == dst.oclchannels()) );
-    CV_Assert( (borderType != 0) );
+    CV_Assert((src.cols == dst.cols) &&
+              (src.rows == dst.rows));
+    CV_Assert((src.oclchannels() == dst.oclchannels()));
+    CV_Assert((borderType != 0));
     CV_Assert(ksize.height > 0 && ksize.width > 0 && ((ksize.height & 1) == 1) && ((ksize.width & 1) == 1));
     CV_Assert((anchor.x == -1 && anchor.y == -1) || (anchor.x == ksize.width >> 1 && anchor.y == ksize.height >> 1));
     Context *clCxt = src.clCxt;
@@ -703,20 +730,20 @@ void GPUFilter2D(const oclMat &src, oclMat &dst, oclMat &mat_kernel,
                               };
 
     vector< pair<size_t, const void *> > args;
-    args.push_back( make_pair( sizeof(cl_mem), (void *)&src.data));
-    args.push_back( make_pair( sizeof(cl_int), (void *)&src.step));
-    args.push_back( make_pair( sizeof(cl_int), (void *)&src_offset_x));
-    args.push_back( make_pair( sizeof(cl_int), (void *)&src_offset_y));
-    args.push_back( make_pair( sizeof(cl_mem), (void *)&dst.data));
-    args.push_back( make_pair( sizeof(cl_int), (void *)&dst.step));
-    args.push_back( make_pair( sizeof(cl_int), (void *)&dst_offset_x));
-    args.push_back( make_pair( sizeof(cl_int), (void *)&dst_offset_y));
-    args.push_back( make_pair( sizeof(cl_mem), (void *)&mat_kernel.data));
-    args.push_back( make_pair( sizeof(cl_int), (void *)&src.cols));
-    args.push_back( make_pair( sizeof(cl_int), (void *)&src.rows));
-    args.push_back( make_pair( sizeof(cl_int), (void *)&cols));
-    args.push_back( make_pair( sizeof(cl_int), (void *)&src.wholecols));
-    args.push_back( make_pair( sizeof(cl_int), (void *)&src.wholerows));
+    args.push_back(make_pair(sizeof(cl_mem), (void *)&src.data));
+    args.push_back(make_pair(sizeof(cl_int), (void *)&src.step));
+    args.push_back(make_pair(sizeof(cl_int), (void *)&src_offset_x));
+    args.push_back(make_pair(sizeof(cl_int), (void *)&src_offset_y));
+    args.push_back(make_pair(sizeof(cl_mem), (void *)&dst.data));
+    args.push_back(make_pair(sizeof(cl_int), (void *)&dst.step));
+    args.push_back(make_pair(sizeof(cl_int), (void *)&dst_offset_x));
+    args.push_back(make_pair(sizeof(cl_int), (void *)&dst_offset_y));
+    args.push_back(make_pair(sizeof(cl_mem), (void *)&mat_kernel.data));
+    args.push_back(make_pair(sizeof(cl_int), (void *)&src.cols));
+    args.push_back(make_pair(sizeof(cl_int), (void *)&src.rows));
+    args.push_back(make_pair(sizeof(cl_int), (void *)&cols));
+    args.push_back(make_pair(sizeof(cl_int), (void *)&src.wholecols));
+    args.push_back(make_pair(sizeof(cl_int), (void *)&src.wholerows));
 
     openCLExecuteKernel(clCxt, &filtering_laplacian, kernelName, globalThreads, localThreads, args, cn, depth);
 }
@@ -750,8 +777,10 @@ Ptr<FilterEngine_GPU> cv::ocl::createLinearFilter_GPU(int srcType, int dstType,
 void cv::ocl::filter2D(const oclMat &src, oclMat &dst, int ddepth, const Mat &kernel, Point anchor, int borderType)
 {
 
-    if( ddepth < 0 )
+    if (ddepth < 0)
+    {
         ddepth = src.depth();
+    }
 
     dst.create(src.size(), CV_MAKETYPE(ddepth, src.channels()));
 
@@ -782,10 +811,10 @@ public:
 
         int cn = src.oclchannels();
         //dst.create(src_size, src_type);
-        dst = Scalar(0.0);
+        //dst = Scalar(0.0);
         //dstBuf.create(src_size, src_type);
         dstBuf.create(src_size.height + ksize.height - 1, src_size.width, CV_MAKETYPE(CV_32F, cn));
-        dstBuf = Scalar(0.0);
+        //dstBuf = Scalar(0.0);
 
         normalizeROI(roi, ksize, anchor, src_size);
 
@@ -829,13 +858,14 @@ void GPUFilterBox_8u_C1R(const oclMat &src, oclMat &dst,
 
     CV_Assert(src.clCxt == dst.clCxt);
     CV_Assert((src.cols == dst.cols) &&
-              (src.rows == dst.rows) );
+              (src.rows == dst.rows));
     Context *clCxt = src.clCxt;
 
     string kernelName = "boxFilter_C1_D0";
 
     char btype[30];
-    switch(borderType)
+
+    switch (borderType)
     {
     case 0:
         sprintf(btype, "BORDER_CONSTANT");
@@ -890,13 +920,14 @@ void GPUFilterBox_8u_C4R(const oclMat &src, oclMat &dst,
 
     CV_Assert(src.clCxt == dst.clCxt);
     CV_Assert((src.cols == dst.cols) &&
-              (src.rows == dst.rows) );
+              (src.rows == dst.rows));
     Context *clCxt = src.clCxt;
 
     string kernelName = "boxFilter_C4_D0";
 
     char btype[30];
-    switch(borderType)
+
+    switch (borderType)
     {
     case 0:
         sprintf(btype, "BORDER_CONSTANT");
@@ -951,13 +982,14 @@ void GPUFilterBox_32F_C1R(const oclMat &src, oclMat &dst,
 
     CV_Assert(src.clCxt == dst.clCxt);
     CV_Assert((src.cols == dst.cols) &&
-              (src.rows == dst.rows) );
+              (src.rows == dst.rows));
     Context *clCxt = src.clCxt;
 
     string kernelName = "boxFilter_C1_D5";
 
     char btype[30];
-    switch(borderType)
+
+    switch (borderType)
     {
     case 0:
         sprintf(btype, "BORDER_CONSTANT");
@@ -1013,13 +1045,14 @@ void GPUFilterBox_32F_C4R(const oclMat &src, oclMat &dst,
 
     CV_Assert(src.clCxt == dst.clCxt);
     CV_Assert((src.cols == dst.cols) &&
-              (src.rows == dst.rows) );
+              (src.rows == dst.rows));
     Context *clCxt = src.clCxt;
 
     string kernelName = "boxFilter_C4_D5";
 
     char btype[30];
-    switch(borderType)
+
+    switch (borderType)
     {
     case 0:
         sprintf(btype, "BORDER_CONSTANT");
@@ -1095,8 +1128,11 @@ void cv::ocl::boxFilter(const oclMat &src, oclMat &dst, int ddepth, Size ksize,
                         Point anchor, int borderType)
 {
     int sdepth = src.depth(), cn = src.channels();
-    if( ddepth < 0 )
+
+    if (ddepth < 0)
+    {
         ddepth = sdepth;
+    }
 
     dst.create(src.size(), CV_MAKETYPE(ddepth, cn));
 
@@ -1161,7 +1197,8 @@ void linearRowFilter_gpu(const oclMat &src, const oclMat &dst, oclMat mat_kernel
     string kernelName = "row_filter";
 
     char btype[30];
-    switch(bordertype)
+
+    switch (bordertype)
     {
     case 0:
         sprintf(btype, "BORDER_CONSTANT");
@@ -1179,15 +1216,17 @@ void linearRowFilter_gpu(const oclMat &src, const oclMat &dst, oclMat mat_kernel
         sprintf(btype, "BORDER_REFLECT_101");
         break;
     }
+
     char compile_option[128];
     sprintf(compile_option, "-D RADIUSX=%d -D LSIZE0=%d -D LSIZE1=%d -D CN=%d -D %s", anchor, localThreads[0], localThreads[1], channels, btype);
 
     size_t globalThreads[3];
     globalThreads[1] = (dst.rows + localThreads[1] - 1) / localThreads[1] * localThreads[1];
     globalThreads[2] = (1 + localThreads[2] - 1) / localThreads[2] * localThreads[2];
-    if(src.depth() == CV_8U)
+
+    if (src.depth() == CV_8U)
     {
-        switch(channels)
+        switch (channels)
         {
         case 1:
         case 3:
@@ -1205,6 +1244,7 @@ void linearRowFilter_gpu(const oclMat &src, const oclMat &dst, oclMat mat_kernel
     {
         globalThreads[0] = (dst.cols + localThreads[0] - 1) / localThreads[0] * localThreads[0];
     }
+
     //sanity checks
     CV_Assert(clCxt == dst.clCxt);
     CV_Assert(src.cols == dst.cols);
@@ -1232,7 +1272,7 @@ void linearRowFilter_gpu(const oclMat &src, const oclMat &dst, oclMat mat_kernel
     args.push_back(make_pair(sizeof(cl_int), (void *)&ridusy));
     args.push_back(make_pair(sizeof(cl_mem), (void *)&mat_kernel.data));
 
-    openCLExecuteKernel(clCxt, &filter_sep_row, kernelName, globalThreads, localThreads, args, channels, src.depth(), compile_option);
+    openCLExecuteKernel2(clCxt, &filter_sep_row, kernelName, globalThreads, localThreads, args, channels, src.depth(), compile_option, CLFLUSH);
 }
 
 Ptr<BaseRowFilter_GPU> cv::ocl::getLinearRowFilter_GPU(int srcType, int /*bufType*/, const Mat &rowKernel, int anchor, int bordertype)
@@ -1289,7 +1329,8 @@ void linearColumnFilter_gpu(const oclMat &src, const oclMat &dst, oclMat mat_ker
     string kernelName = "col_filter";
 
     char btype[30];
-    switch(bordertype)
+
+    switch (bordertype)
     {
     case 0:
         sprintf(btype, "BORDER_CONSTANT");
@@ -1307,15 +1348,17 @@ void linearColumnFilter_gpu(const oclMat &src, const oclMat &dst, oclMat mat_ker
         sprintf(btype, "BORDER_REFLECT_101");
         break;
     }
+
     char compile_option[256];
 
 
     size_t globalThreads[3];
     globalThreads[1] = (dst.rows + localThreads[1] - 1) / localThreads[1] * localThreads[1];
     globalThreads[2] = (1 + localThreads[2] - 1) / localThreads[2] * localThreads[2];
-    if(dst.depth() == CV_8U)
+
+    if (dst.depth() == CV_8U)
     {
-        switch(channels)
+        switch (channels)
         {
         case 1:
             globalThreads[0] = (dst.cols + localThreads[0] - 1) / localThreads[0] * localThreads[0];
@@ -1338,7 +1381,8 @@ void linearColumnFilter_gpu(const oclMat &src, const oclMat &dst, oclMat mat_ker
     else
     {
         globalThreads[0] = (dst.cols + localThreads[0] - 1) / localThreads[0] * localThreads[0];
-        switch(dst.type())
+
+        switch (dst.type())
         {
         case CV_32SC1:
             sprintf(compile_option, "-D RADIUSY=%d -D LSIZE0=%d -D LSIZE1=%d -D CN=%d -D %s -D GENTYPE_SRC=%s -D GENTYPE_DST=%s -D convert_to_DST=%s",
@@ -1441,20 +1485,25 @@ Ptr<FilterEngine_GPU> cv::ocl::createSeparableLinearFilter_GPU(int srcType, int
 
 void cv::ocl::sepFilter2D(const oclMat &src, oclMat &dst, int ddepth, const Mat &kernelX, const Mat &kernelY, Point anchor, double delta, int bordertype)
 {
-    if((dst.cols != dst.wholecols) || (dst.rows != dst.wholerows)) //has roi
+    if ((dst.cols != dst.wholecols) || (dst.rows != dst.wholerows)) //has roi
     {
-        if((bordertype & cv::BORDER_ISOLATED) != 0)
+        if ((bordertype & cv::BORDER_ISOLATED) != 0)
         {
             bordertype &= ~cv::BORDER_ISOLATED;
-            if((bordertype != cv::BORDER_CONSTANT) &&
+
+            if ((bordertype != cv::BORDER_CONSTANT) &&
                     (bordertype != cv::BORDER_REPLICATE))
             {
                 CV_Error(CV_StsBadArg, "unsupported border type");
             }
         }
     }
-    if( ddepth < 0 )
+
+    if (ddepth < 0)
+    {
         ddepth = src.depth();
+    }
+
     //CV_Assert(ddepth == src.depth());
     dst.create(src.size(), CV_MAKETYPE(ddepth, src.channels()));
 
@@ -1462,12 +1511,12 @@ void cv::ocl::sepFilter2D(const oclMat &src, oclMat &dst, int ddepth, const Mat
     f->apply(src, dst);
 }
 
-Ptr<FilterEngine_GPU> cv::ocl::createDerivFilter_GPU( int srcType, int dstType, int dx, int dy, int ksize, int borderType )
+Ptr<FilterEngine_GPU> cv::ocl::createDerivFilter_GPU(int srcType, int dstType, int dx, int dy, int ksize, int borderType)
 {
     Mat kx, ky;
-    getDerivKernels( kx, ky, dx, dy, ksize, false, CV_32F );
+    getDerivKernels(kx, ky, dx, dy, ksize, false, CV_32F);
     return createSeparableLinearFilter_GPU(srcType, dstType,
-                                           kx, ky, Point(-1, -1), 0, borderType );
+                                           kx, ky, Point(-1, -1), 0, borderType);
 }
 
 ////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -1482,15 +1531,20 @@ void cv::ocl::Sobel(const oclMat &src, oclMat &dst, int ddepth, int dx, int dy,
         // usually the smoothing part is the slowest to compute,
         // so try to scale it instead of the faster differenciating part
         if (dx == 0)
+        {
             kx *= scale;
+        }
         else
+        {
             ky *= scale;
+        }
     }
+
     // Mat kx_, ky_;
     //ky.convertTo(ky_,CV_32S,1<<8);
     //kx.convertTo(kx_,CV_32S,1<<8);
 
-    sepFilter2D(src, dst, ddepth, kx, ky, Point(-1, -1), delta, borderType );
+    sepFilter2D(src, dst, ddepth, kx, ky, Point(-1, -1), delta, borderType);
 }
 
 void cv::ocl::Scharr(const oclMat &src, oclMat &dst, int ddepth, int dx, int dy, double scale, double delta , int bordertype)
@@ -1498,14 +1552,18 @@ void cv::ocl::Scharr(const oclMat &src, oclMat &dst, int ddepth, int dx, int dy,
     Mat kx, ky;
     getDerivKernels(kx, ky, dx, dy, -1, false, CV_32F);
 
-    if( scale != 1 )
+    if (scale != 1)
     {
         // usually the smoothing part is the slowest to compute,
         // so try to scale it instead of the faster differenciating part
-        if( dx == 0 )
+        if (dx == 0)
+        {
             kx *= scale;
+        }
         else
+        {
             ky *= scale;
+        }
     }
 
     // Mat kx_, ky_;
@@ -1517,7 +1575,7 @@ void cv::ocl::Scharr(const oclMat &src, oclMat &dst, int ddepth, int dx, int dy,
 
 void cv::ocl::Laplacian(const oclMat &src, oclMat &dst, int ddepth, int ksize, double scale)
 {
-    if(src.clCxt -> impl -> double_support == 0 && src.type() == CV_64F)
+    if (src.clCxt -> impl -> double_support == 0 && src.type() == CV_64F)
     {
         CV_Error(CV_GpuNotSupported, "Selected device don't support double\r\n");
         return;
@@ -1531,8 +1589,12 @@ void cv::ocl::Laplacian(const oclMat &src, oclMat &dst, int ddepth, int ksize, d
         {2, 0, 2, 0, -8, 0, 2, 0, 2}
     };
     Mat kernel(3, 3, CV_32S, (void *)K[ksize == 3]);
+
     if (scale != 1)
+    {
         kernel *= scale;
+    }
+
     filter2D(src, dst, ddepth, kernel, Point(-1, -1));
 }
 
@@ -1544,25 +1606,38 @@ Ptr<FilterEngine_GPU> cv::ocl::createGaussianFilter_GPU(int type, Size ksize, do
     int depth = CV_MAT_DEPTH(type);
 
     if (sigma2 <= 0)
+    {
         sigma2 = sigma1;
+    }
 
     // automatic detection of kernel size from sigma
     if (ksize.width <= 0 && sigma1 > 0)
+    {
         ksize.width = cvRound(sigma1 * (depth == CV_8U ? 3 : 4) * 2 + 1) | 1;
-    if (ksize.height <= 0 && sigma2 > 0)
-        ksize.height = cvRound(sigma2 * (depth == CV_8U ? 3 : 4) * 2 + 1) | 1;
+    }
 
-    CV_Assert( ksize.width > 0 && ksize.width % 2 == 1 && ksize.height > 0 && ksize.height % 2 == 1 );
+    if (ksize.height <= 0 && sigma2 > 0)
+    {
+        ksize.height = cvRound(sigma2 * (depth == CV_8U ? 3 : 4) * 2 + 1) | 1;
+    }
+
+    CV_Assert(ksize.width > 0 && ksize.width % 2 == 1 && ksize.height > 0 && ksize.height % 2 == 1);
 
     sigma1 = std::max(sigma1, 0.0);
     sigma2 = std::max(sigma2, 0.0);
 
-    Mat kx = getGaussianKernel( ksize.width, sigma1, std::max(depth, CV_32F) );
+    Mat kx = getGaussianKernel(ksize.width, sigma1, std::max(depth, CV_32F));
     Mat ky;
-    if( ksize.height == ksize.width && std::abs(sigma1 - sigma2) < DBL_EPSILON )
+
+    if (ksize.height == ksize.width && std::abs(sigma1 - sigma2) < DBL_EPSILON)
+    {
         ky = kx;
+    }
     else
-        ky = getGaussianKernel( ksize.height, sigma2, std::max(depth, CV_32F) );
+    {
+        ky = getGaussianKernel(ksize.height, sigma2, std::max(depth, CV_32F));
+    }
+
     //Mat kx_, ky_;
     //kx.convertTo(kx_,CV_32S,1<<8);
     //ky.convertTo(ky_,CV_32S,1<<8);
@@ -1576,26 +1651,36 @@ void cv::ocl::GaussianBlur(const oclMat &src, oclMat &dst, Size ksize, double si
         src.copyTo(dst);
         return;
     }
-    if((dst.cols != dst.wholecols) || (dst.rows != dst.wholerows)) //has roi
+
+    if ((dst.cols != dst.wholecols) || (dst.rows != dst.wholerows)) //has roi
     {
-        if((bordertype & cv::BORDER_ISOLATED) != 0)
+        if ((bordertype & cv::BORDER_ISOLATED) != 0)
         {
             bordertype &= ~cv::BORDER_ISOLATED;
-            if((bordertype != cv::BORDER_CONSTANT) &&
+
+            if ((bordertype != cv::BORDER_CONSTANT) &&
                     (bordertype != cv::BORDER_REPLICATE))
             {
                 CV_Error(CV_StsBadArg, "unsupported border type");
             }
         }
     }
+
     dst.create(src.size(), src.type());
-    if( bordertype != BORDER_CONSTANT )
+
+    if (bordertype != BORDER_CONSTANT)
     {
-        if( src.rows == 1 )
+        if (src.rows == 1)
+        {
             ksize.height = 1;
-        if( src.cols == 1 )
+        }
+
+        if (src.cols == 1)
+        {
             ksize.width = 1;
+        }
     }
+
     Ptr<FilterEngine_GPU> f = createGaussianFilter_GPU(src.type(), ksize, sigma1, sigma2, bordertype);
     f->apply(src, dst);
 }