From d64fb235f75226c0df6d9bd5b812ce7abe4a63a7 Mon Sep 17 00:00:00 2001 From: Ilya Lavrenov Date: Tue, 24 Sep 2013 13:15:05 +0400 Subject: [PATCH 01/17] removed needless arithm_2_mat.cl file --- modules/ocl/src/opencl/arithm_2_mat.cl | 158 ------------------------- 1 file changed, 158 deletions(-) delete mode 100644 modules/ocl/src/opencl/arithm_2_mat.cl diff --git a/modules/ocl/src/opencl/arithm_2_mat.cl b/modules/ocl/src/opencl/arithm_2_mat.cl deleted file mode 100644 index 63c1ccac0..000000000 --- a/modules/ocl/src/opencl/arithm_2_mat.cl +++ /dev/null @@ -1,158 +0,0 @@ -/*M/////////////////////////////////////////////////////////////////////////////////////// -// -// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. -// -// By downloading, copying, installing or using the software you agree to this license. -// If you do not agree to this license, do not download, install, -// copy or use the software. -// -// -// License Agreement -// For Open Source Computer Vision Library -// -// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved. -// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved. -// Third party copyrights are property of their respective owners. -// -// @Authors -// Shengen Yan,yanshengen@gmail.com -// -// Redistribution and use in source and binary forms, with or without modification, -// are permitted provided that the following conditions are met: -// -// * Redistribution's of source code must retain the above copyright notice, -// this list of conditions and the following disclaimer. -// -// * Redistribution's in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other oclMaterials provided with the distribution. -// -// * The name of the copyright holders may not be used to endorse or promote products -// derived from this software without specific prior written permission. -// -// This software is provided by the copyright holders and contributors as is and -// any express or implied warranties, including, but not limited to, the implied -// warranties of merchantability and fitness for a particular purpose are disclaimed. -// In no event shall the Intel Corporation or contributors be liable for any direct, -// indirect, incidental, special, exemplary, or consequential damages -// (including, but not limited to, procurement of substitute goods or services; -// loss of use, data, or profits; or business interruption) however caused -// and on any theory of liability, whether in contract, strict liability, -// or tort (including negligence or otherwise) arising in any way out of -// the use of this software, even if advised of the possibility of such damage. -// -//M*/ - -/**************************************PUBLICFUNC*************************************/ -#if defined (DOUBLE_SUPPORT) -#pragma OPENCL EXTENSION cl_khr_fp64:enable -#endif - -#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics:enable -#pragma OPENCL EXTENSION cl_khr_global_int32_extended_atomics:enable -#define CV_PI 3.1415926535897932384626433832795 - -char round_char(double v){ - char v1=(char)v; - return convert_char_sat(v+(v>=0 ? 0.5 : -0.5)); -} -unsigned char round_uchar(double v){ - unsigned char v1=(unsigned char)v; - return convert_uchar_sat(v+(v>=0 ? 0.5 : -0.5)); -} -short round_short(double v){ - short v1=(short)v; - return convert_short_sat(v+(v>=0 ? 
0.5 : -0.5)); -} -unsigned short round_ushort(double v){ - unsigned short v1=(unsigned short)v; - return convert_ushort_sat(v+(v>=0 ? 0.5 : -0.5)); -} -int round_int(double v){ - int v1=(int)v; - return convert_int_sat(v+(v>=0 ? 0.5 : -0.5)); -} - -char round2_char(double v){ - char v1=(char)v; - if((v-v1)==0.5&&v1%2==0) - return v1; - else - return convert_char_sat(v+(v>=0 ? 0.5 : -0.5)); -} -unsigned char round2_uchar(double v){ - unsigned char v1=(unsigned char)v; - if((v-v1)==0.5&&v1%2==0) - return v1; - else - return convert_uchar_sat(v+(v>=0 ? 0.5 : -0.5)); -} -short round2_short(double v){ - short v1=(short)v; - if((v-v1)==0.5&&v1%2==0) - return v1; - else - return convert_short_sat(v+(v>=0 ? 0.5 : -0.5)); -} -unsigned short round2_ushort(double v){ - unsigned short v1=(unsigned short)v; - if((v-v1)==0.5&&v1%2==0) - return v1; - else - return convert_ushort_sat(v+(v>=0 ? 0.5 : -0.5)); -} -int round2_int(double v){ - int v1=(int)v; - if((v-v1)==0.5&&v1%2==0) - return v1; - else - return convert_int_sat(v+(v>=0 ? 0.5 : -0.5)); -} - -/*****************************************EXP***************************************/ -__kernel void arithm_op_exp_5 (int rows,int cols,int srcStep,__global float *src1Mat, - __global float * dstMat,int channels) -{ - size_t x = get_global_id(0); - size_t y = get_global_id(1); - if (x < cols && y < rows) - { - size_t idx = y * ( srcStep >> 2 ) + x; - dstMat[idx] = (float)exp((float)src1Mat[idx]); - } -} -__kernel void arithm_op_exp_6 (int rows,int cols,int srcStep,__global double *src1Mat, - __global double * dstMat,int channels) -{ - size_t x = get_global_id(0); - size_t y = get_global_id(1); - if (x < cols && y < rows) - { - size_t idx = y * ( srcStep >> 3 ) + x; - dstMat[idx] = exp(src1Mat[idx]); - } -} - -/*****************************************LOG***************************************/ -__kernel void arithm_op_log_5 (int rows,int cols,int srcStep,__global float *src1Mat, - __global float * dstMat,int channels) -{ - size_t x = get_global_id(0); - size_t y = get_global_id(1); - if (x < cols && y < rows) - { - size_t idx = y * ( srcStep >> 2 ) + x; - dstMat[idx] =(float) log((float)src1Mat[idx]); - } -} -__kernel void arithm_op_log_6 (int rows,int cols,int srcStep,__global double *src1Mat, - __global double * dstMat,int channels) -{ - size_t x = get_global_id(0); - size_t y = get_global_id(1); - if (x < cols && y < rows) - { - size_t idx = y * ( srcStep >> 3 ) + x; - dstMat[idx] = log(src1Mat[idx]); - } -} From 5ff5fdd73de147c014c4780e2aa90ea20209ed6f Mon Sep 17 00:00:00 2001 From: Ilya Lavrenov Date: Tue, 24 Sep 2013 13:17:09 +0400 Subject: [PATCH 02/17] marked some methods of ocl::Context as const --- modules/ocl/include/opencv2/ocl/ocl.hpp | 4 ++-- modules/ocl/src/initialization.cpp | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/modules/ocl/include/opencv2/ocl/ocl.hpp b/modules/ocl/include/opencv2/ocl/ocl.hpp index dc9183acb..361e29251 100644 --- a/modules/ocl/include/opencv2/ocl/ocl.hpp +++ b/modules/ocl/include/opencv2/ocl/ocl.hpp @@ -158,8 +158,8 @@ namespace cv static void setContext(Info &oclinfo); enum {CL_DOUBLE, CL_UNIFIED_MEM, CL_VER_1_2}; - bool supportsFeature(int ftype); - size_t computeUnits(); + bool supportsFeature(int ftype) const; + size_t computeUnits() const; void* oclContext(); void* oclCommandQueue(); }; diff --git a/modules/ocl/src/initialization.cpp b/modules/ocl/src/initialization.cpp index 34d5ff5e6..c18984b07 100644 --- a/modules/ocl/src/initialization.cpp +++ b/modules/ocl/src/initialization.cpp @@ 
-1013,7 +1013,7 @@ namespace cv programCache->releaseProgram(); } - bool Context::supportsFeature(int ftype) + bool Context::supportsFeature(int ftype) const { switch(ftype) { @@ -1028,7 +1028,7 @@ namespace cv } } - size_t Context::computeUnits() + size_t Context::computeUnits() const { return impl->maxComputeUnits; } From 0ad03162dff9dc500fb1d80cdf33e7c9e00cc961 Mon Sep 17 00:00:00 2001 From: Ilya Lavrenov Date: Tue, 24 Sep 2013 13:34:55 +0400 Subject: [PATCH 03/17] refactored and extended arithm operations add/sub/mul/div/absdiff --- modules/ocl/include/opencv2/ocl/ocl.hpp | 19 +- modules/ocl/src/arithm.cpp | 455 +++------- modules/ocl/src/opencl/arithm_add.cl | 812 ++---------------- modules/ocl/src/opencl/arithm_add_mask.cl | 79 ++ modules/ocl/src/opencl/arithm_add_scalar.cl | 464 +--------- .../ocl/src/opencl/arithm_add_scalar_mask.cl | 563 +----------- modules/ocl/src/opencl/arithm_div.cl | 468 ---------- modules/ocl/src/opencl/arithm_mul.cl | 303 ------- 8 files changed, 307 insertions(+), 2856 deletions(-) create mode 100644 modules/ocl/src/opencl/arithm_add_mask.cl delete mode 100644 modules/ocl/src/opencl/arithm_div.cl delete mode 100644 modules/ocl/src/opencl/arithm_mul.cl diff --git a/modules/ocl/include/opencv2/ocl/ocl.hpp b/modules/ocl/include/opencv2/ocl/ocl.hpp index 361e29251..2bfc7db45 100644 --- a/modules/ocl/include/opencv2/ocl/ocl.hpp +++ b/modules/ocl/include/opencv2/ocl/ocl.hpp @@ -409,40 +409,37 @@ namespace cv CV_EXPORTS void split(const oclMat &src, vector &dst); ////////////////////////////// Arithmetics /////////////////////////////////// + //#if defined DOUBLE_SUPPORT //typedef double F; //#else //typedef float F; //#endif + // CV_EXPORTS void addWeighted(const oclMat& a,F alpha, const oclMat& b,F beta,F gama, oclMat& c); CV_EXPORTS void addWeighted(const oclMat &a, double alpha, const oclMat &b, double beta, double gama, oclMat &c); + //! adds one matrix to another (c = a + b) // supports all types except CV_8SC1,CV_8SC2,CV8SC3 and CV_8SC4 - CV_EXPORTS void add(const oclMat &a, const oclMat &b, oclMat &c); - //! adds one matrix to another (c = a + b) - // supports all types except CV_8SC1,CV_8SC2,CV8SC3 and CV_8SC4 - CV_EXPORTS void add(const oclMat &a, const oclMat &b, oclMat &c, const oclMat &mask); + CV_EXPORTS void add(const oclMat &a, const oclMat &b, oclMat &c, const oclMat &mask = oclMat()); //! adds scalar to a matrix (c = a + s) // supports all types except CV_8SC1,CV_8SC2,CV8SC3 and CV_8SC4 CV_EXPORTS void add(const oclMat &a, const Scalar &sc, oclMat &c, const oclMat &mask = oclMat()); + //! subtracts one matrix from another (c = a - b) // supports all types except CV_8SC1,CV_8SC2,CV8SC3 and CV_8SC4 - CV_EXPORTS void subtract(const oclMat &a, const oclMat &b, oclMat &c); - //! subtracts one matrix from another (c = a - b) - // supports all types except CV_8SC1,CV_8SC2,CV8SC3 and CV_8SC4 - CV_EXPORTS void subtract(const oclMat &a, const oclMat &b, oclMat &c, const oclMat &mask); + CV_EXPORTS void subtract(const oclMat &a, const oclMat &b, oclMat &c, const oclMat &mask = oclMat()); //! subtracts scalar from a matrix (c = a - s) // supports all types except CV_8SC1,CV_8SC2,CV8SC3 and CV_8SC4 CV_EXPORTS void subtract(const oclMat &a, const Scalar &sc, oclMat &c, const oclMat &mask = oclMat()); - //! subtracts scalar from a matrix (c = a - s) - // supports all types except CV_8SC1,CV_8SC2,CV8SC3 and CV_8SC4 - CV_EXPORTS void subtract(const Scalar &sc, const oclMat &a, oclMat &c, const oclMat &mask = oclMat()); + //! 
computes element-wise product of the two arrays (c = a * b) // supports all types except CV_8SC1,CV_8SC2,CV8SC3 and CV_8SC4 CV_EXPORTS void multiply(const oclMat &a, const oclMat &b, oclMat &c, double scale = 1); //! multiplies matrix to a number (dst = scalar * src) // supports CV_32FC1 only CV_EXPORTS void multiply(double scalar, const oclMat &src, oclMat &dst); + //! computes element-wise quotient of the two arrays (c = a / b) // supports all types except CV_8SC1,CV_8SC2,CV8SC3 and CV_8SC4 CV_EXPORTS void divide(const oclMat &a, const oclMat &b, oclMat &c, double scale = 1); diff --git a/modules/ocl/src/arithm.cpp b/modules/ocl/src/arithm.cpp index 0cc803d19..03c314c7c 100644 --- a/modules/ocl/src/arithm.cpp +++ b/modules/ocl/src/arithm.cpp @@ -62,11 +62,11 @@ namespace cv { namespace ocl { - ////////////////////////////////OpenCL kernel strings///////////////////// + //////////////////////////////// OpenCL kernel strings ///////////////////// + extern const char *transpose_kernel; extern const char *arithm_nonzero; extern const char *arithm_sum; - extern const char *arithm_2_mat; extern const char *arithm_sum_3; extern const char *arithm_minMax; extern const char *arithm_minMax_mask; @@ -74,6 +74,7 @@ namespace cv extern const char *arithm_minMaxLoc_mask; extern const char *arithm_LUT; extern const char *arithm_add; + extern const char *arithm_add_mask; extern const char *arithm_add_scalar; extern const char *arithm_add_scalar_mask; extern const char *arithm_bitwise_binary; @@ -83,9 +84,7 @@ namespace cv extern const char *arithm_bitwise_not; extern const char *arithm_compare_eq; extern const char *arithm_compare_ne; - extern const char *arithm_mul; - extern const char *arithm_div; - extern const char *arithm_absdiff; + extern const char *arithm_magnitudeSqr; extern const char *arithm_transpose; extern const char *arithm_flip; extern const char *arithm_flip_rc; @@ -97,390 +96,176 @@ namespace cv extern const char *arithm_addWeighted; extern const char *arithm_phase; extern const char *arithm_pow; - extern const char *arithm_magnitudeSqr; extern const char *arithm_setidentity; - //extern const char * jhp_transpose_kernel; - int64 kernelrealtotal = 0; - int64 kernelalltotal = 0; - int64 reducetotal = 0; - int64 downloadtotal = 0; - int64 alltotal = 0; } } +////////////////////////////////////////////////////////////////////////////// +/////////////////////// add subtract multiply divide ///////////////////////// +////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////// /////////////////////// add subtract multiply divide ///////////////////////// ////////////////////////////////////////////////////////////////////////////// -template -void arithmetic_run(const oclMat &src1, const oclMat &src2, oclMat &dst, - string kernelName, const char **kernelString, void *_scalar, int op_type = 0) + +enum { ADD = 0, SUB, MUL, DIV, ABS_DIFF }; + +static void arithmetic_run_generic(const oclMat &src1, const oclMat &src2, const Scalar & scalar, const oclMat & mask, + oclMat &dst, int op_type, bool use_scalar = false) { - if(!src1.clCxt->supportsFeature(Context::CL_DOUBLE) && src1.type() == CV_64F) + Context *clCxt = src1.clCxt; + bool hasDouble = clCxt->supportsFeature(Context::CL_DOUBLE); + if (!hasDouble && (src1.depth() == CV_64F || src2.depth() == CV_64F || dst.depth() == CV_64F)) { - CV_Error(CV_GpuNotSupported, "Selected device don't support double\r\n"); + CV_Error(CV_GpuNotSupported, "Selected 
device doesn't support double\r\n"); return; } - dst.create(src1.size(), src1.type()); - CV_Assert(src1.cols == src2.cols && src2.cols == dst.cols && - src1.rows == src2.rows && src2.rows == dst.rows); - - CV_Assert(src1.type() == src2.type() && src1.type() == dst.type()); - CV_Assert(src1.depth() != CV_8S); - - Context *clCxt = src1.clCxt; - int channels = dst.oclchannels(); - int depth = dst.depth(); - - int vector_lengths[4][7] = {{4, 0, 4, 4, 1, 1, 1}, - {4, 0, 4, 4, 1, 1, 1}, - {4, 0, 4, 4, 1, 1, 1}, - {4, 0, 4, 4, 1, 1, 1} - }; - - size_t vector_length = vector_lengths[channels - 1][depth]; - int offset_cols = (dst.offset / dst.elemSize1()) & (vector_length - 1); - int cols = divUp(dst.cols * channels + offset_cols, vector_length); - - size_t localThreads[3] = { 64, 4, 1 }; - size_t globalThreads[3] = { cols, dst.rows, 1 }; - - int dst_step1 = dst.cols * dst.elemSize(); - vector > args; - args.push_back( make_pair( sizeof(cl_mem), (void *)&src1.data )); - args.push_back( make_pair( sizeof(cl_int), (void *)&src1.step )); - args.push_back( make_pair( sizeof(cl_int), (void *)&src1.offset )); - args.push_back( make_pair( sizeof(cl_mem), (void *)&src2.data )); - args.push_back( make_pair( sizeof(cl_int), (void *)&src2.step )); - args.push_back( make_pair( sizeof(cl_int), (void *)&src2.offset )); - args.push_back( make_pair( sizeof(cl_mem), (void *)&dst.data )); - args.push_back( make_pair( sizeof(cl_int), (void *)&dst.step )); - args.push_back( make_pair( sizeof(cl_int), (void *)&dst.offset )); - args.push_back( make_pair( sizeof(cl_int), (void *)&src1.rows )); - args.push_back( make_pair( sizeof(cl_int), (void *)&cols )); - args.push_back( make_pair( sizeof(cl_int), (void *)&dst_step1 )); - T scalar; - if(_scalar != NULL) - { - double scalar1 = *((double *)_scalar); - scalar = (T)scalar1; - args.push_back( make_pair( sizeof(T), (void *)&scalar )); - } - switch(op_type) - { - case MAT_ADD: - openCLExecuteKernel(clCxt, kernelString, kernelName, globalThreads, localThreads, args, -1, depth, "-D ARITHM_ADD"); - break; - case MAT_SUB: - openCLExecuteKernel(clCxt, kernelString, kernelName, globalThreads, localThreads, args, -1, depth, "-D ARITHM_SUB"); - break; - default: - openCLExecuteKernel(clCxt, kernelString, kernelName, globalThreads, localThreads, args, -1, depth); - } -} -static void arithmetic_run(const oclMat &src1, const oclMat &src2, oclMat &dst, - string kernelName, const char **kernelString, int op_type = 0) -{ - arithmetic_run(src1, src2, dst, kernelName, kernelString, (void *)NULL, op_type); -} -static void arithmetic_run(const oclMat &src1, const oclMat &src2, oclMat &dst, const oclMat &mask, - string kernelName, const char **kernelString, int op_type = 0) -{ - if(!src1.clCxt->supportsFeature(Context::CL_DOUBLE) && src1.type() == CV_64F) - { - CV_Error(CV_GpuNotSupported, "Selected device don't support double\r\n"); - return; - } + CV_Assert(src2.empty() || (!src2.empty() && src1.type() == src2.type() && src1.size() == src2.size())); + CV_Assert(mask.empty() || (!mask.empty() && mask.type() == CV_8UC1 && mask.size() == src1.size())); + CV_Assert(op_type >= ADD && op_type <= ABS_DIFF); dst.create(src1.size(), src1.type()); - CV_Assert(src1.cols == src2.cols && src2.cols == dst.cols && - src1.rows == src2.rows && src2.rows == dst.rows && - src1.rows == mask.rows && src1.cols == mask.cols); - CV_Assert(src1.type() == src2.type() && src1.type() == dst.type()); - CV_Assert(src1.depth() != CV_8S); - CV_Assert(mask.type() == CV_8U); + int oclChannels = src1.oclchannels(), depth = 
src1.depth(); + int src1step1 = src1.step / src1.elemSize(), src1offset1 = src1.offset / src1.elemSize(); + int src2step1 = src2.step / src2.elemSize(), src2offset1 = src2.offset / src2.elemSize(); + int maskstep1 = mask.step, maskoffset1 = mask.offset / mask.elemSize(); + int dststep1 = dst.step / dst.elemSize(), dstoffset1 = dst.offset / dst.elemSize(); + oclMat m; - Context *clCxt = src1.clCxt; - int channels = dst.oclchannels(); - int depth = dst.depth(); + size_t localThreads[3] = { 16, 16, 1 }; + size_t globalThreads[3] = { dst.cols, dst.rows, 1 }; - int vector_lengths[4][7] = {{4, 4, 2, 2, 1, 1, 1}, - {2, 2, 1, 1, 1, 1, 1}, - {4, 4, 2, 2 , 1, 1, 1}, - {1, 1, 1, 1, 1, 1, 1} - }; + std::string kernelName = op_type == ABS_DIFF ? "arithm_absdiff" : "arithm_binary_op"; - size_t vector_length = vector_lengths[channels - 1][depth]; - int offset_cols = ((dst.offset % dst.step) / dst.elemSize()) & (vector_length - 1); - int cols = divUp(dst.cols + offset_cols, vector_length); + const char * const typeMap[] = { "uchar", "char", "ushort", "short", "int", "float", "double" }; + const char * const WTypeMap[] = { "short", "short", "int", "int", "int", "float", "double" }; + const char operationsMap[] = { '+', '-', '*', '/', '-' }; + const char * const channelMap[] = { "", "", "2", "4", "4" }; + bool haveScalar = use_scalar || src2.empty(); - size_t localThreads[3] = { 64, 4, 1 }; - size_t globalThreads[3] = { cols, dst.rows, 1 }; + int WDepth = depth; + if (haveScalar) + WDepth = hasDouble && WDepth == CV_64F ? CV_64F : CV_32F; + if (op_type == DIV) + WDepth = hasDouble ? CV_64F : CV_32F; + else if (op_type == MUL) + WDepth = hasDouble && (depth == CV_32S || depth == CV_64F) ? CV_64F : CV_32F; + + std::string buildOptions = format("-D T=%s%s -D WT=%s%s -D convertToT=convert_%s%s%s -D Operation=%c" + " -D convertToWT=convert_%s%s", + typeMap[depth], channelMap[oclChannels], + WTypeMap[WDepth], channelMap[oclChannels], + typeMap[depth], channelMap[oclChannels], (depth >= CV_32F ? "" : (depth == CV_32S ? 
"_rte" : "_sat_rte")), + operationsMap[op_type], WTypeMap[WDepth], channelMap[oclChannels]); - int dst_step1 = dst.cols * dst.elemSize(); vector > args; args.push_back( make_pair( sizeof(cl_mem), (void *)&src1.data )); - args.push_back( make_pair( sizeof(cl_int), (void *)&src1.step )); - args.push_back( make_pair( sizeof(cl_int), (void *)&src1.offset )); - args.push_back( make_pair( sizeof(cl_mem), (void *)&src2.data )); - args.push_back( make_pair( sizeof(cl_int), (void *)&src2.step )); - args.push_back( make_pair( sizeof(cl_int), (void *)&src2.offset )); - args.push_back( make_pair( sizeof(cl_mem), (void *)&mask.data )); - args.push_back( make_pair( sizeof(cl_int), (void *)&mask.step )); - args.push_back( make_pair( sizeof(cl_int), (void *)&mask.offset )); - args.push_back( make_pair( sizeof(cl_mem), (void *)&dst.data )); - args.push_back( make_pair( sizeof(cl_int), (void *)&dst.step )); - args.push_back( make_pair( sizeof(cl_int), (void *)&dst.offset )); - args.push_back( make_pair( sizeof(cl_int), (void *)&src1.rows )); - args.push_back( make_pair( sizeof(cl_int), (void *)&cols )); - args.push_back( make_pair( sizeof(cl_int), (void *)&dst_step1 )); + args.push_back( make_pair( sizeof(cl_int), (void *)&src1step1 )); + args.push_back( make_pair( sizeof(cl_int), (void *)&src1offset1 )); - switch (op_type) + if (!src2.empty()) { - case MAT_ADD: - openCLExecuteKernel(clCxt, kernelString, kernelName, globalThreads, localThreads, args, channels, depth, "-D ARITHM_ADD"); - break; - case MAT_SUB: - openCLExecuteKernel(clCxt, kernelString, kernelName, globalThreads, localThreads, args, channels, depth, "-D ARITHM_SUB"); - break; - default: - openCLExecuteKernel(clCxt, kernelString, kernelName, globalThreads, localThreads, args, channels, depth); + args.push_back( make_pair( sizeof(cl_mem), (void *)&src2.data )); + args.push_back( make_pair( sizeof(cl_int), (void *)&src2step1 )); + args.push_back( make_pair( sizeof(cl_int), (void *)&src2offset1 )); + + kernelName += "_mat"; } + + if (haveScalar) + { + const int WDepthMap[] = { CV_16S, CV_16S, CV_32S, CV_32S, CV_32S, CV_32F, CV_64F }; + m.create(1, 1, CV_MAKE_TYPE(WDepthMap[WDepth], oclChannels)); + m.setTo(scalar); + + args.push_back( make_pair( sizeof(cl_mem), (void *)&m.data )); + + kernelName += "_scalar"; + } + + if (!mask.empty()) + { + args.push_back( make_pair( sizeof(cl_mem), (void *)&mask.data )); + args.push_back( make_pair( sizeof(cl_int), (void *)&maskstep1 )); + args.push_back( make_pair( sizeof(cl_int), (void *)&maskoffset1 )); + + kernelName += "_mask"; + } + + if (op_type == DIV) + kernelName += "_div"; + + args.push_back( make_pair( sizeof(cl_mem), (void *)&dst.data )); + args.push_back( make_pair( sizeof(cl_int), (void *)&dststep1 )); + args.push_back( make_pair( sizeof(cl_int), (void *)&dstoffset1 )); + + args.push_back( make_pair( sizeof(cl_int), (void *)&src1.cols )); + args.push_back( make_pair( sizeof(cl_int), (void *)&src1.rows )); + + openCLExecuteKernel(clCxt, mask.empty() ? + (!src2.empty() ? &arithm_add : &arithm_add_scalar) : + (!src2.empty() ? 
&arithm_add_mask : &arithm_add_scalar_mask), + kernelName, globalThreads, localThreads, + args, -1, -1, buildOptions.c_str()); } -void cv::ocl::add(const oclMat &src1, const oclMat &src2, oclMat &dst) -{ - arithmetic_run(src1, src2, dst, "arithm_add", &arithm_add, MAT_ADD); -} + void cv::ocl::add(const oclMat &src1, const oclMat &src2, oclMat &dst, const oclMat &mask) { - arithmetic_run(src1, src2, dst, mask, "arithm_add_with_mask", &arithm_add, MAT_ADD); -} - -void cv::ocl::subtract(const oclMat &src1, const oclMat &src2, oclMat &dst) -{ - arithmetic_run(src1, src2, dst, "arithm_add", &arithm_add, MAT_SUB); -} -void cv::ocl::subtract(const oclMat &src1, const oclMat &src2, oclMat &dst, const oclMat &mask) -{ - arithmetic_run(src1, src2, dst, mask, "arithm_add_with_mask", &arithm_add, MAT_SUB); -} -typedef void (*MulDivFunc)(const oclMat &src1, const oclMat &src2, oclMat &dst, string kernelName, - const char **kernelString, void *scalar); - -void cv::ocl::multiply(const oclMat &src1, const oclMat &src2, oclMat &dst, double scalar) -{ - if(src1.clCxt->supportsFeature(Context::CL_DOUBLE) && (src1.depth() == CV_64F)) - arithmetic_run(src1, src2, dst, "arithm_mul", &arithm_mul, (void *)(&scalar)); - else - arithmetic_run(src1, src2, dst, "arithm_mul", &arithm_mul, (void *)(&scalar)); -} - -void cv::ocl::divide(const oclMat &src1, const oclMat &src2, oclMat &dst, double scalar) -{ - - if(src1.clCxt->supportsFeature(Context::CL_DOUBLE)) - arithmetic_run(src1, src2, dst, "arithm_div", &arithm_div, (void *)(&scalar)); - else - arithmetic_run(src1, src2, dst, "arithm_div", &arithm_div, (void *)(&scalar)); - -} -template -void arithmetic_scalar_run(const oclMat &src1, const Scalar &src2, oclMat &dst, const oclMat &mask, string kernelName, const char **kernelString, int isMatSubScalar) -{ - if(!src1.clCxt->supportsFeature(Context::CL_DOUBLE) && src1.type() == CV_64F) - { - CV_Error(CV_GpuNotSupported, "Selected device don't support double\r\n"); - return; - } - - dst.create(src1.size(), src1.type()); - - CV_Assert(src1.cols == dst.cols && src1.rows == dst.rows && - src1.type() == dst.type()); - - //CV_Assert(src1.depth() != CV_8S); - - if(mask.data) - { - CV_Assert(mask.type() == CV_8U && src1.rows == mask.rows && src1.cols == mask.cols); - } - - Context *clCxt = src1.clCxt; - int channels = dst.oclchannels(); - int depth = dst.depth(); - - WT s[4] = { saturate_cast(src2.val[0]), saturate_cast(src2.val[1]), - saturate_cast(src2.val[2]), saturate_cast(src2.val[3]) - }; - - int vector_lengths[4][7] = {{4, 0, 2, 2, 1, 1, 1}, - {2, 0, 1, 1, 1, 1, 1}, - {4, 0, 2, 2 , 1, 1, 1}, - {1, 0, 1, 1, 1, 1, 1} - }; - - size_t vector_length = vector_lengths[channels - 1][depth]; - int offset_cols = ((dst.offset % dst.step) / dst.elemSize()) & (vector_length - 1); - int cols = divUp(dst.cols + offset_cols, vector_length); - - size_t localThreads[3] = { 64, 4, 1 }; - size_t globalThreads[3] = { cols, dst.rows, 1 }; - - int dst_step1 = dst.cols * dst.elemSize(); - vector > args; - args.push_back( make_pair( sizeof(cl_mem) , (void *)&src1.data )); - args.push_back( make_pair( sizeof(cl_int) , (void *)&src1.step )); - args.push_back( make_pair( sizeof(cl_int) , (void *)&src1.offset)); - args.push_back( make_pair( sizeof(cl_mem) , (void *)&dst.data )); - args.push_back( make_pair( sizeof(cl_int) , (void *)&dst.step )); - args.push_back( make_pair( sizeof(cl_int) , (void *)&dst.offset)); - - if(mask.data) - { - args.push_back( make_pair( sizeof(cl_mem) , (void *)&mask.data )); - args.push_back( make_pair( sizeof(cl_int) , 
(void *)&mask.step )); - args.push_back( make_pair( sizeof(cl_int) , (void *)&mask.offset)); - } - args.push_back( make_pair( sizeof(CL_WT) , (void *)&s )); - args.push_back( make_pair( sizeof(cl_int) , (void *)&src1.rows )); - args.push_back( make_pair( sizeof(cl_int) , (void *)&cols )); - args.push_back( make_pair( sizeof(cl_int) , (void *)&dst_step1 )); - if(isMatSubScalar != 0) - openCLExecuteKernel(clCxt, kernelString, kernelName, globalThreads, localThreads, args, channels, depth, "-D ARITHM_SUB"); - else - openCLExecuteKernel(clCxt, kernelString, kernelName, globalThreads, localThreads, args, channels, depth, "-D ARITHM_ADD"); -} - -static void arithmetic_scalar_run(const oclMat &src, oclMat &dst, string kernelName, const char **kernelString, double scalar) -{ - if(!src.clCxt->supportsFeature(Context::CL_DOUBLE) && src.type() == CV_64F) - { - CV_Error(CV_GpuNotSupported, "Selected device don't support double\r\n"); - return; - } - - dst.create(src.size(), src.type()); - CV_Assert(src.cols == dst.cols && src.rows == dst.rows); - - CV_Assert(src.type() == dst.type()); - CV_Assert(src.depth() != CV_8S); - - Context *clCxt = src.clCxt; - int channels = dst.oclchannels(); - int depth = dst.depth(); - - int vector_lengths[4][7] = {{4, 0, 4, 4, 1, 1, 1}, - {4, 0, 4, 4, 1, 1, 1}, - {4, 0, 4, 4 , 1, 1, 1}, - {4, 0, 4, 4, 1, 1, 1} - }; - - size_t vector_length = vector_lengths[channels - 1][depth]; - int offset_cols = (dst.offset / dst.elemSize1()) & (vector_length - 1); - int cols = divUp(dst.cols * channels + offset_cols, vector_length); - - size_t localThreads[3] = { 64, 4, 1 }; - size_t globalThreads[3] = { cols, dst.rows, 1 }; - - int dst_step1 = dst.cols * dst.elemSize(); - vector > args; - args.push_back( make_pair( sizeof(cl_mem), (void *)&src.data )); - args.push_back( make_pair( sizeof(cl_int), (void *)&src.step )); - args.push_back( make_pair( sizeof(cl_int), (void *)&src.offset )); - args.push_back( make_pair( sizeof(cl_mem), (void *)&dst.data )); - args.push_back( make_pair( sizeof(cl_int), (void *)&dst.step )); - args.push_back( make_pair( sizeof(cl_int), (void *)&dst.offset )); - args.push_back( make_pair( sizeof(cl_int), (void *)&src.rows )); - args.push_back( make_pair( sizeof(cl_int), (void *)&cols )); - args.push_back( make_pair( sizeof(cl_int), (void *)&dst_step1 )); - - float f_scalar = (float)scalar; - if(src.clCxt->supportsFeature(Context::CL_DOUBLE)) - args.push_back( make_pair( sizeof(cl_double), (void *)&scalar )); - else - { - args.push_back( make_pair( sizeof(cl_float), (void *)&f_scalar)); - } - - openCLExecuteKernel(clCxt, kernelString, kernelName, globalThreads, localThreads, args, -1, depth); -} - -typedef void (*ArithmeticFuncS)(const oclMat &src1, const Scalar &src2, oclMat &dst, const oclMat &mask, string kernelName, const char **kernelString, int isMatSubScalar); - - -static void arithmetic_scalar(const oclMat &src1, const Scalar &src2, oclMat &dst, const oclMat &mask, string kernelName, const char **kernelString, int isMatSubScalar) -{ - static ArithmeticFuncS tab[8] = - { - arithmetic_scalar_run, - arithmetic_scalar_run, - arithmetic_scalar_run, - arithmetic_scalar_run, - arithmetic_scalar_run, - arithmetic_scalar_run, - arithmetic_scalar_run, - 0 - }; - ArithmeticFuncS func = tab[src1.depth()]; - if(func == 0) - cv::ocl::error("Unsupported arithmetic operation", __FILE__, __LINE__); - func(src1, src2, dst, mask, kernelName, kernelString, isMatSubScalar); -} -static void arithmetic_scalar(const oclMat &src1, const Scalar &src2, oclMat &dst, const oclMat 
&mask, string kernelName, const char **kernelString) -{ - arithmetic_scalar(src1, src2, dst, mask, kernelName, kernelString, 0); + arithmetic_run_generic(src1, src2, Scalar(), mask, dst, ADD); } void cv::ocl::add(const oclMat &src1, const Scalar &src2, oclMat &dst, const oclMat &mask) { - string kernelName = mask.data ? "arithm_s_add_with_mask" : "arithm_s_add"; - const char **kernelString = mask.data ? &arithm_add_scalar_mask : &arithm_add_scalar; + arithmetic_run_generic(src1, oclMat(), src2, mask, dst, ADD); +} - arithmetic_scalar( src1, src2, dst, mask, kernelName, kernelString); +void cv::ocl::subtract(const oclMat &src1, const oclMat &src2, oclMat &dst, const oclMat &mask) +{ + arithmetic_run_generic(src1, src2, Scalar(), mask, dst, SUB); } void cv::ocl::subtract(const oclMat &src1, const Scalar &src2, oclMat &dst, const oclMat &mask) { - string kernelName = mask.data ? "arithm_s_add_with_mask" : "arithm_s_add"; - const char **kernelString = mask.data ? &arithm_add_scalar_mask : &arithm_add_scalar; - arithmetic_scalar( src1, src2, dst, mask, kernelName, kernelString, 1); + arithmetic_run_generic(src1, oclMat(), src2, mask, dst, SUB); } -void cv::ocl::subtract(const Scalar &src2, const oclMat &src1, oclMat &dst, const oclMat &mask) + +void cv::ocl::multiply(const oclMat &src1, const oclMat &src2, oclMat &dst, double scalar) { - string kernelName = mask.data ? "arithm_s_add_with_mask" : "arithm_s_add"; - const char **kernelString = mask.data ? &arithm_add_scalar_mask : &arithm_add_scalar; - arithmetic_scalar( src1, src2, dst, mask, kernelName, kernelString, -1); + const bool use_scalar = !(std::abs(scalar - 1.0) < std::numeric_limits::epsilon()); + arithmetic_run_generic(src1, src2, Scalar::all(scalar), oclMat(), dst, MUL, use_scalar); } + void cv::ocl::multiply(double scalar, const oclMat &src, oclMat &dst) { - string kernelName = "arithm_muls"; - arithmetic_scalar_run( src, dst, kernelName, &arithm_mul, scalar); + arithmetic_run_generic(src, oclMat(), Scalar::all(scalar), oclMat(), dst, MUL); } -void cv::ocl::divide(double scalar, const oclMat &src, oclMat &dst) -{ - if(!src.clCxt->supportsFeature(Context::CL_DOUBLE)) - { - CV_Error(CV_GpuNotSupported, "Selected device don't support double\r\n"); - return; - } - string kernelName = "arithm_s_div"; - arithmetic_scalar_run(src, dst, kernelName, &arithm_div, scalar); +void cv::ocl::divide(const oclMat &src1, const oclMat &src2, oclMat &dst, double scalar) +{ + const bool use_scalar = !(std::abs(scalar - 1.0) < std::numeric_limits::epsilon()); + arithmetic_run_generic(src1, src2, Scalar::all(scalar), oclMat(), dst, DIV, use_scalar); } + +void cv::ocl::divide(double scalar, const oclMat &src, oclMat &dst) +{ + arithmetic_run_generic(src, oclMat(), Scalar::all(scalar), oclMat(), dst, DIV); +} + ////////////////////////////////////////////////////////////////////////////// ///////////////////////////////// Absdiff /////////////////////////////////// ////////////////////////////////////////////////////////////////////////////// + void cv::ocl::absdiff(const oclMat &src1, const oclMat &src2, oclMat &dst) { - arithmetic_run(src1, src2, dst, "arithm_absdiff", &arithm_absdiff); + arithmetic_run_generic(src1, src2, Scalar(), oclMat(), dst, ABS_DIFF); } + void cv::ocl::absdiff(const oclMat &src1, const Scalar &src2, oclMat &dst) { - string kernelName = "arithm_s_absdiff"; - oclMat mask; - arithmetic_scalar( src1, src2, dst, mask, kernelName, &arithm_absdiff); + arithmetic_run_generic(src1, oclMat(), src2, oclMat(), dst, ABS_DIFF); } + 
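The refactoring above collapses the masked and unmasked add/subtract overloads into single functions with a default empty mask, and routes add, subtract, multiply, divide and absdiff through the one generic runner arithmetic_run_generic. A minimal, hypothetical usage sketch of the consolidated host-side API (not part of the patch; it assumes an OpenCL device/context has already been selected elsewhere, e.g. via cv::ocl::getDevice) could look like this:

#include "opencv2/ocl/ocl.hpp"

// Exercises the consolidated arithmetic entry points introduced by this patch.
// maskHost is expected to be CV_8UC1, as asserted by arithmetic_run_generic.
void arithm_usage_sketch(const cv::Mat &a, const cv::Mat &b, const cv::Mat &maskHost)
{
    cv::ocl::oclMat da(a), db(b), dmask(maskHost);   // upload inputs to the device
    cv::ocl::oclMat dc;

    cv::ocl::add(da, db, dc);                        // c = a + b, mask defaults to oclMat()
    cv::ocl::add(da, db, dc, dmask);                 // same overload, now with a mask
    cv::ocl::subtract(da, cv::Scalar::all(5), dc);   // c = a - 5 (scalar operand)
    cv::ocl::multiply(da, db, dc, 0.5);              // c = 0.5 * a * b (non-unit scale selects the *_scalar kernels)
    cv::ocl::divide(da, db, dc);                     // c = a / b (zero divisors yield 0 in the new kernels)
    cv::ocl::absdiff(da, db, dc);                    // |a - b|, routed through the same generic runner

    cv::Mat result;
    dc.download(result);                             // read the result back to host memory
}

The function name arithm_usage_sketch and the host matrices are illustrative only; the cv::ocl calls match the signatures declared in ocl.hpp after this change.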
////////////////////////////////////////////////////////////////////////////// ///////////////////////////////// compare /////////////////////////////////// ////////////////////////////////////////////////////////////////////////////// diff --git a/modules/ocl/src/opencl/arithm_add.cl b/modules/ocl/src/opencl/arithm_add.cl index 070ced473..38834e766 100644 --- a/modules/ocl/src/opencl/arithm_add.cl +++ b/modules/ocl/src/opencl/arithm_add.cl @@ -52,809 +52,105 @@ #endif #endif -#ifdef ARITHM_ADD - #define ARITHM_OP(A,B) ((A)+(B)) -#elif defined ARITHM_SUB - #define ARITHM_OP(A,B) ((A)-(B)) -#endif ////////////////////////////////////////////////////////////////////////////////////////////////////// -/////////////////////////////////////////////ADD//////////////////////////////////////////////////// -/////////////////////////////////////////////////////////////////////////////////////////////////////// -/**************************************add without mask**************************************/ -__kernel void arithm_add_D0 (__global uchar *src1, int src1_step, int src1_offset, - __global uchar *src2, int src2_step, int src2_offset, - __global uchar *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) +///////////////////////////////////////////// ADD //////////////////////////////////////////////////// +////////////////////////////////////////////////////////////////////////////////////////////////////// + +__kernel void arithm_binary_op_mat(__global T *src1, int src1_step, int src1_offset, + __global T *src2, int src2_step, int src2_offset, + __global T *dst, int dst_step, int dst_offset, + int cols, int rows) { int x = get_global_id(0); int y = get_global_id(1); if (x < cols && y < rows) { - x = x << 2; + int src1_index = mad24(y, src1_step, x + src1_offset); + int src2_index = mad24(y, src2_step, x + src2_offset); + int dst_index = mad24(y, dst_step, x + dst_offset); -#ifdef dst_align -#undef dst_align -#endif -#define dst_align (dst_offset & 3) - int src1_index = mad24(y, src1_step, x + src1_offset - dst_align); - int src2_index = mad24(y, src2_step, x + src2_offset - dst_align); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc); - int src1_index_fix = src1_index < 0 ? 0 : src1_index; - int src2_index_fix = src2_index < 0 ? 0 : src2_index; - uchar4 src1_data = vload4(0, src1 + src1_index_fix); - uchar4 src2_data = vload4(0, src2 + src2_index_fix); - if(src1_index < 0) - { - uchar4 tmp; - tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx; - src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw; - } - if(src2_index < 0) - { - uchar4 tmp; - tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx; - src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw; - } - uchar4 dst_data = *((__global uchar4 *)(dst + dst_index)); - short4 tmp = ARITHM_OP(convert_short4_sat(src1_data), convert_short4_sat(src2_data)); - uchar4 tmp_data = convert_uchar4_sat(tmp); - - dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x; - dst_data.y = ((dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : dst_data.y; - dst_data.z = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : dst_data.z; - dst_data.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? 
tmp_data.w : dst_data.w; - - *((__global uchar4 *)(dst + dst_index)) = dst_data; + dst[dst_index] = convertToT(convertToWT(src1[src1_index]) Operation convertToWT(src2[src2_index])); } } -__kernel void arithm_add_D2 (__global ushort *src1, int src1_step, int src1_offset, - __global ushort *src2, int src2_step, int src2_offset, - __global ushort *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) +__kernel void arithm_binary_op_mat_div(__global T *src1, int src1_step, int src1_offset, + __global T *src2, int src2_step, int src2_offset, + __global T *dst, int dst_step, int dst_offset, + int cols, int rows) { int x = get_global_id(0); int y = get_global_id(1); if (x < cols && y < rows) { - x = x << 2; + int src1_index = mad24(y, src1_step, x + src1_offset); + int src2_index = mad24(y, src2_step, x + src2_offset); + int dst_index = mad24(y, dst_step, x + dst_offset); -#ifdef dst_align -#undef dst_align -#endif -#define dst_align ((dst_offset / 2) & 3) - int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); - int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1)); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffff8); - - ushort4 src1_data = vload4(0, (__global ushort *)((__global char *)src1 + src1_index)); - ushort4 src2_data = vload4(0, (__global ushort *)((__global char *)src2 + src2_index)); - - ushort4 dst_data = *((__global ushort4 *)((__global char *)dst + dst_index)); - int4 tmp = ARITHM_OP(convert_int4_sat(src1_data), convert_int4_sat(src2_data)); - ushort4 tmp_data = convert_ushort4_sat(tmp); - - dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x; - dst_data.y = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.y : dst_data.y; - dst_data.z = ((dst_index + 4 >= dst_start) && (dst_index + 4 < dst_end)) ? tmp_data.z : dst_data.z; - dst_data.w = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) ? tmp_data.w : dst_data.w; - - *((__global ushort4 *)((__global char *)dst + dst_index)) = dst_data; + T zero = (T)(0); + dst[dst_index] = src2[src2_index] == zero ? 
zero : convertToT(convertToWT(src1[src1_index]) / convertToWT(src2[src2_index])); } } -__kernel void arithm_add_D3 (__global short *src1, int src1_step, int src1_offset, - __global short *src2, int src2_step, int src2_offset, - __global short *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) + +__kernel void arithm_absdiff_mat(__global T *src1, int src1_step, int src1_offset, + __global T *src2, int src2_step, int src2_offset, + __global T *dst, int dst_step, int dst_offset, + int cols, int rows) { int x = get_global_id(0); int y = get_global_id(1); if (x < cols && y < rows) { - x = x << 2; + int src1_index = mad24(y, src1_step, x + src1_offset); + int src2_index = mad24(y, src2_step, x + src2_offset); + int dst_index = mad24(y, dst_step, x + dst_offset); -#ifdef dst_align -#undef dst_align -#endif -#define dst_align ((dst_offset / 2) & 3) - int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); - int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1)); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffff8); - - short4 src1_data = vload4(0, (__global short *)((__global char *)src1 + src1_index)); - short4 src2_data = vload4(0, (__global short *)((__global char *)src2 + src2_index)); - - short4 dst_data = *((__global short4 *)((__global char *)dst + dst_index)); - int4 tmp = ARITHM_OP(convert_int4_sat(src1_data), convert_int4_sat(src2_data)); - short4 tmp_data = convert_short4_sat(tmp); - - dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x; - dst_data.y = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.y : dst_data.y; - dst_data.z = ((dst_index + 4 >= dst_start) && (dst_index + 4 < dst_end)) ? tmp_data.z : dst_data.z; - dst_data.w = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) ? tmp_data.w : dst_data.w; - - *((__global short4 *)((__global char *)dst + dst_index)) = dst_data; + WT value = convertToWT(src1[src1_index]) - convertToWT(src2[src2_index]); + value = value > (WT)(0) ? 
value : -value; + dst[dst_index] = convertToT(value); } } -__kernel void arithm_add_D4 (__global int *src1, int src1_step, int src1_offset, - __global int *src2, int src2_step, int src2_offset, - __global int *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) +// add mat with scale for multiply +__kernel void arithm_binary_op_mat_scalar(__global T *src1, int src1_step, int src1_offset, + __global T *src2, int src2_step, int src2_offset, + __global WT *scalar, + __global T *dst, int dst_step, int dst_offset, + int cols, int rows) { int x = get_global_id(0); int y = get_global_id(1); if (x < cols && y < rows) { - int src1_index = mad24(y, src1_step, (x << 2) + src1_offset); - int src2_index = mad24(y, src2_step, (x << 2) + src2_offset); - int dst_index = mad24(y, dst_step, (x << 2) + dst_offset); + int src1_index = mad24(y, src1_step, x + src1_offset); + int src2_index = mad24(y, src2_step, x + src2_offset); + int dst_index = mad24(y, dst_step, x + dst_offset); - int data1 = *((__global int *)((__global char *)src1 + src1_index)); - int data2 = *((__global int *)((__global char *)src2 + src2_index)); - long tmp = ARITHM_OP((long)(data1), (long)(data2)); - - *((__global int *)((__global char *)dst + dst_index)) = convert_int_sat(tmp); + dst[dst_index] = convertToT(convertToWT(src1[src1_index]) * scalar[0] * convertToWT(src2[src2_index])); } } -__kernel void arithm_add_D5 (__global float *src1, int src1_step, int src1_offset, - __global float *src2, int src2_step, int src2_offset, - __global float *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) + +// add mat with scale for divide +__kernel void arithm_binary_op_mat_scalar_div(__global T *src1, int src1_step, int src1_offset, + __global T *src2, int src2_step, int src2_offset, + __global WT *scalar, + __global T *dst, int dst_step, int dst_offset, + int cols, int rows) { int x = get_global_id(0); int y = get_global_id(1); if (x < cols && y < rows) { - int src1_index = mad24(y, src1_step, (x << 2) + src1_offset); - int src2_index = mad24(y, src2_step, (x << 2) + src2_offset); - int dst_index = mad24(y, dst_step, (x << 2) + dst_offset); + int src1_index = mad24(y, src1_step, x + src1_offset); + int src2_index = mad24(y, src2_step, x + src2_offset); + int dst_index = mad24(y, dst_step, x + dst_offset); - float data1 = *((__global float *)((__global char *)src1 + src1_index)); - float data2 = *((__global float *)((__global char *)src2 + src2_index)); - float tmp = ARITHM_OP(data1, data2); - - *((__global float *)((__global char *)dst + dst_index)) = tmp; + T zero = (T)(0); + dst[dst_index] = src2[src2_index] == zero ? 
zero : + convertToT(convertToWT(src1[src1_index]) * scalar[0] / convertToWT(src2[src2_index])); } } - -#if defined (DOUBLE_SUPPORT) -__kernel void arithm_add_D6 (__global double *src1, int src1_step, int src1_offset, - __global double *src2, int src2_step, int src2_offset, - __global double *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 3) + src1_offset); - int src2_index = mad24(y, src2_step, (x << 3) + src2_offset); - int dst_index = mad24(y, dst_step, (x << 3) + dst_offset); - - double data1 = *((__global double *)((__global char *)src1 + src1_index)); - double data2 = *((__global double *)((__global char *)src2 + src2_index)); - - *((__global double *)((__global char *)dst + dst_index)) = ARITHM_OP(data1, data2); - } -} -#endif - -/**************************************add with mask**************************************/ -__kernel void arithm_add_with_mask_C1_D0 (__global uchar *src1, int src1_step, int src1_offset, - __global uchar *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global uchar *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 2; - -#ifdef dst_align -#undef dst_align -#endif -#define dst_align (dst_offset & 3) - int src1_index = mad24(y, src1_step, x + src1_offset - dst_align); - int src2_index = mad24(y, src2_step, x + src2_offset - dst_align); - int mask_index = mad24(y, mask_step, x + mask_offset - dst_align); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc); - int src1_index_fix = src1_index < 0 ? 0 : src1_index; - int src2_index_fix = src2_index < 0 ? 0 : src2_index; - int mask_index_fix = mask_index < 0 ? 0 : mask_index; - uchar4 src1_data = vload4(0, src1 + src1_index_fix); - uchar4 src2_data = vload4(0, src2 + src2_index_fix); - uchar4 mask_data = vload4(0, mask + mask_index_fix); - if(src1_index < 0) - { - uchar4 tmp; - tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx; - src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw; - } - if(src2_index < 0) - { - uchar4 tmp; - tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx; - src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw; - } - if(mask_index < 0) - { - uchar4 tmp; - tmp.xyzw = (mask_index == -2) ? mask_data.zwxy:mask_data.yzwx; - mask_data.xyzw = (mask_index == -1) ? mask_data.wxyz:tmp.xyzw; - } - - uchar4 data = *((__global uchar4 *)(dst + dst_index)); - short4 tmp = ARITHM_OP(convert_short4_sat(src1_data), convert_short4_sat(src2_data)); - uchar4 tmp_data = convert_uchar4_sat(tmp); - - data.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : data.x; - data.y = ((mask_data.y) && (dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : data.y; - data.z = ((mask_data.z) && (dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : data.z; - data.w = ((mask_data.w) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? 
tmp_data.w : data.w; - - *((__global uchar4 *)(dst + dst_index)) = data; - } -} -__kernel void arithm_add_with_mask_C1_D2 (__global ushort *src1, int src1_step, int src1_offset, - __global ushort *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global ushort *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 1; - -#ifdef dst_align -#undef dst_align -#endif -#define dst_align ((dst_offset / 2) & 1) - int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); - int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1)); - int mask_index = mad24(y, mask_step, x + mask_offset - dst_align); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffffc); - - ushort2 src1_data = vload2(0, (__global ushort *)((__global char *)src1 + src1_index)); - ushort2 src2_data = vload2(0, (__global ushort *)((__global char *)src2 + src2_index)); - uchar2 mask_data = vload2(0, mask + mask_index); - - ushort2 data = *((__global ushort2 *)((__global uchar *)dst + dst_index)); - int2 tmp = ARITHM_OP(convert_int2_sat(src1_data), convert_int2_sat(src2_data)); - ushort2 tmp_data = convert_ushort2_sat(tmp); - - data.x = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data.x : data.x; - data.y = ((mask_data.y) && (dst_index + 2 < dst_end )) ? tmp_data.y : data.y; - - *((__global ushort2 *)((__global uchar *)dst + dst_index)) = data; - } -} -__kernel void arithm_add_with_mask_C1_D3 (__global short *src1, int src1_step, int src1_offset, - __global short *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global short *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 1; - -#ifdef dst_align -#undef dst_align -#endif -#define dst_align ((dst_offset / 2) & 1) - int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); - int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1)); - int mask_index = mad24(y, mask_step, x + mask_offset - dst_align); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffffc); - - short2 src1_data = vload2(0, (__global short *)((__global char *)src1 + src1_index)); - short2 src2_data = vload2(0, (__global short *)((__global char *)src2 + src2_index)); - uchar2 mask_data = vload2(0, mask + mask_index); - - short2 data = *((__global short2 *)((__global uchar *)dst + dst_index)); - int2 tmp = ARITHM_OP(convert_int2_sat(src1_data), convert_int2_sat(src2_data)); - short2 tmp_data = convert_short2_sat(tmp); - - data.x = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data.x : data.x; - data.y = ((mask_data.y) && (dst_index + 2 < dst_end )) ? 
tmp_data.y : data.y; - - *((__global short2 *)((__global uchar *)dst + dst_index)) = data; - } -} -__kernel void arithm_add_with_mask_C1_D4 (__global int *src1, int src1_step, int src1_offset, - __global int *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global int *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 2) + src1_offset); - int src2_index = mad24(y, src2_step, (x << 2) + src2_offset); - int mask_index = mad24(y, mask_step, x + mask_offset); - int dst_index = mad24(y, dst_step, (x << 2) + dst_offset); - - uchar mask_data = *(mask + mask_index); - - int src_data1 = *((__global int *)((__global char *)src1 + src1_index)); - int src_data2 = *((__global int *)((__global char *)src2 + src2_index)); - int dst_data = *((__global int *)((__global char *)dst + dst_index)); - - int data = convert_int_sat(ARITHM_OP((long)src_data1, (long)src_data2)); - data = mask_data ? data : dst_data; - - *((__global int *)((__global char *)dst + dst_index)) = data; - } -} - -__kernel void arithm_add_with_mask_C1_D5 (__global float *src1, int src1_step, int src1_offset, - __global float *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global float *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 2) + src1_offset); - int src2_index = mad24(y, src2_step, (x << 2) + src2_offset); - int mask_index = mad24(y, mask_step, x + mask_offset); - int dst_index = mad24(y, dst_step, (x << 2) + dst_offset); - - uchar mask_data = *(mask + mask_index); - - float src_data1 = *((__global float *)((__global char *)src1 + src1_index)); - float src_data2 = *((__global float *)((__global char *)src2 + src2_index)); - float dst_data = *((__global float *)((__global char *)dst + dst_index)); - - float data = ARITHM_OP(src_data1, src_data2); - data = mask_data ? data : dst_data; - - *((__global float *)((__global char *)dst + dst_index)) = data; - } -} - -#if defined (DOUBLE_SUPPORT) -__kernel void arithm_add_with_mask_C1_D6 (__global double *src1, int src1_step, int src1_offset, - __global double *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global double *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 3) + src1_offset); - int src2_index = mad24(y, src2_step, (x << 3) + src2_offset); - int mask_index = mad24(y, mask_step, x + mask_offset); - int dst_index = mad24(y, dst_step, (x << 3) + dst_offset); - - uchar mask_data = *(mask + mask_index); - - double src_data1 = *((__global double *)((__global char *)src1 + src1_index)); - double src_data2 = *((__global double *)((__global char *)src2 + src2_index)); - double dst_data = *((__global double *)((__global char *)dst + dst_index)); - - double data = ARITHM_OP(src_data1, src_data2); - data = mask_data ? 
data : dst_data; - - *((__global double *)((__global char *)dst + dst_index)) = data; - } -} -#endif - -__kernel void arithm_add_with_mask_C2_D0 (__global uchar *src1, int src1_step, int src1_offset, - __global uchar *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global uchar *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 1; - -#ifdef dst_align -#undef dst_align -#endif -#define dst_align ((dst_offset / 2) & 1) - int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); - int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1)); - int mask_index = mad24(y, mask_step, x + mask_offset - dst_align); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffffc); - - uchar4 src1_data = vload4(0, src1 + src1_index); - uchar4 src2_data = vload4(0, src2 + src2_index); - uchar2 mask_data = vload2(0, mask + mask_index); - - uchar4 data = *((__global uchar4 *)(dst + dst_index)); - short4 tmp = ARITHM_OP(convert_short4_sat(src1_data), convert_short4_sat(src2_data)); - uchar4 tmp_data = convert_uchar4_sat(tmp); - - data.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data.xy : data.xy; - data.zw = ((mask_data.y) && (dst_index + 2 < dst_end )) ? tmp_data.zw : data.zw; - - *((__global uchar4 *)(dst + dst_index)) = data; - } -} -__kernel void arithm_add_with_mask_C2_D2 (__global ushort *src1, int src1_step, int src1_offset, - __global ushort *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global ushort *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 2) + src1_offset); - int src2_index = mad24(y, src2_step, (x << 2) + src2_offset); - int mask_index = mad24(y, mask_step, x + mask_offset); - int dst_index = mad24(y, dst_step, (x << 2) + dst_offset); - - uchar mask_data = *(mask + mask_index); - - ushort2 src_data1 = *((__global ushort2 *)((__global char *)src1 + src1_index)); - ushort2 src_data2 = *((__global ushort2 *)((__global char *)src2 + src2_index)); - ushort2 dst_data = *((__global ushort2 *)((__global char *)dst + dst_index)); - - int2 tmp = ARITHM_OP(convert_int2_sat(src_data1), convert_int2_sat(src_data2)); - ushort2 data = convert_ushort2_sat(tmp); - data = mask_data ? 
data : dst_data; - - *((__global ushort2 *)((__global char *)dst + dst_index)) = data; - } -} -__kernel void arithm_add_with_mask_C2_D3 (__global short *src1, int src1_step, int src1_offset, - __global short *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global short *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 2) + src1_offset); - int src2_index = mad24(y, src2_step, (x << 2) + src2_offset); - int mask_index = mad24(y, mask_step, x + mask_offset); - int dst_index = mad24(y, dst_step, (x << 2) + dst_offset); - - uchar mask_data = *(mask + mask_index); - - short2 src_data1 = *((__global short2 *)((__global char *)src1 + src1_index)); - short2 src_data2 = *((__global short2 *)((__global char *)src2 + src2_index)); - short2 dst_data = *((__global short2 *)((__global char *)dst + dst_index)); - - int2 tmp = ARITHM_OP(convert_int2_sat(src_data1), convert_int2_sat(src_data2)); - short2 data = convert_short2_sat(tmp); - data = mask_data ? data : dst_data; - - *((__global short2 *)((__global char *)dst + dst_index)) = data; - } -} -__kernel void arithm_add_with_mask_C2_D4 (__global int *src1, int src1_step, int src1_offset, - __global int *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global int *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 3) + src1_offset); - int src2_index = mad24(y, src2_step, (x << 3) + src2_offset); - int mask_index = mad24(y, mask_step, x + mask_offset); - int dst_index = mad24(y, dst_step, (x << 3) + dst_offset); - - uchar mask_data = *(mask + mask_index); - - int2 src_data1 = *((__global int2 *)((__global char *)src1 + src1_index)); - int2 src_data2 = *((__global int2 *)((__global char *)src2 + src2_index)); - int2 dst_data = *((__global int2 *)((__global char *)dst + dst_index)); - - int2 data = convert_int2_sat(ARITHM_OP(convert_long2_sat(src_data1), convert_long2_sat(src_data2))); - data = mask_data ? data : dst_data; - - *((__global int2 *)((__global char *)dst + dst_index)) = data; - } -} -__kernel void arithm_add_with_mask_C2_D5 (__global float *src1, int src1_step, int src1_offset, - __global float *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global float *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 3) + src1_offset); - int src2_index = mad24(y, src2_step, (x << 3) + src2_offset); - int mask_index = mad24(y, mask_step, x + mask_offset); - int dst_index = mad24(y, dst_step, (x << 3) + dst_offset); - - uchar mask_data = *(mask + mask_index); - - float2 src_data1 = *((__global float2 *)((__global char *)src1 + src1_index)); - float2 src_data2 = *((__global float2 *)((__global char *)src2 + src2_index)); - float2 dst_data = *((__global float2 *)((__global char *)dst + dst_index)); - - float2 data = ARITHM_OP(src_data1, src_data2); - data = mask_data ? 
data : dst_data; - - *((__global float2 *)((__global char *)dst + dst_index)) = data; - } -} - -#if defined (DOUBLE_SUPPORT) -__kernel void arithm_add_with_mask_C2_D6 (__global double *src1, int src1_step, int src1_offset, - __global double *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global double *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 4) + src1_offset); - int src2_index = mad24(y, src2_step, (x << 4) + src2_offset); - int mask_index = mad24(y, mask_step, x + mask_offset); - int dst_index = mad24(y, dst_step, (x << 4) + dst_offset); - - uchar mask_data = *(mask + mask_index); - - double2 src_data1 = *((__global double2 *)((__global char *)src1 + src1_index)); - double2 src_data2 = *((__global double2 *)((__global char *)src2 + src2_index)); - double2 dst_data = *((__global double2 *)((__global char *)dst + dst_index)); - - double2 data = ARITHM_OP(src_data1, src_data2); - data = mask_data ? data : dst_data; - - *((__global double2 *)((__global char *)dst + dst_index)) = data; - } -} -#endif - -__kernel void arithm_add_with_mask_C4_D0 (__global uchar *src1, int src1_step, int src1_offset, - __global uchar *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global uchar *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 2) + src1_offset); - int src2_index = mad24(y, src2_step, (x << 2) + src2_offset); - int mask_index = mad24(y, mask_step, x + mask_offset); - int dst_index = mad24(y, dst_step, (x << 2) + dst_offset); - - uchar mask_data = *(mask + mask_index); - - uchar4 src_data1 = *((__global uchar4 *)(src1 + src1_index)); - uchar4 src_data2 = *((__global uchar4 *)(src2 + src2_index)); - uchar4 dst_data = *((__global uchar4 *)(dst + dst_index)); - - uchar4 data = convert_uchar4_sat(ARITHM_OP(convert_short4_sat(src_data1), convert_short4_sat(src_data2))); - data = mask_data ? data : dst_data; - - *((__global uchar4 *)(dst + dst_index)) = data; - } -} -__kernel void arithm_add_with_mask_C4_D2 (__global ushort *src1, int src1_step, int src1_offset, - __global ushort *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global ushort *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 3) + src1_offset); - int src2_index = mad24(y, src2_step, (x << 3) + src2_offset); - int mask_index = mad24(y, mask_step, x + mask_offset); - int dst_index = mad24(y, dst_step, (x << 3) + dst_offset); - - uchar mask_data = *(mask + mask_index); - - ushort4 src_data1 = *((__global ushort4 *)((__global char *)src1 + src1_index)); - ushort4 src_data2 = *((__global ushort4 *)((__global char *)src2 + src2_index)); - ushort4 dst_data = *((__global ushort4 *)((__global char *)dst + dst_index)); - - ushort4 data = convert_ushort4_sat(ARITHM_OP(convert_int4_sat(src_data1), convert_int4_sat(src_data2))); - data = mask_data ? 
data : dst_data; - - *((__global ushort4 *)((__global char *)dst + dst_index)) = data; - } -} -__kernel void arithm_add_with_mask_C4_D3 (__global short *src1, int src1_step, int src1_offset, - __global short *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global short *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 3) + src1_offset); - int src2_index = mad24(y, src2_step, (x << 3) + src2_offset); - int mask_index = mad24(y, mask_step, x + mask_offset); - int dst_index = mad24(y, dst_step, (x << 3) + dst_offset); - - uchar mask_data = *(mask + mask_index); - - short4 src_data1 = *((__global short4 *)((__global char *)src1 + src1_index)); - short4 src_data2 = *((__global short4 *)((__global char *)src2 + src2_index)); - short4 dst_data = *((__global short4 *)((__global char *)dst + dst_index)); - - short4 data = convert_short4_sat(ARITHM_OP(convert_int4_sat(src_data1), convert_int4_sat(src_data2))); - data = mask_data ? data : dst_data; - - *((__global short4 *)((__global char *)dst + dst_index)) = data; - } -} -__kernel void arithm_add_with_mask_C4_D4 (__global int *src1, int src1_step, int src1_offset, - __global int *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global int *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 4) + src1_offset); - int src2_index = mad24(y, src2_step, (x << 4) + src2_offset); - int mask_index = mad24(y, mask_step, x + mask_offset); - int dst_index = mad24(y, dst_step, (x << 4) + dst_offset); - - uchar mask_data = *(mask + mask_index); - - int4 src_data1 = *((__global int4 *)((__global char *)src1 + src1_index)); - int4 src_data2 = *((__global int4 *)((__global char *)src2 + src2_index)); - int4 dst_data = *((__global int4 *)((__global char *)dst + dst_index)); - - int4 data = convert_int4_sat(ARITHM_OP(convert_long4_sat(src_data1), convert_long4_sat(src_data2))); - data = mask_data ? data : dst_data; - - *((__global int4 *)((__global char *)dst + dst_index)) = data; - } -} -__kernel void arithm_add_with_mask_C4_D5 (__global float *src1, int src1_step, int src1_offset, - __global float *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global float *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 4) + src1_offset); - int src2_index = mad24(y, src2_step, (x << 4) + src2_offset); - int mask_index = mad24(y, mask_step, x + mask_offset); - int dst_index = mad24(y, dst_step, (x << 4) + dst_offset); - - uchar mask_data = *(mask + mask_index); - - float4 src_data1 = *((__global float4 *)((__global char *)src1 + src1_index)); - float4 src_data2 = *((__global float4 *)((__global char *)src2 + src2_index)); - float4 dst_data = *((__global float4 *)((__global char *)dst + dst_index)); - - float4 data = ARITHM_OP(src_data1, src_data2); - data = mask_data ? 
data : dst_data; - - *((__global float4 *)((__global char *)dst + dst_index)) = data; - } -} - -#if defined (DOUBLE_SUPPORT) -__kernel void arithm_add_with_mask_C4_D6 (__global double *src1, int src1_step, int src1_offset, - __global double *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global double *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 5) + src1_offset); - int src2_index = mad24(y, src2_step, (x << 5) + src2_offset); - int mask_index = mad24(y, mask_step, x + mask_offset); - int dst_index = mad24(y, dst_step, (x << 5) + dst_offset); - - uchar mask_data = *(mask + mask_index); - - double4 src_data1 = *((__global double4 *)((__global char *)src1 + src1_index)); - double4 src_data2 = *((__global double4 *)((__global char *)src2 + src2_index)); - double4 dst_data = *((__global double4 *)((__global char *)dst + dst_index)); - - double4 data = ARITHM_OP(src_data1, src_data2); - data = mask_data ? data : dst_data; - - *((__global double4 *)((__global char *)dst + dst_index)) = data; - } -} -#endif diff --git a/modules/ocl/src/opencl/arithm_add_mask.cl b/modules/ocl/src/opencl/arithm_add_mask.cl new file mode 100644 index 000000000..52dbfc455 --- /dev/null +++ b/modules/ocl/src/opencl/arithm_add_mask.cl @@ -0,0 +1,79 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved. +// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved. +// Third party copyrights are property of their respective owners. +// +// @Authors +// Jia Haipeng, jiahaipeng95@gmail.com +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other oclMaterials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors as is and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. 
+// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +#if defined (DOUBLE_SUPPORT) +#ifdef cl_khr_fp64 +#pragma OPENCL EXTENSION cl_khr_fp64:enable +#elif defined (cl_amd_fp64) +#pragma OPENCL EXTENSION cl_amd_fp64:enable +#endif +#endif + +////////////////////////////////////////////////////////////////////////////////// +///////////////////////////////// add with mask ////////////////////////////////// +////////////////////////////////////////////////////////////////////////////////// + +__kernel void arithm_binary_op_mat_mask(__global T * src1, int src1_step, int src1_offset, + __global T * src2, int src2_step, int src2_offset, + __global uchar * mask, int mask_step, int mask_offset, + __global T * dst, int dst_step, int dst_offset, + int cols, int rows) +{ + int x = get_global_id(0); + int y = get_global_id(1); + + if (x < cols && y < rows) + { + int mask_index = mad24(y, mask_step, x + mask_offset); + if (mask[mask_index]) + { + int src1_index = mad24(y, src1_step, x + src1_offset); + int src2_index = mad24(y, src2_step, x + src2_offset); + int dst_index = mad24(y, dst_step, dst_offset + x); + + dst[dst_index] = convertToT(convertToWT(src1[src1_index]) Operation convertToWT(src2[src2_index])); + } + } +} diff --git a/modules/ocl/src/opencl/arithm_add_scalar.cl b/modules/ocl/src/opencl/arithm_add_scalar.cl index cdb79f37e..4e0c7fc5f 100644 --- a/modules/ocl/src/opencl/arithm_add_scalar.cl +++ b/modules/ocl/src/opencl/arithm_add_scalar.cl @@ -51,463 +51,61 @@ #endif #endif -#ifdef ARITHM_ADD - #define ARITHM_OP(A,B) ((A)+(B)) -#elif defined ARITHM_SUB - #define ARITHM_OP(A,B) ((A)-(B)) -#endif -/**************************************add with scalar without mask**************************************/ -__kernel void arithm_s_add_C1_D0 (__global uchar *src1, int src1_step, int src1_offset, - __global uchar *dst, int dst_step, int dst_offset, - int4 src2, int rows, int cols, int dst_step1) +/////////////////////////////////////////////////////////////////////////////////// +///////////////////////////////// Add with scalar ///////////////////////////////// +/////////////////////////////////////////////////////////////////////////////////// + +__kernel void arithm_binary_op_scalar (__global T *src1, int src1_step, int src1_offset, + __global WT *scalar, + __global T *dst, int dst_step, int dst_offset, + int cols, int rows) { int x = get_global_id(0); int y = get_global_id(1); if (x < cols && y < rows) { - x = x << 2; + int src1_index = mad24(y, src1_step, x + src1_offset); + int dst_index = mad24(y, dst_step, x + dst_offset); -#ifdef dst_align -#undef dst_align -#endif -#define dst_align (dst_offset & 3) - int src1_index = mad24(y, src1_step, x + src1_offset - dst_align); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc); - int src1_index_fix = src1_index < 0 ? 
0 : src1_index; - uchar4 src1_data = vload4(0, src1 + src1_index_fix); - int4 src2_data = (int4)(src2.x, src2.x, src2.x, src2.x); - if(src1_index < 0) - { - uchar4 tmp; - tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx; - src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw; - } - - uchar4 data = *((__global uchar4 *)(dst + dst_index)); - int4 tmp = ARITHM_OP(convert_int4_sat(src1_data), src2_data); - uchar4 tmp_data = convert_uchar4_sat(tmp); - - data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : data.x; - data.y = ((dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : data.y; - data.z = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : data.z; - data.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? tmp_data.w : data.w; - - *((__global uchar4 *)(dst + dst_index)) = data; + dst[dst_index] = convertToT(convertToWT(src1[src1_index]) Operation scalar[0]); } } -__kernel void arithm_s_add_C1_D2 (__global ushort *src1, int src1_step, int src1_offset, - __global ushort *dst, int dst_step, int dst_offset, - int4 src2, int rows, int cols, int dst_step1) -{ +__kernel void arithm_absdiff_scalar(__global T *src1, int src1_step, int src1_offset, + __global WT *src2, + __global T *dst, int dst_step, int dst_offset, + int cols, int rows) +{ int x = get_global_id(0); int y = get_global_id(1); if (x < cols && y < rows) { - x = x << 1; + int src1_index = mad24(y, src1_step, x + src1_offset); + int dst_index = mad24(y, dst_step, x + dst_offset); -#ifdef dst_align -#undef dst_align -#endif -#define dst_align ((dst_offset >> 1) & 1) - int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffffc); - - ushort2 src1_data = vload2(0, (__global ushort *)((__global char *)src1 + src1_index)); - int2 src2_data = (int2)(src2.x, src2.x); - - ushort2 data = *((__global ushort2 *)((__global uchar *)dst + dst_index)); - int2 tmp = ARITHM_OP(convert_int2_sat(src1_data), src2_data); - ushort2 tmp_data = convert_ushort2_sat(tmp); - - data.x = (dst_index + 0 >= dst_start) ? tmp_data.x : data.x; - data.y = (dst_index + 2 < dst_end ) ? tmp_data.y : data.y; - - *((__global ushort2 *)((__global uchar *)dst + dst_index)) = data; + WT value = convertToWT(src1[src1_index]) - src2[0]; + value = value > (WT)(0) ? 
value : -value; + dst[dst_index] = convertToT(value); } } -__kernel void arithm_s_add_C1_D3 (__global short *src1, int src1_step, int src1_offset, - __global short *dst, int dst_step, int dst_offset, - int4 src2, int rows, int cols, int dst_step1) -{ +// scalar divide to matrix +__kernel void arithm_binary_op_scalar_div(__global T *src1, int src1_step, int src1_offset, + __global WT *scalar, + __global T *dst, int dst_step, int dst_offset, + int cols, int rows) +{ int x = get_global_id(0); int y = get_global_id(1); if (x < cols && y < rows) { - x = x << 1; + int src1_index = mad24(y, src1_step, x + src1_offset); + int dst_index = mad24(y, dst_step, x + dst_offset); -#ifdef dst_align -#undef dst_align -#endif -#define dst_align ((dst_offset >> 1) & 1) - int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffffc); - - short2 src1_data = vload2(0, (__global short *)((__global char *)src1 + src1_index)); - int2 src2_data = (int2)(src2.x, src2.x); - short2 data = *((__global short2 *)((__global uchar *)dst + dst_index)); - - int2 tmp = ARITHM_OP(convert_int2_sat(src1_data), src2_data); - short2 tmp_data = convert_short2_sat(tmp); - - data.x = (dst_index + 0 >= dst_start) ? tmp_data.x : data.x; - data.y = (dst_index + 2 < dst_end ) ? tmp_data.y : data.y; - - *((__global short2 *)((__global uchar *)dst + dst_index)) = data; + T zero = (T)(0); + dst[dst_index] = src1[src1_index] == zero ? zero : convertToT(scalar[0] / convertToWT(src1[src1_index])); } } -__kernel void arithm_s_add_C1_D4 (__global int *src1, int src1_step, int src1_offset, - __global int *dst, int dst_step, int dst_offset, - int4 src2, int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 2) + src1_offset); - int dst_index = mad24(y, dst_step, (x << 2) + dst_offset); - - int src_data1 = *((__global int *)((__global char *)src1 + src1_index)); - int src_data2 = src2.x; - int dst_data = *((__global int *)((__global char *)dst + dst_index)); - - int data = convert_int_sat(ARITHM_OP((long)src_data1, (long)src_data2)); - - *((__global int *)((__global char *)dst + dst_index)) = data; - } -} -__kernel void arithm_s_add_C1_D5 (__global float *src1, int src1_step, int src1_offset, - __global float *dst, int dst_step, int dst_offset, - float4 src2, int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 2) + src1_offset); - int dst_index = mad24(y, dst_step, (x << 2) + dst_offset); - - float src_data1 = *((__global float *)((__global char *)src1 + src1_index)); - float src_data2 = src2.x; - float dst_data = *((__global float *)((__global char *)dst + dst_index)); - - float data = ARITHM_OP(src_data1, src_data2); - - *((__global float *)((__global char *)dst + dst_index)) = data; - } -} - -#if defined (DOUBLE_SUPPORT) -__kernel void arithm_s_add_C1_D6 (__global double *src1, int src1_step, int src1_offset, - __global double *dst, int dst_step, int dst_offset, - double4 src2, int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 3) + src1_offset); - int dst_index = mad24(y, 
dst_step, (x << 3) + dst_offset); - - double src_data1 = *((__global double *)((__global char *)src1 + src1_index)); - double src2_data = src2.x; - double dst_data = *((__global double *)((__global char *)dst + dst_index)); - - double data = ARITHM_OP(src_data1, src2_data); - - *((__global double *)((__global char *)dst + dst_index)) = data; - } -} -#endif - -__kernel void arithm_s_add_C2_D0 (__global uchar *src1, int src1_step, int src1_offset, - __global uchar *dst, int dst_step, int dst_offset, - int4 src2, int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 1; - -#ifdef dst_align -#undef dst_align -#endif -#define dst_align ((dst_offset >> 1) & 1) - int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffffc); - - uchar4 src1_data = vload4(0, src1 + src1_index); - int4 src2_data = (int4)(src2.x, src2.y, src2.x, src2.y); - - uchar4 data = *((__global uchar4 *)(dst + dst_index)); - int4 tmp = ARITHM_OP(convert_int4_sat(src1_data), src2_data); - uchar4 tmp_data = convert_uchar4_sat(tmp); - - data.xy = (dst_index + 0 >= dst_start) ? tmp_data.xy : data.xy; - data.zw = (dst_index + 2 < dst_end ) ? tmp_data.zw : data.zw; - - *((__global uchar4 *)(dst + dst_index)) = data; - } -} -__kernel void arithm_s_add_C2_D2 (__global ushort *src1, int src1_step, int src1_offset, - __global ushort *dst, int dst_step, int dst_offset, - int4 src2, int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 2) + src1_offset); - int dst_index = mad24(y, dst_step, (x << 2) + dst_offset); - - ushort2 src_data1 = *((__global ushort2 *)((__global char *)src1 + src1_index)); - int2 src_data2 = (int2)(src2.x, src2.y); - ushort2 dst_data = *((__global ushort2 *)((__global char *)dst + dst_index)); - - int2 tmp = ARITHM_OP(convert_int2_sat(src_data1), src_data2); - ushort2 data = convert_ushort2_sat(tmp); - - *((__global ushort2 *)((__global char *)dst + dst_index)) = data; - } -} -__kernel void arithm_s_add_C2_D3 (__global short *src1, int src1_step, int src1_offset, - __global short *dst, int dst_step, int dst_offset, - int4 src2, int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 2) + src1_offset); - int dst_index = mad24(y, dst_step, (x << 2) + dst_offset); - - short2 src_data1 = *((__global short2 *)((__global char *)src1 + src1_index)); - int2 src_data2 = (int2)(src2.x, src2.y); - short2 dst_data = *((__global short2 *)((__global char *)dst + dst_index)); - - int2 tmp = ARITHM_OP(convert_int2_sat(src_data1), src_data2); - short2 data = convert_short2_sat(tmp); - - *((__global short2 *)((__global char *)dst + dst_index)) = data; - } -} -__kernel void arithm_s_add_C2_D4 (__global int *src1, int src1_step, int src1_offset, - __global int *dst, int dst_step, int dst_offset, - int4 src2, int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 3) + src1_offset); - int dst_index = mad24(y, dst_step, (x << 3) + dst_offset); - - int2 src_data1 = *((__global int2 *)((__global 
char *)src1 + src1_index)); - int2 src_data2 = (int2)(src2.x, src2.y); - int2 dst_data = *((__global int2 *)((__global char *)dst + dst_index)); - - int2 data = convert_int2_sat(ARITHM_OP(convert_long2_sat(src_data1), convert_long2_sat(src_data2))); - *((__global int2 *)((__global char *)dst + dst_index)) = data; - } -} -__kernel void arithm_s_add_C2_D5 (__global float *src1, int src1_step, int src1_offset, - __global float *dst, int dst_step, int dst_offset, - float4 src2, int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 3) + src1_offset); - int dst_index = mad24(y, dst_step, (x << 3) + dst_offset); - - float2 src_data1 = *((__global float2 *)((__global char *)src1 + src1_index)); - float2 src_data2 = (float2)(src2.x, src2.y); - float2 dst_data = *((__global float2 *)((__global char *)dst + dst_index)); - - float2 data = ARITHM_OP(src_data1, src_data2); - *((__global float2 *)((__global char *)dst + dst_index)) = data; - } -} - -#if defined (DOUBLE_SUPPORT) -__kernel void arithm_s_add_C2_D6 (__global double *src1, int src1_step, int src1_offset, - __global double *dst, int dst_step, int dst_offset, - double4 src2, int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 4) + src1_offset); - int dst_index = mad24(y, dst_step, (x << 4) + dst_offset); - - double2 src_data1 = *((__global double2 *)((__global char *)src1 + src1_index)); - double2 src_data2 = (double2)(src2.x, src2.y); - double2 dst_data = *((__global double2 *)((__global char *)dst + dst_index)); - - double2 data = ARITHM_OP(src_data1, src_data2); - - *((__global double2 *)((__global char *)dst + dst_index)) = data; - } -} -#endif - -__kernel void arithm_s_add_C4_D0 (__global uchar *src1, int src1_step, int src1_offset, - __global uchar *dst, int dst_step, int dst_offset, - int4 src2, int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 2) + src1_offset); - int dst_index = mad24(y, dst_step, (x << 2) + dst_offset); - - uchar4 src_data1 = *((__global uchar4 *)(src1 + src1_index)); - - uchar4 data = convert_uchar4_sat(ARITHM_OP(convert_int4_sat(src_data1), src2)); - - *((__global uchar4 *)(dst + dst_index)) = data; - } -} -__kernel void arithm_s_add_C4_D2 (__global ushort *src1, int src1_step, int src1_offset, - __global ushort *dst, int dst_step, int dst_offset, - int4 src2, int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 3) + src1_offset); - int dst_index = mad24(y, dst_step, (x << 3) + dst_offset); - - ushort4 src_data1 = *((__global ushort4 *)((__global char *)src1 + src1_index)); - - ushort4 data = convert_ushort4_sat(ARITHM_OP(convert_int4_sat(src_data1), src2)); - - *((__global ushort4 *)((__global char *)dst + dst_index)) = data; - } -} -__kernel void arithm_s_add_C4_D3 (__global short *src1, int src1_step, int src1_offset, - __global short *dst, int dst_step, int dst_offset, - int4 src2, int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 3) + src1_offset); - int dst_index = mad24(y, dst_step, (x << 3) + 
dst_offset); - - short4 src_data1 = *((__global short4 *)((__global char *)src1 + src1_index)); - - short4 data = convert_short4_sat(ARITHM_OP(convert_int4_sat(src_data1), src2)); - - *((__global short4 *)((__global char *)dst + dst_index)) = data; - } -} -__kernel void arithm_s_add_C4_D4 (__global int *src1, int src1_step, int src1_offset, - __global int *dst, int dst_step, int dst_offset, - int4 src2, int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 4) + src1_offset); - int dst_index = mad24(y, dst_step, (x << 4) + dst_offset); - - int4 src_data1 = *((__global int4 *)((__global char *)src1 + src1_index)); - - int4 data = convert_int4_sat(ARITHM_OP(convert_long4_sat(src_data1), convert_long4_sat(src2))); - - *((__global int4 *)((__global char *)dst + dst_index)) = data; - } -} -__kernel void arithm_s_add_C4_D5 (__global float *src1, int src1_step, int src1_offset, - __global float *dst, int dst_step, int dst_offset, - float4 src2, int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 4) + src1_offset); - int dst_index = mad24(y, dst_step, (x << 4) + dst_offset); - - float4 src_data1 = *((__global float4 *)((__global char *)src1 + src1_index)); - - float4 data = ARITHM_OP(src_data1, src2); - - *((__global float4 *)((__global char *)dst + dst_index)) = data; - } -} - -#if defined (DOUBLE_SUPPORT) -__kernel void arithm_s_add_C4_D6 (__global double *src1, int src1_step, int src1_offset, - __global double *dst, int dst_step, int dst_offset, - double4 src2, int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 5) + src1_offset); - int dst_index = mad24(y, dst_step, (x << 5) + dst_offset); - - double4 src_data1 = *((__global double4 *)((__global char *)src1 + src1_index)); - - double4 data = ARITHM_OP(src_data1, src2); - - *((__global double4 *)((__global char *)dst + dst_index)) = data; - } -} -#endif diff --git a/modules/ocl/src/opencl/arithm_add_scalar_mask.cl b/modules/ocl/src/opencl/arithm_add_scalar_mask.cl index a0cb7dacb..5c3408034 100644 --- a/modules/ocl/src/opencl/arithm_add_scalar_mask.cl +++ b/modules/ocl/src/opencl/arithm_add_scalar_mask.cl @@ -51,561 +51,28 @@ #endif #endif -#ifdef ARITHM_ADD - #define ARITHM_OP(A,B) ((A)+(B)) -#elif defined ARITHM_SUB - #define ARITHM_OP(A,B) ((A)-(B)) -#endif -/**************************************add with scalar with mask**************************************/ -__kernel void arithm_s_add_with_mask_C1_D0 (__global uchar *src1, int src1_step, int src1_offset, - __global uchar *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - int4 src2, int rows, int cols, int dst_step1) -{ +/////////////////////////////////////////////////////////////////////////////////// +//////////////////////////// Add with scalar with mask //////////////////////////// +/////////////////////////////////////////////////////////////////////////////////// +__kernel void arithm_binary_op_scalar_mask(__global T *src1, int src1_step, int src1_offset, + __global WT *scalar, + __global uchar *mask, int mask_step, int mask_offset, + __global T *dst, int dst_step, int dst_offset, + int cols, int rows) +{ int x = get_global_id(0); int y = get_global_id(1); if (x < cols && y < rows) 
{ - x = x << 2; - -#ifdef dst_align -#undef dst_align -#endif -#define dst_align (dst_offset & 3) - int src1_index = mad24(y, src1_step, x + src1_offset - dst_align); - int mask_index = mad24(y, mask_step, x + mask_offset - dst_align); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc); - int src1_index_fix = src1_index < 0 ? 0 : src1_index; - int mask_index_fix = mask_index < 0 ? 0 : mask_index; - uchar4 src1_data = vload4(0, src1 + src1_index_fix); - int4 src2_data = (int4)(src2.x, src2.x, src2.x, src2.x); - uchar4 mask_data = vload4(0, mask + mask_index_fix); - if(src1_index < 0) + int mask_index = mad24(y, mask_step, x + mask_offset); + if (mask[mask_index]) { - uchar4 tmp; - tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx; - src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw; + int src1_index = mad24(y, src1_step, x + src1_offset); + int dst_index = mad24(y, dst_step, dst_offset + x); + + dst[dst_index] = convertToT(convertToWT(src1[src1_index]) Operation scalar[0]); } - if(mask_index < 0) - { - uchar4 tmp; - tmp.xyzw = (mask_index == -2) ? mask_data.zwxy:mask_data.yzwx; - mask_data.xyzw = (mask_index == -1) ? mask_data.wxyz:tmp.xyzw; - } - - uchar4 data = *((__global uchar4 *)(dst + dst_index)); - int4 tmp = ARITHM_OP(convert_int4_sat(src1_data), src2_data); - uchar4 tmp_data = convert_uchar4_sat(tmp); - - data.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : data.x; - data.y = ((mask_data.y) && (dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : data.y; - data.z = ((mask_data.z) && (dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : data.z; - data.w = ((mask_data.w) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? tmp_data.w : data.w; - - *((__global uchar4 *)(dst + dst_index)) = data; } } -__kernel void arithm_s_add_with_mask_C1_D2 (__global ushort *src1, int src1_step, int src1_offset, - __global ushort *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - int4 src2, int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 1; - -#ifdef dst_align -#undef dst_align -#endif -#define dst_align ((dst_offset / 2) & 1) - int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); - int mask_index = mad24(y, mask_step, x + mask_offset - dst_align); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffffc); - - ushort2 src1_data = vload2(0, (__global ushort *)((__global char *)src1 + src1_index)); - int2 src2_data = (int2)(src2.x, src2.x); - uchar2 mask_data = vload2(0, mask + mask_index); - - ushort2 data = *((__global ushort2 *)((__global uchar *)dst + dst_index)); - int2 tmp = ARITHM_OP(convert_int2_sat(src1_data), src2_data); - ushort2 tmp_data = convert_ushort2_sat(tmp); - - data.x = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data.x : data.x; - data.y = ((mask_data.y) && (dst_index + 2 < dst_end )) ? 
tmp_data.y : data.y; - - *((__global ushort2 *)((__global uchar *)dst + dst_index)) = data; - } -} -__kernel void arithm_s_add_with_mask_C1_D3 (__global short *src1, int src1_step, int src1_offset, - __global short *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - int4 src2, int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 1; - -#ifdef dst_align -#undef dst_align -#endif -#define dst_align ((dst_offset / 2) & 1) - int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); - int mask_index = mad24(y, mask_step, x + mask_offset - dst_align); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffffc); - - short2 src1_data = vload2(0, (__global short *)((__global char *)src1 + src1_index)); - int2 src2_data = (int2)(src2.x, src2.x); - uchar2 mask_data = vload2(0, mask + mask_index); - - short2 data = *((__global short2 *)((__global uchar *)dst + dst_index)); - int2 tmp = ARITHM_OP(convert_int2_sat(src1_data), src2_data); - short2 tmp_data = convert_short2_sat(tmp); - - data.x = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data.x : data.x; - data.y = ((mask_data.y) && (dst_index + 2 < dst_end )) ? tmp_data.y : data.y; - - *((__global short2 *)((__global uchar *)dst + dst_index)) = data; - } -} -__kernel void arithm_s_add_with_mask_C1_D4 (__global int *src1, int src1_step, int src1_offset, - __global int *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - int4 src2, int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 2) + src1_offset); - int mask_index = mad24(y, mask_step, x + mask_offset); - int dst_index = mad24(y, dst_step, (x << 2) + dst_offset); - - uchar mask_data = *(mask + mask_index); - - int src_data1 = *((__global int *)((__global char *)src1 + src1_index)); - int src_data2 = src2.x; - int dst_data = *((__global int *)((__global char *)dst + dst_index)); - - int data = convert_int_sat(ARITHM_OP((long)src_data1, (long)src_data2)); - data = mask_data ? data : dst_data; - - *((__global int *)((__global char *)dst + dst_index)) = data; - } -} - -__kernel void arithm_s_add_with_mask_C1_D5 (__global float *src1, int src1_step, int src1_offset, - __global float *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - float4 src2, int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 2) + src1_offset); - int mask_index = mad24(y, mask_step, x + mask_offset); - int dst_index = mad24(y, dst_step, (x << 2) + dst_offset); - - uchar mask_data = *(mask + mask_index); - - float src_data1 = *((__global float *)((__global char *)src1 + src1_index)); - float src_data2 = src2.x; - float dst_data = *((__global float *)((__global char *)dst + dst_index)); - - float data = ARITHM_OP(src_data1, src_data2); - data = mask_data ? 
data : dst_data; - - *((__global float *)((__global char *)dst + dst_index)) = data; - } -} - - -#if defined (DOUBLE_SUPPORT) -__kernel void arithm_s_add_with_mask_C1_D6 (__global double *src1, int src1_step, int src1_offset, - __global double *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - double4 src2, int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 3) + src1_offset); - int mask_index = mad24(y, mask_step, x + mask_offset); - int dst_index = mad24(y, dst_step, (x << 3) + dst_offset); - - uchar mask_data = *(mask + mask_index); - - double src_data1 = *((__global double *)((__global char *)src1 + src1_index)); - double src_data2 = src2.x; - double dst_data = *((__global double *)((__global char *)dst + dst_index)); - - double data = ARITHM_OP(src_data1, src_data2); - data = mask_data ? data : dst_data; - - *((__global double *)((__global char *)dst + dst_index)) = data; - } -} -#endif -__kernel void arithm_s_add_with_mask_C2_D0 (__global uchar *src1, int src1_step, int src1_offset, - __global uchar *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - int4 src2, int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 1; - -#ifdef dst_align -#undef dst_align -#endif -#define dst_align ((dst_offset / 2) & 1) - int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); - int mask_index = mad24(y, mask_step, x + mask_offset - dst_align); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffffc); - - uchar4 src1_data = vload4(0, src1 + src1_index); - int4 src2_data = (int4)(src2.x, src2.y, src2.x, src2.y); - uchar2 mask_data = vload2(0, mask + mask_index); - - uchar4 data = *((__global uchar4 *)(dst + dst_index)); - int4 tmp = ARITHM_OP(convert_int4_sat(src1_data), src2_data); - uchar4 tmp_data = convert_uchar4_sat(tmp); - - data.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data.xy : data.xy; - data.zw = ((mask_data.y) && (dst_index + 2 < dst_end )) ? tmp_data.zw : data.zw; - - *((__global uchar4 *)(dst + dst_index)) = data; - } -} -__kernel void arithm_s_add_with_mask_C2_D2 (__global ushort *src1, int src1_step, int src1_offset, - __global ushort *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - int4 src2, int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 2) + src1_offset); - int mask_index = mad24(y, mask_step, x + mask_offset); - int dst_index = mad24(y, dst_step, (x << 2) + dst_offset); - - uchar mask_data = *(mask + mask_index); - - ushort2 src_data1 = *((__global ushort2 *)((__global char *)src1 + src1_index)); - int2 src_data2 = (int2)(src2.x, src2.y); - ushort2 dst_data = *((__global ushort2 *)((__global char *)dst + dst_index)); - - int2 tmp = ARITHM_OP(convert_int2_sat(src_data1), src_data2); - ushort2 data = convert_ushort2_sat(tmp); - data = mask_data ? 
data : dst_data; - - *((__global ushort2 *)((__global char *)dst + dst_index)) = data; - } -} -__kernel void arithm_s_add_with_mask_C2_D3 (__global short *src1, int src1_step, int src1_offset, - __global short *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - int4 src2, int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 2) + src1_offset); - int mask_index = mad24(y, mask_step, x + mask_offset); - int dst_index = mad24(y, dst_step, (x << 2) + dst_offset); - - uchar mask_data = *(mask + mask_index); - - short2 src_data1 = *((__global short2 *)((__global char *)src1 + src1_index)); - int2 src_data2 = (int2)(src2.x, src2.y); - short2 dst_data = *((__global short2 *)((__global char *)dst + dst_index)); - - int2 tmp = ARITHM_OP(convert_int2_sat(src_data1), src_data2); - short2 data = convert_short2_sat(tmp); - data = mask_data ? data : dst_data; - - *((__global short2 *)((__global char *)dst + dst_index)) = data; - } -} -__kernel void arithm_s_add_with_mask_C2_D4 (__global int *src1, int src1_step, int src1_offset, - __global int *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - int4 src2, int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 3) + src1_offset); - int mask_index = mad24(y, mask_step, x + mask_offset); - int dst_index = mad24(y, dst_step, (x << 3) + dst_offset); - - uchar mask_data = *(mask + mask_index); - - int2 src_data1 = *((__global int2 *)((__global char *)src1 + src1_index)); - int2 src_data2 = (int2)(src2.x, src2.y); - int2 dst_data = *((__global int2 *)((__global char *)dst + dst_index)); - - int2 data = convert_int2_sat(ARITHM_OP(convert_long2_sat(src_data1), convert_long2_sat(src_data2))); - data = mask_data ? data : dst_data; - - *((__global int2 *)((__global char *)dst + dst_index)) = data; - } -} -__kernel void arithm_s_add_with_mask_C2_D5 (__global float *src1, int src1_step, int src1_offset, - __global float *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - float4 src2, int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 3) + src1_offset); - int mask_index = mad24(y, mask_step, x + mask_offset); - int dst_index = mad24(y, dst_step, (x << 3) + dst_offset); - - uchar mask_data = *(mask + mask_index); - - float2 src_data1 = *((__global float2 *)((__global char *)src1 + src1_index)); - float2 src_data2 = (float2)(src2.x, src2.y); - float2 dst_data = *((__global float2 *)((__global char *)dst + dst_index)); - - float2 data = ARITHM_OP(src_data1, src_data2); - data = mask_data ? 
data : dst_data; - - *((__global float2 *)((__global char *)dst + dst_index)) = data; - } -} - -#if defined (DOUBLE_SUPPORT) -__kernel void arithm_s_add_with_mask_C2_D6 (__global double *src1, int src1_step, int src1_offset, - __global double *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - double4 src2, int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 4) + src1_offset); - int mask_index = mad24(y, mask_step, x + mask_offset); - int dst_index = mad24(y, dst_step, (x << 4) + dst_offset); - - uchar mask_data = *(mask + mask_index); - - double2 src_data1 = *((__global double2 *)((__global char *)src1 + src1_index)); - double2 src_data2 = (double2)(src2.x, src2.y); - double2 dst_data = *((__global double2 *)((__global char *)dst + dst_index)); - - double2 data = ARITHM_OP(src_data1, src_data2); - data = mask_data ? data : dst_data; - - *((__global double2 *)((__global char *)dst + dst_index)) = data; - } -} -#endif - -__kernel void arithm_s_add_with_mask_C4_D0 (__global uchar *src1, int src1_step, int src1_offset, - __global uchar *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - int4 src2, int rows, int cols, int dst_step1) -{ - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 2) + src1_offset); - int mask_index = mad24(y, mask_step, x + mask_offset); - int dst_index = mad24(y, dst_step, (x << 2) + dst_offset); - - uchar mask_data = *(mask + mask_index); - - uchar4 src_data1 = *((__global uchar4 *)(src1 + src1_index)); - uchar4 dst_data = *((__global uchar4 *)(dst + dst_index)); - - uchar4 data = convert_uchar4_sat(ARITHM_OP(convert_int4_sat(src_data1), src2)); - data = mask_data ? data : dst_data; - - *((__global uchar4 *)(dst + dst_index)) = data; - } -} -__kernel void arithm_s_add_with_mask_C4_D2 (__global ushort *src1, int src1_step, int src1_offset, - __global ushort *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - int4 src2, int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 3) + src1_offset); - int mask_index = mad24(y, mask_step, x + mask_offset); - int dst_index = mad24(y, dst_step, (x << 3) + dst_offset); - - uchar mask_data = *(mask + mask_index); - - ushort4 src_data1 = *((__global ushort4 *)((__global char *)src1 + src1_index)); - ushort4 dst_data = *((__global ushort4 *)((__global char *)dst + dst_index)); - - ushort4 data = convert_ushort4_sat(ARITHM_OP(convert_int4_sat(src_data1), src2)); - data = mask_data ? 
data : dst_data; - - *((__global ushort4 *)((__global char *)dst + dst_index)) = data; - } -} -__kernel void arithm_s_add_with_mask_C4_D3 (__global short *src1, int src1_step, int src1_offset, - __global short *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - int4 src2, int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 3) + src1_offset); - int mask_index = mad24(y, mask_step, x + mask_offset); - int dst_index = mad24(y, dst_step, (x << 3) + dst_offset); - - uchar mask_data = *(mask + mask_index); - - short4 src_data1 = *((__global short4 *)((__global char *)src1 + src1_index)); - short4 dst_data = *((__global short4 *)((__global char *)dst + dst_index)); - - short4 data = convert_short4_sat(ARITHM_OP(convert_int4_sat(src_data1), src2)); - data = mask_data ? data : dst_data; - - *((__global short4 *)((__global char *)dst + dst_index)) = data; - } -} -__kernel void arithm_s_add_with_mask_C4_D4 (__global int *src1, int src1_step, int src1_offset, - __global int *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - int4 src2, int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 4) + src1_offset); - int mask_index = mad24(y, mask_step, x + mask_offset); - int dst_index = mad24(y, dst_step, (x << 4) + dst_offset); - - uchar mask_data = *(mask + mask_index); - - int4 src_data1 = *((__global int4 *)((__global char *)src1 + src1_index)); - int4 dst_data = *((__global int4 *)((__global char *)dst + dst_index)); - - int4 data = convert_int4_sat(ARITHM_OP(convert_long4_sat(src_data1), convert_long4_sat(src2))); - data = mask_data ? data : dst_data; - - *((__global int4 *)((__global char *)dst + dst_index)) = data; - } -} -__kernel void arithm_s_add_with_mask_C4_D5 (__global float *src1, int src1_step, int src1_offset, - __global float *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - float4 src2, int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 4) + src1_offset); - int mask_index = mad24(y, mask_step, x + mask_offset); - int dst_index = mad24(y, dst_step, (x << 4) + dst_offset); - - uchar mask_data = *(mask + mask_index); - - float4 src_data1 = *((__global float4 *)((__global char *)src1 + src1_index)); - float4 dst_data = *((__global float4 *)((__global char *)dst + dst_index)); - - float4 data = ARITHM_OP(src_data1, src2); - data = mask_data ? 
data : dst_data; - - *((__global float4 *)((__global char *)dst + dst_index)) = data; - } -} - -#if defined (DOUBLE_SUPPORT) -__kernel void arithm_s_add_with_mask_C4_D6 (__global double *src1, int src1_step, int src1_offset, - __global double *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - double4 src2, int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 5) + src1_offset); - int mask_index = mad24(y, mask_step, x + mask_offset); - int dst_index = mad24(y, dst_step, (x << 5) + dst_offset); - - uchar mask_data = *(mask + mask_index); - - double4 src_data1 = *((__global double4 *)((__global char *)src1 + src1_index)); - double4 dst_data = *((__global double4 *)((__global char *)dst + dst_index)); - - double4 data = ARITHM_OP(src_data1, src2); - data = mask_data ? data : dst_data; - - *((__global double4 *)((__global char *)dst + dst_index)) = data; - } -} -#endif diff --git a/modules/ocl/src/opencl/arithm_div.cl b/modules/ocl/src/opencl/arithm_div.cl deleted file mode 100644 index 1dce3853f..000000000 --- a/modules/ocl/src/opencl/arithm_div.cl +++ /dev/null @@ -1,468 +0,0 @@ -/*M/////////////////////////////////////////////////////////////////////////////////////// -// -// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. -// -// By downloading, copying, installing or using the software you agree to this license. -// If you do not agree to this license, do not download, install, -// copy or use the software. -// -// -// License Agreement -// For Open Source Computer Vision Library -// -// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved. -// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved. -// Third party copyrights are property of their respective owners. -// -// @Authors -// Jia Haipeng, jiahaipeng95@gmail.com -// -// Redistribution and use in source and binary forms, with or without modification, -// are permitted provided that the following conditions are met: -// -// * Redistribution's of source code must retain the above copyright notice, -// this list of conditions and the following disclaimer. -// -// * Redistribution's in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other oclMaterials provided with the distribution. -// -// * The name of the copyright holders may not be used to endorse or promote products -// derived from this software without specific prior written permission. -// -// This software is provided by the copyright holders and contributors as is and -// any express or implied warranties, including, but not limited to, the implied -// warranties of merchantability and fitness for a particular purpose are disclaimed. -// In no event shall the Intel Corporation or contributors be liable for any direct, -// indirect, incidental, special, exemplary, or consequential damages -// (including, but not limited to, procurement of substitute goods or services; -// loss of use, data, or profits; or business interruption) however caused -// and on any theory of liability, whether in contract, strict liability, -// or tort (including negligence or otherwise) arising in any way out of -// the use of this software, even if advised of the possibility of such damage. 
-// -//M*/ - -#if defined (DOUBLE_SUPPORT) -#ifdef cl_khr_fp64 -#pragma OPENCL EXTENSION cl_khr_fp64:enable -#elif defined (cl_amd_fp64) -#pragma OPENCL EXTENSION cl_amd_fp64:enable -#endif -typedef double F ; -typedef double4 F4; -#define convert_F4 convert_double4 -#define convert_F double -#else -typedef float F; -typedef float4 F4; -#define convert_F4 convert_float4 -#define convert_F float -#endif - -inline uchar round2_uchar(F v) -{ - return convert_uchar_sat(round(v)); -} - -inline ushort round2_ushort(F v) -{ - return convert_ushort_sat(round(v)); -} - -inline short round2_short(F v) -{ - return convert_short_sat(round(v)); -} - -inline int round2_int(F v) -{ - return convert_int_sat(round(v)); -} -/////////////////////////////////////////////////////////////////////////////////////// -////////////////////////////divide/////////////////////////////////////////////////// -////////////////////////////////////////////////////////////////////////////////////// -/**********************************div*********************************************/ -__kernel void arithm_div_D0 (__global uchar *src1, int src1_step, int src1_offset, - __global uchar *src2, int src2_step, int src2_offset, - __global uchar *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1, F scalar) -{ - int2 coor = (int2)(get_global_id(0), get_global_id(1)); - - if (coor.x < cols && coor.y < rows) - { - coor.x = coor.x << 2; - -#ifdef dst_align -#undef dst_align -#endif -#define dst_align (dst_offset & 3) - int2 src_index = (int2)(mad24(coor.y, src1_step, coor.x + src1_offset - dst_align), - mad24(coor.y, src2_step, coor.x + src2_offset - dst_align)); - - int4 dst_args = (int4)(mad24(coor.y, dst_step, dst_offset), - mad24(coor.y, dst_step, dst_offset + dst_step1), - mad24(coor.y, dst_step, dst_offset + coor.x & (int)0xfffffffc), - 0); - - uchar4 src1_data = vload4(0, src1 + src_index.x); - uchar4 src2_data = vload4(0, src2 + src_index.y); - uchar4 dst_data = *((__global uchar4 *)(dst + dst_args.z)); - - F4 tmp = convert_F4(src1_data) * scalar; - uchar4 tmp_data; - tmp_data.x = ((tmp.x == 0) || (src2_data.x == 0)) ? 0 : round2_uchar(tmp.x / src2_data.x); - tmp_data.y = ((tmp.y == 0) || (src2_data.y == 0)) ? 0 : round2_uchar(tmp.y / src2_data.y); - tmp_data.z = ((tmp.z == 0) || (src2_data.z == 0)) ? 0 : round2_uchar(tmp.z / src2_data.z); - tmp_data.w = ((tmp.w == 0) || (src2_data.w == 0)) ? 0 : round2_uchar(tmp.w / src2_data.w); - - dst_data.x = ((dst_args.z + 0 >= dst_args.x) && (dst_args.z + 0 < dst_args.y)) ? tmp_data.x : dst_data.x; - dst_data.y = ((dst_args.z + 1 >= dst_args.x) && (dst_args.z + 1 < dst_args.y)) ? tmp_data.y : dst_data.y; - dst_data.z = ((dst_args.z + 2 >= dst_args.x) && (dst_args.z + 2 < dst_args.y)) ? tmp_data.z : dst_data.z; - dst_data.w = ((dst_args.z + 3 >= dst_args.x) && (dst_args.z + 3 < dst_args.y)) ? 
tmp_data.w : dst_data.w; - - *((__global uchar4 *)(dst + dst_args.z)) = dst_data; - } -} - -__kernel void arithm_div_D2 (__global ushort *src1, int src1_step, int src1_offset, - __global ushort *src2, int src2_step, int src2_offset, - __global ushort *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1, F scalar) -{ - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 2; - -#ifdef dst_align -#undef dst_align -#endif -#define dst_align ((dst_offset >> 1) & 3) - int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); - int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1)); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffff8); - - ushort4 src1_data = vload4(0, (__global ushort *)((__global char *)src1 + src1_index)); - ushort4 src2_data = vload4(0, (__global ushort *)((__global char *)src2 + src2_index)); - ushort4 dst_data = *((__global ushort4 *)((__global char *)dst + dst_index)); - - F4 tmp = convert_F4(src1_data) * scalar; - - ushort4 tmp_data; - tmp_data.x = ((tmp.x == 0) || (src2_data.x == 0)) ? 0 : round2_ushort(tmp.x / (F)src2_data.x); - tmp_data.y = ((tmp.y == 0) || (src2_data.y == 0)) ? 0 : round2_ushort(tmp.y / (F)src2_data.y); - tmp_data.z = ((tmp.z == 0) || (src2_data.z == 0)) ? 0 : round2_ushort(tmp.z / (F)src2_data.z); - tmp_data.w = ((tmp.w == 0) || (src2_data.w == 0)) ? 0 : round2_ushort(tmp.w / (F)src2_data.w); - - dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x; - dst_data.y = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.y : dst_data.y; - dst_data.z = ((dst_index + 4 >= dst_start) && (dst_index + 4 < dst_end)) ? tmp_data.z : dst_data.z; - dst_data.w = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) ? tmp_data.w : dst_data.w; - - *((__global ushort4 *)((__global char *)dst + dst_index)) = dst_data; - } -} -__kernel void arithm_div_D3 (__global short *src1, int src1_step, int src1_offset, - __global short *src2, int src2_step, int src2_offset, - __global short *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1, F scalar) -{ - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 2; - -#ifdef dst_align -#undef dst_align -#endif -#define dst_align ((dst_offset >> 1) & 3) - int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); - int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1)); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffff8); - - short4 src1_data = vload4(0, (__global short *)((__global char *)src1 + src1_index)); - short4 src2_data = vload4(0, (__global short *)((__global char *)src2 + src2_index)); - short4 dst_data = *((__global short4 *)((__global char *)dst + dst_index)); - - F4 tmp = convert_F4(src1_data) * scalar; - - short4 tmp_data; - tmp_data.x = ((tmp.x == 0) || (src2_data.x == 0)) ? 0 : round2_short(tmp.x / (F)src2_data.x); - tmp_data.y = ((tmp.y == 0) || (src2_data.y == 0)) ? 0 : round2_short(tmp.y / (F)src2_data.y); - tmp_data.z = ((tmp.z == 0) || (src2_data.z == 0)) ? 0 : round2_short(tmp.z / (F)src2_data.z); - tmp_data.w = ((tmp.w == 0) || (src2_data.w == 0)) ? 
0 : round2_short(tmp.w / (F)src2_data.w); - - - dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x; - dst_data.y = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.y : dst_data.y; - dst_data.z = ((dst_index + 4 >= dst_start) && (dst_index + 4 < dst_end)) ? tmp_data.z : dst_data.z; - dst_data.w = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) ? tmp_data.w : dst_data.w; - - *((__global short4 *)((__global char *)dst + dst_index)) = dst_data; - } -} - -__kernel void arithm_div_D4 (__global int *src1, int src1_step, int src1_offset, - __global int *src2, int src2_step, int src2_offset, - __global int *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1, F scalar) -{ - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 2) + src1_offset); - int src2_index = mad24(y, src2_step, (x << 2) + src2_offset); - int dst_index = mad24(y, dst_step, (x << 2) + dst_offset); - - int data1 = *((__global int *)((__global char *)src1 + src1_index)); - int data2 = *((__global int *)((__global char *)src2 + src2_index)); - - F tmp = (convert_F)(data1) * scalar; - int tmp_data = (tmp == 0 || data2 == 0) ? 0 : round2_int(tmp / (convert_F)(data2)); - - *((__global int *)((__global char *)dst + dst_index)) =tmp_data; - } -} - -__kernel void arithm_div_D5 (__global float *src1, int src1_step, int src1_offset, - __global float *src2, int src2_step, int src2_offset, - __global float *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1, F scalar) -{ - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 2) + src1_offset); - int src2_index = mad24(y, src2_step, (x << 2) + src2_offset); - int dst_index = mad24(y, dst_step, (x << 2) + dst_offset); - - float data1 = *((__global float *)((__global char *)src1 + src1_index)); - float data2 = *((__global float *)((__global char *)src2 + src2_index)); - - F tmp = (convert_F)(data1) * scalar; - float tmp_data = (tmp == 0 || data2 == 0) ? 0 : convert_float(tmp / (convert_F)(data2)); - - *((__global float *)((__global char *)dst + dst_index)) = tmp_data; - } -} - -#if defined (DOUBLE_SUPPORT) -__kernel void arithm_div_D6 (__global double *src1, int src1_step, int src1_offset, - __global double *src2, int src2_step, int src2_offset, - __global double *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1, double scalar) -{ - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 3) + src1_offset); - int src2_index = mad24(y, src2_step, (x << 3) + src2_offset); - int dst_index = mad24(y, dst_step, (x << 3) + dst_offset); - - double data1 = *((__global double *)((__global char *)src1 + src1_index)); - double data2 = *((__global double *)((__global char *)src2 + src2_index)); - - double tmp = data1 * scalar; - double tmp_data = (tmp == 0 || data2 == 0) ? 
0 : (tmp / data2); - - *((__global double *)((__global char *)dst + dst_index)) = tmp_data; - } -} -#endif -/************************************div with scalar************************************/ -__kernel void arithm_s_div_D0 (__global uchar *src, int src_step, int src_offset, - __global uchar *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1, F scalar) -{ - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 2; - -#ifdef dst_align -#undef dst_align -#endif -#define dst_align (dst_offset & 3) - int src_index = mad24(y, src_step, x + src_offset - dst_align); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc); - - uchar4 src_data = vload4(0, src + src_index); - uchar4 dst_data = *((__global uchar4 *)(dst + dst_index)); - - uchar4 tmp_data; - tmp_data.x = ((scalar == 0) || (src_data.x == 0)) ? 0 : round2_uchar(scalar / (F)src_data.x); - tmp_data.y = ((scalar == 0) || (src_data.y == 0)) ? 0 : round2_uchar(scalar / (F)src_data.y); - tmp_data.z = ((scalar == 0) || (src_data.z == 0)) ? 0 : round2_uchar(scalar / (F)src_data.z); - tmp_data.w = ((scalar == 0) || (src_data.w == 0)) ? 0 : round2_uchar(scalar / (F)src_data.w); - - dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x; - dst_data.y = ((dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : dst_data.y; - dst_data.z = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : dst_data.z; - dst_data.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? tmp_data.w : dst_data.w; - - *((__global uchar4 *)(dst + dst_index)) = dst_data; - } -} - -__kernel void arithm_s_div_D2 (__global ushort *src, int src_step, int src_offset, - __global ushort *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1, F scalar) -{ - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 2; - -#ifdef dst_align -#undef dst_align -#endif -#define dst_align ((dst_offset >> 1) & 3) - int src_index = mad24(y, src_step, (x << 1) + src_offset - (dst_align << 1)); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffff8); - - ushort4 src_data = vload4(0, (__global ushort *)((__global char *)src + src_index)); - ushort4 dst_data = *((__global ushort4 *)((__global char *)dst + dst_index)); - - ushort4 tmp_data; - tmp_data.x = ((scalar == 0) || (src_data.x == 0)) ? 0 : round2_ushort(scalar / (F)src_data.x); - tmp_data.y = ((scalar == 0) || (src_data.y == 0)) ? 0 : round2_ushort(scalar / (F)src_data.y); - tmp_data.z = ((scalar == 0) || (src_data.z == 0)) ? 0 : round2_ushort(scalar / (F)src_data.z); - tmp_data.w = ((scalar == 0) || (src_data.w == 0)) ? 0 : round2_ushort(scalar / (F)src_data.w); - - dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x; - dst_data.y = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.y : dst_data.y; - dst_data.z = ((dst_index + 4 >= dst_start) && (dst_index + 4 < dst_end)) ? tmp_data.z : dst_data.z; - dst_data.w = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) ? 
tmp_data.w : dst_data.w; - - *((__global ushort4 *)((__global char *)dst + dst_index)) = dst_data; - } -} -__kernel void arithm_s_div_D3 (__global short *src, int src_step, int src_offset, - __global short *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1, F scalar) -{ - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 2; - -#ifdef dst_align -#undef dst_align -#endif -#define dst_align ((dst_offset >> 1) & 3) - int src_index = mad24(y, src_step, (x << 1) + src_offset - (dst_align << 1)); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffff8); - - short4 src_data = vload4(0, (__global short *)((__global char *)src + src_index)); - short4 dst_data = *((__global short4 *)((__global char *)dst + dst_index)); - - short4 tmp_data; - tmp_data.x = ((scalar == 0) || (src_data.x == 0)) ? 0 : round2_short(scalar / (F)src_data.x); - tmp_data.y = ((scalar == 0) || (src_data.y == 0)) ? 0 : round2_short(scalar / (F)src_data.y); - tmp_data.z = ((scalar == 0) || (src_data.z == 0)) ? 0 : round2_short(scalar / (F)src_data.z); - tmp_data.w = ((scalar == 0) || (src_data.w == 0)) ? 0 : round2_short(scalar / (F)src_data.w); - - - dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x; - dst_data.y = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.y : dst_data.y; - dst_data.z = ((dst_index + 4 >= dst_start) && (dst_index + 4 < dst_end)) ? tmp_data.z : dst_data.z; - dst_data.w = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) ? tmp_data.w : dst_data.w; - - *((__global short4 *)((__global char *)dst + dst_index)) = dst_data; - } -} - -__kernel void arithm_s_div_D4 (__global int *src, int src_step, int src_offset, - __global int *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1, F scalar) -{ - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src_index = mad24(y, src_step, (x << 2) + src_offset); - int dst_index = mad24(y, dst_step, (x << 2) + dst_offset); - - int data = *((__global int *)((__global char *)src + src_index)); - - int tmp_data = (scalar == 0 || data == 0) ? 0 : round2_int(scalar / (convert_F)(data)); - - *((__global int *)((__global char *)dst + dst_index)) =tmp_data; - } -} - -__kernel void arithm_s_div_D5 (__global float *src, int src_step, int src_offset, - __global float *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1, F scalar) -{ - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src_index = mad24(y, src_step, (x << 2) + src_offset); - int dst_index = mad24(y, dst_step, (x << 2) + dst_offset); - - float data = *((__global float *)((__global char *)src + src_index)); - - float tmp_data = (scalar == 0 || data == 0) ? 
0 : convert_float(scalar / (convert_F)(data)); - - *((__global float *)((__global char *)dst + dst_index)) = tmp_data; - } -} - -#if defined (DOUBLE_SUPPORT) -__kernel void arithm_s_div_D6 (__global double *src, int src_step, int src_offset, - __global double *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1, double scalar) -{ - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src_index = mad24(y, src_step, (x << 3) + src_offset); - int dst_index = mad24(y, dst_step, (x << 3) + dst_offset); - - double data = *((__global double *)((__global char *)src + src_index)); - - double tmp_data = (scalar == 0 || data == 0) ? 0 : (scalar / data); - - *((__global double *)((__global char *)dst + dst_index)) = tmp_data; - } -} -#endif diff --git a/modules/ocl/src/opencl/arithm_mul.cl b/modules/ocl/src/opencl/arithm_mul.cl deleted file mode 100644 index bfbb5942e..000000000 --- a/modules/ocl/src/opencl/arithm_mul.cl +++ /dev/null @@ -1,303 +0,0 @@ -/*M/////////////////////////////////////////////////////////////////////////////////////// -// -// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. -// -// By downloading, copying, installing or using the software you agree to this license. -// If you do not agree to this license, do not download, install, -// copy or use the software. -// -// -// License Agreement -// For Open Source Computer Vision Library -// -// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved. -// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved. -// Third party copyrights are property of their respective owners. -// -// @Authors -// Jia Haipeng, jiahaipeng95@gmail.com -// -// Redistribution and use in source and binary forms, with or without modification, -// are permitted provided that the following conditions are met: -// -// * Redistribution's of source code must retain the above copyright notice, -// this list of conditions and the following disclaimer. -// -// * Redistribution's in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other GpuMaterials provided with the distribution. -// -// * The name of the copyright holders may not be used to endorse or promote products -// derived from this software without specific prior written permission. -// -// This software is provided by the copyright holders and contributors as is and -// any express or implied warranties, including, but not limited to, the implied -// warranties of merchantability and fitness for a particular purpose are disclaimed. -// In no event shall the Intel Corporation or contributors be liable for any direct, -// indirect, incidental, special, exemplary, or consequential damages -// (including, but not limited to, procurement of substitute goods or services; -// loss of use, data, or profits; or business interruption) however caused -// and on any theory of liability, whether in contract, strict liability, -// or tort (including negligence or otherwise) arising in any way out of -// the use of this software, even if advised of the possibility of such damage. -// -//M*/ - -#if defined (DOUBLE_SUPPORT) -#ifdef cl_khr_fp64 -#pragma OPENCL EXTENSION cl_khr_fp64:enable -#elif defined (cl_amd_fp64) -#pragma OPENCL EXTENSION cl_amd_fp64:enable -#endif -#endif - -int4 round_int4(float4 v) -{ - v.s0 = v.s0 + (v.s0 > 0 ? 0.5 : -0.5); - v.s1 = v.s1 + (v.s1 > 0 ? 
0.5 : -0.5); - v.s2 = v.s2 + (v.s2 > 0 ? 0.5 : -0.5); - v.s3 = v.s3 + (v.s3 > 0 ? 0.5 : -0.5); - - return convert_int4_sat(v); -} -uint4 round_uint4(float4 v) -{ - v.s0 = v.s0 + (v.s0 > 0 ? 0.5 : -0.5); - v.s1 = v.s1 + (v.s1 > 0 ? 0.5 : -0.5); - v.s2 = v.s2 + (v.s2 > 0 ? 0.5 : -0.5); - v.s3 = v.s3 + (v.s3 > 0 ? 0.5 : -0.5); - - return convert_uint4_sat(v); -} -long round_int(float v) -{ - v = v + (v > 0 ? 0.5 : -0.5); - - return convert_int_sat(v); -} -////////////////////////////////////////////////////////////////////////////////////////////////////// -/////////////////////////////////////////////multiply////////////////////////////////////////////////// -/////////////////////////////////////////////////////////////////////////////////////////////////////// -/**************************************add without mask**************************************/ -__kernel void arithm_mul_D0 (__global uchar *src1, int src1_step, int src1_offset, - __global uchar *src2, int src2_step, int src2_offset, - __global uchar *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1, float scalar) -{ - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 2; - -#ifdef dst_align -#undef dst_align -#endif -#define dst_align (dst_offset & 3) - int src1_index = mad24(y, src1_step, x + src1_offset - dst_align); - int src2_index = mad24(y, src2_step, x + src2_offset - dst_align); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc); - - uchar4 src1_data ,src2_data; - - src1_data.x= src1_index+0 >= 0 ? src1[src1_index+0] : 0; - src1_data.y= src1_index+1 >= 0 ? src1[src1_index+1] : 0; - src1_data.z= src1_index+2 >= 0 ? src1[src1_index+2] : 0; - src1_data.w= src1_index+3 >= 0 ? src1[src1_index+3] : 0; - - src2_data.x= src2_index+0 >= 0 ? src2[src2_index+0] : 0; - src2_data.y= src2_index+1 >= 0 ? src2[src2_index+1] : 0; - src2_data.z= src2_index+2 >= 0 ? src2[src2_index+2] : 0; - src2_data.w= src2_index+3 >= 0 ? src2[src2_index+3] : 0; - - uchar4 dst_data = *((__global uchar4 *)(dst + dst_index)); - int4 tmp = convert_int4_sat(src1_data) * convert_int4_sat(src2_data); - tmp = round_int4(convert_float4(tmp) * scalar); - uchar4 tmp_data = convert_uchar4_sat(tmp); - - dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x; - dst_data.y = ((dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : dst_data.y; - dst_data.z = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : dst_data.z; - dst_data.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? 
tmp_data.w : dst_data.w; - - *((__global uchar4 *)(dst + dst_index)) = dst_data; - } -} -__kernel void arithm_mul_D2 (__global ushort *src1, int src1_step, int src1_offset, - __global ushort *src2, int src2_step, int src2_offset, - __global ushort *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1, float scalar) -{ - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 2; - -#ifdef dst_align -#undef dst_align -#endif -#define dst_align ((dst_offset >> 1) & 3) - int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); - int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1)); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffff8); - - ushort4 src1_data = vload4(0, (__global ushort *)((__global char *)src1 + src1_index)); - ushort4 src2_data = vload4(0, (__global ushort *)((__global char *)src2 + src2_index)); - - ushort4 dst_data = *((__global ushort4 *)((__global char *)dst + dst_index)); - uint4 tmp = convert_uint4_sat(src1_data) * convert_uint4_sat(src2_data); - tmp = round_uint4(convert_float4(tmp) * scalar); - ushort4 tmp_data = convert_ushort4_sat(tmp); - - dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x; - dst_data.y = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.y : dst_data.y; - dst_data.z = ((dst_index + 4 >= dst_start) && (dst_index + 4 < dst_end)) ? tmp_data.z : dst_data.z; - dst_data.w = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) ? tmp_data.w : dst_data.w; - - *((__global ushort4 *)((__global char *)dst + dst_index)) = dst_data; - } -} -__kernel void arithm_mul_D3 (__global short *src1, int src1_step, int src1_offset, - __global short *src2, int src2_step, int src2_offset, - __global short *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1, float scalar) -{ - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 2; - -#ifdef dst_align -#undef dst_align -#endif -#define dst_align ((dst_offset >> 1) & 3) - int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); - int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1)); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffff8); - - short4 src1_data = vload4(0, (__global short *)((__global char *)src1 + src1_index)); - short4 src2_data = vload4(0, (__global short *)((__global char *)src2 + src2_index)); - - short4 dst_data = *((__global short4 *)((__global char *)dst + dst_index)); - int4 tmp = convert_int4_sat(src1_data) * convert_int4_sat(src2_data); - tmp = round_int4(convert_float4(tmp) * scalar); - short4 tmp_data = convert_short4_sat(tmp); - - dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x; - dst_data.y = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.y : dst_data.y; - dst_data.z = ((dst_index + 4 >= dst_start) && (dst_index + 4 < dst_end)) ? tmp_data.z : dst_data.z; - dst_data.w = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) ? 
tmp_data.w : dst_data.w; - - *((__global short4 *)((__global char *)dst + dst_index)) = dst_data; - } -} - -__kernel void arithm_mul_D4 (__global int *src1, int src1_step, int src1_offset, - __global int *src2, int src2_step, int src2_offset, - __global int *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1, float scalar) -{ - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 2) + src1_offset); - int src2_index = mad24(y, src2_step, (x << 2) + src2_offset); - int dst_index = mad24(y, dst_step, (x << 2) + dst_offset); - - int data1 = *((__global int *)((__global char *)src1 + src1_index)); - int data2 = *((__global int *)((__global char *)src2 + src2_index)); - int tmp = data1 * data2; - tmp = round_int((float)tmp * scalar); - - *((__global int *)((__global char *)dst + dst_index)) = convert_int_sat(tmp); - } -} -__kernel void arithm_mul_D5 (__global float *src1, int src1_step, int src1_offset, - __global float *src2, int src2_step, int src2_offset, - __global float *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1, float scalar) -{ - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 2) + src1_offset); - int src2_index = mad24(y, src2_step, (x << 2) + src2_offset); - int dst_index = mad24(y, dst_step, (x << 2) + dst_offset); - - float data1 = *((__global float *)((__global char *)src1 + src1_index)); - float data2 = *((__global float *)((__global char *)src2 + src2_index)); - float tmp = data1 * data2; - tmp = tmp * scalar; - - *((__global float *)((__global char *)dst + dst_index)) = tmp; - } -} - -#if defined (DOUBLE_SUPPORT) -__kernel void arithm_mul_D6 (__global double *src1, int src1_step, int src1_offset, - __global double *src2, int src2_step, int src2_offset, - __global double *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1, double scalar) -{ - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 3) + src1_offset); - int src2_index = mad24(y, src2_step, (x << 3) + src2_offset); - int dst_index = mad24(y, dst_step, (x << 3) + dst_offset); - - double data1 = *((__global double *)((__global char *)src1 + src1_index)); - double data2 = *((__global double *)((__global char *)src2 + src2_index)); - - double tmp = data1 * data2; - tmp = tmp * scalar; - - *((__global double *)((__global char *)dst + dst_index)) = tmp; - } -} -#endif - -#ifdef DOUBLE_SUPPORT -#define SCALAR_TYPE double -#else -#define SCALAR_TYPE float -#endif - -__kernel void arithm_muls_D5 (__global float *src1, int src1_step, int src1_offset, - __global float *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1, SCALAR_TYPE scalar) -{ - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 2) + src1_offset); - int dst_index = mad24(y, dst_step, (x << 2) + dst_offset); - - float data1 = *((__global float *)((__global char *)src1 + src1_index)); - float tmp = data1 * scalar; - - *((__global float *)((__global char *)dst + dst_index)) = tmp; - } -} From bd36e556a1ff1d9df8b12a2becec692570ef3cc0 Mon Sep 17 00:00:00 2001 From: Ilya Lavrenov Date: Tue, 24 Sep 2013 13:39:07 +0400 Subject: [PATCH 04/17] removed ocl::magnitudeSqr --- modules/ocl/include/opencv2/ocl/ocl.hpp | 3 - modules/ocl/perf/perf_arithm.cpp | 48 ----- 
modules/ocl/src/arithm.cpp | 88 --------- modules/ocl/src/opencl/arithm_magnitudeSqr.cl | 177 ------------------ modules/ocl/test/test_arithm.cpp | 26 --- 5 files changed, 342 deletions(-) delete mode 100644 modules/ocl/src/opencl/arithm_magnitudeSqr.cl diff --git a/modules/ocl/include/opencv2/ocl/ocl.hpp b/modules/ocl/include/opencv2/ocl/ocl.hpp index 2bfc7db45..d46ad503e 100644 --- a/modules/ocl/include/opencv2/ocl/ocl.hpp +++ b/modules/ocl/include/opencv2/ocl/ocl.hpp @@ -538,9 +538,6 @@ namespace cv //! computes magnitude of each (x(i), y(i)) vector // supports only CV_32F CV_64F type CV_EXPORTS void magnitude(const oclMat &x, const oclMat &y, oclMat &magnitude); - CV_EXPORTS void magnitudeSqr(const oclMat &x, const oclMat &y, oclMat &magnitude); - - CV_EXPORTS void magnitudeSqr(const oclMat &x, oclMat &magnitude); //! computes angle (angle(i)) of each (x(i), y(i)) vector // supports only CV_32F CV_64F type diff --git a/modules/ocl/perf/perf_arithm.cpp b/modules/ocl/perf/perf_arithm.cpp index 814b272f0..d718ed551 100644 --- a/modules/ocl/perf/perf_arithm.cpp +++ b/modules/ocl/perf/perf_arithm.cpp @@ -842,54 +842,6 @@ PERF_TEST_P(PowFixture, pow, OCL_TYPICAL_MAT_SIZES) OCL_PERF_ELSE } -///////////// MagnitudeSqr//////////////////////// - -typedef TestBaseWithParam MagnitudeSqrFixture; - -PERF_TEST_P(MagnitudeSqrFixture, MagnitudeSqr, OCL_TYPICAL_MAT_SIZES) -{ - const Size srcSize = GetParam(); - - Mat src1(srcSize, CV_32FC1), src2(srcSize, CV_32FC1), - dst(srcSize, CV_32FC1); - declare.in(src1, src2, WARMUP_RNG).out(dst); - - if (RUN_OCL_IMPL) - { - ocl::oclMat oclSrc1(src1), oclSrc2(src2), oclDst(srcSize, src1.type()); - - OCL_TEST_CYCLE() cv::ocl::magnitudeSqr(oclSrc1, oclSrc2, oclDst); - - oclDst.download(dst); - - SANITY_CHECK(dst, 1e-6, ERROR_RELATIVE); - } - else if (RUN_PLAIN_IMPL) - { - ASSERT_EQ(1, src1.channels()); - - TEST_CYCLE() - { - for (int y = 0; y < srcSize.height; ++y) - { - const float * const src1Data = reinterpret_cast(src1.data + src1.step * y); - const float * const src2Data = reinterpret_cast(src2.data + src2.step * y); - float * const dstData = reinterpret_cast(dst.data + dst.step * y); - for (int x = 0; x < srcSize.width; ++x) - { - float t0 = src1Data[x] * src1Data[x]; - float t1 = src2Data[x] * src2Data[x]; - dstData[x] = t0 + t1; - } - } - } - - SANITY_CHECK(dst, 1e-6, ERROR_RELATIVE); - } - else - OCL_PERF_ELSE -} - ///////////// AddWeighted//////////////////////// typedef Size_MatType AddWeightedFixture; diff --git a/modules/ocl/src/arithm.cpp b/modules/ocl/src/arithm.cpp index 03c314c7c..5794f1316 100644 --- a/modules/ocl/src/arithm.cpp +++ b/modules/ocl/src/arithm.cpp @@ -84,7 +84,6 @@ namespace cv extern const char *arithm_bitwise_not; extern const char *arithm_compare_eq; extern const char *arithm_compare_ne; - extern const char *arithm_magnitudeSqr; extern const char *arithm_transpose; extern const char *arithm_flip; extern const char *arithm_flip_rc; @@ -1911,93 +1910,6 @@ void cv::ocl::addWeighted(const oclMat &src1, double alpha, const oclMat &src2, openCLExecuteKernel(clCxt, &arithm_addWeighted, "addWeighted", globalThreads, localThreads, args, -1, depth); } -void cv::ocl::magnitudeSqr(const oclMat &src1, const oclMat &src2, oclMat &dst) -{ - CV_Assert(src1.type() == src2.type() && src1.size() == src2.size() && - (src1.depth() == CV_32F )); - - dst.create(src1.size(), src1.type()); - - - Context *clCxt = src1.clCxt; - int channels = dst.oclchannels(); - int depth = dst.depth(); - - - int vector_lengths[4][7] = {{4, 0, 4, 4, 4, 4, 4}, - {4, 0, 4, 
4, 4, 4, 4}, - {4, 0, 4, 4, 4, 4, 4}, - {4, 0, 4, 4, 4, 4, 4} - }; - - - size_t vector_length = vector_lengths[channels - 1][depth]; - int offset_cols = (dst.offset / dst.elemSize1()) & (vector_length - 1); - int cols = divUp(dst.cols * channels + offset_cols, vector_length); - - size_t localThreads[3] = { 256, 1, 1 }; - size_t globalThreads[3] = { cols, dst.rows, 1 }; - - int dst_step1 = dst.cols * dst.elemSize(); - vector > args; - args.push_back( make_pair( sizeof(cl_mem), (void *)&src1.data )); - args.push_back( make_pair( sizeof(cl_int), (void *)&src1.step )); - args.push_back( make_pair( sizeof(cl_int), (void *)&src1.offset)); - args.push_back( make_pair( sizeof(cl_mem), (void *)&src2.data )); - args.push_back( make_pair( sizeof(cl_int), (void *)&src2.step )); - args.push_back( make_pair( sizeof(cl_int), (void *)&src2.offset)); - args.push_back( make_pair( sizeof(cl_mem), (void *)&dst.data )); - args.push_back( make_pair( sizeof(cl_int), (void *)&dst.step )); - args.push_back( make_pair( sizeof(cl_int), (void *)&dst.offset)); - args.push_back( make_pair( sizeof(cl_int), (void *)&src1.rows )); - args.push_back( make_pair( sizeof(cl_int), (void *)&cols )); - args.push_back( make_pair( sizeof(cl_int), (void *)&dst_step1 )); - - openCLExecuteKernel(clCxt, &arithm_magnitudeSqr, "magnitudeSqr", globalThreads, localThreads, args, 1, depth); -} - -void cv::ocl::magnitudeSqr(const oclMat &src1, oclMat &dst) -{ - CV_Assert (src1.depth() == CV_32F ); - CV_Assert(src1.size() == dst.size()); - - dst.create(src1.size(), CV_32FC1); - - - Context *clCxt = src1.clCxt; - int channels = dst.oclchannels(); - int depth = dst.depth(); - - - int vector_lengths[4][7] = {{4, 0, 4, 4, 4, 4, 4}, - {4, 0, 4, 4, 4, 4, 4}, - {4, 0, 4, 4, 4, 4, 4}, - {4, 0, 4, 4, 4, 4, 4} - }; - - - size_t vector_length = vector_lengths[channels - 1][depth]; - int offset_cols = (dst.offset / dst.elemSize1()) & (vector_length - 1); - int cols = divUp(dst.cols * channels + offset_cols, vector_length); - - size_t localThreads[3] = { 256, 1, 1 }; - size_t globalThreads[3] = { cols, dst.rows, 1 }; - - int dst_step1 = dst.cols * dst.elemSize(); - vector > args; - args.push_back( make_pair( sizeof(cl_mem), (void *)&src1.data )); - args.push_back( make_pair( sizeof(cl_int), (void *)&src1.step )); - args.push_back( make_pair( sizeof(cl_int), (void *)&src1.offset)); - args.push_back( make_pair( sizeof(cl_mem), (void *)&dst.data )); - args.push_back( make_pair( sizeof(cl_int), (void *)&dst.step )); - args.push_back( make_pair( sizeof(cl_int), (void *)&dst.offset)); - args.push_back( make_pair( sizeof(cl_int), (void *)&src1.rows )); - args.push_back( make_pair( sizeof(cl_int), (void *)&cols )); - args.push_back( make_pair( sizeof(cl_int), (void *)&dst_step1 )); - - openCLExecuteKernel(clCxt, &arithm_magnitudeSqr, "magnitudeSqr", globalThreads, localThreads, args, 2, depth); -} - static void arithmetic_pow_run(const oclMat &src1, double p, oclMat &dst, string kernelName, const char **kernelString) { CV_Assert(src1.cols == dst.cols && src1.rows == dst.rows); diff --git a/modules/ocl/src/opencl/arithm_magnitudeSqr.cl b/modules/ocl/src/opencl/arithm_magnitudeSqr.cl deleted file mode 100644 index 3fd697ff1..000000000 --- a/modules/ocl/src/opencl/arithm_magnitudeSqr.cl +++ /dev/null @@ -1,177 +0,0 @@ - -/*M/////////////////////////////////////////////////////////////////////////////////////// -// -// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. 
-// -// By downloading, copying, installing or using the software you agree to this license. -// If you do not agree to this license, do not download, install, -// copy or use the software. -// -// -// License Agreement -// For Open Source Computer Vision Library -// -// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved. -// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved. -// Third party copyrights are property of their respective owners. -// -// @Authors -// Jia Haipeng, jiahaipeng95@gmail.com -// -// Redistribution and use in source and binary forms, with or without modification, -// are permitted provided that the following conditions are met: -// -// * Redistribution's of source code must retain the above copyright notice, -// this list of conditions and the following disclaimer. -// -// * Redistribution's in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other oclMaterials provided with the distribution. -// -// * The name of the copyright holders may not be used to endorse or promote products -// derived from this software without specific prior written permission. -// -// This software is provided by the copyright holders and contributors as is and -// any express or implied warranties, including, but not limited to, the implied -// warranties of merchantability and fitness for a particular purpose are disclaimed. -// In no event shall the Intel Corporation or contributors be liable for any direct, -// indirect, incidental, special, exemplary, or consequential damages -// (including, but not limited to, procurement of substitute goods or services; -// loss of use, data, or profits; or business interruption) however caused -// and on any theory of liability, whether in contract, strict liability, -// or tort (including negligence or otherwise) arising in any way out of -// the use of this softwareif advised of the possibility of such damage. -// -//M*/ - -#if defined (DOUBLE_SUPPORT) -#pragma OPENCL EXTENSION cl_khr_fp64:enable -#endif - -////////////////////////////////////////////////////////////////////////////////////////////////////// -/////////////////////////////////////////////magnitudeSqr////////////////////////////////////////////// -/////////////////////////////////////////////////////////////////////////////////////////////////////// -__kernel void magnitudeSqr_C1_D5 (__global float *src1,int src1_step,int src1_offset, - __global float *src2, int src2_step,int src2_offset, - __global float *dst, int dst_step,int dst_offset, - int rows, int cols,int dst_step1) -{ - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - - - { - - x = x << 2; - - #define dst_align ((dst_offset >> 2) & 3) - - int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2)); - int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2)); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + (x << 2) -(dst_align << 2)); - int src1_index_fix = src1_index < 0 ? 0 : src1_index; - int src2_index_fix = src2_index < 0 ? 
0 : src2_index; - float4 src1_data = vload4(0, (__global float *)((__global char *)src1 + src1_index_fix)); - float4 src2_data = vload4(0, (__global float *)((__global char *)src2 + src2_index_fix)); - if(src1_index < 0) - { - float4 tmp; - tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx; - src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw; - } - if(src2_index < 0) - { - float4 tmp; - tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx; - src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw; - } - float4 dst_data = *((__global float4 *)((__global char *)dst + dst_index)); - - float4 tmp_data ; - tmp_data.x = src1_data.x * src1_data.x + src2_data.x * src2_data.x; - - tmp_data.y = src1_data.y * src1_data.y + src2_data.y * src2_data.y; - - tmp_data.z = src1_data.z * src1_data.z + src2_data.z * src2_data.z; - - tmp_data.w = src1_data.w * src1_data.w + src2_data.w * src2_data.w; - - - - - dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x; - dst_data.y = ((dst_index + 4 >= dst_start) && (dst_index + 4 < dst_end)) ? tmp_data.y : dst_data.y; - dst_data.z = ((dst_index + 8 >= dst_start) && (dst_index + 8 < dst_end)) ? tmp_data.z : dst_data.z; - dst_data.w = ((dst_index + 12 >= dst_start) && (dst_index + 12 < dst_end)) ? tmp_data.w : dst_data.w; - - *((__global float4 *)((__global char *)dst + dst_index)) = dst_data; - } - -} - - -#if defined (DOUBLE_SUPPORT) - -__kernel void magnitudeSqr_C2_D5 (__global float *src1,int src1_step,int src1_offset, - __global float *dst, int dst_step,int dst_offset, - int rows, int cols,int dst_step1) -{ - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - - - { - - x = x << 2; - - #define dst_align ((dst_offset >> 2) & 3) - - int src1_index = mad24(y, src1_step, (x << 3) + src1_offset - (dst_align << 3)); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + (x << 2) -(dst_align << 2)); - int src1_index_fix = src1_index < 0 ? 0 : src1_index; - - float8 src1_data = vload8(0, (__global float *)((__global char *)src1 + src1_index_fix)); - - if(src1_index==-6) - src1_data.s01234567 = src1_data.s67012345; - if(src1_index==-4) - src1_data.s01234567 = src1_data.s45670123; - if(src1_index== -2) - src1_data.s01234567 = src1_data.s23456701; - - - - float4 dst_data = *((__global float4 *)((__global char *)dst + dst_index)); - - float4 tmp_data ; - tmp_data.x = src1_data.s0 * src1_data.s0 + src1_data.s1 * src1_data.s1; - - tmp_data.y = src1_data.s2 * src1_data.s2 + src1_data.s3 * src1_data.s3; - - tmp_data.z = src1_data.s4 * src1_data.s4 + src1_data.s5 * src1_data.s5; - - tmp_data.w = src1_data.s6 * src1_data.s6 + src1_data.s7 * src1_data.s7; - - - - - dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x; - dst_data.y = ((dst_index + 4 >= dst_start) && (dst_index + 4 < dst_end)) ? tmp_data.y : dst_data.y; - dst_data.z = ((dst_index + 8 >= dst_start) && (dst_index + 8 < dst_end)) ? tmp_data.z : dst_data.z; - dst_data.w = ((dst_index + 12 >= dst_start) && (dst_index + 12 < dst_end)) ? 
tmp_data.w : dst_data.w; - - *((__global float4 *)((__global char *)dst + dst_index)) = dst_data; - } - -} -#endif diff --git a/modules/ocl/test/test_arithm.cpp b/modules/ocl/test/test_arithm.cpp index fa9d09999..43afd1342 100644 --- a/modules/ocl/test/test_arithm.cpp +++ b/modules/ocl/test/test_arithm.cpp @@ -1159,28 +1159,6 @@ TEST_P(Pow, Mat) } -struct MagnitudeSqr : ArithmTestBase {}; - -TEST_P(MagnitudeSqr, Mat) -{ - for(int j = 0; j < LOOP_TIMES; j++) - { - random_roi(); - for(int i = 0; i < mat1.rows; ++i) - for(int j = 0; j < mat1.cols; ++j) - { - float val1 = mat1.at(i, j); - float val2 = mat2.at(i, j); - ((float *)(dst.data))[i * dst.step / 4 + j] = val1 * val1 + val2 * val2; - } - - cv::ocl::oclMat clmat1(mat1), clmat2(mat2); - cv::ocl::magnitudeSqr(clmat1, clmat2, gdst); - Near(1); - } -} - - struct AddWeighted : ArithmTestBase {}; TEST_P(AddWeighted, Mat) @@ -1302,10 +1280,6 @@ INSTANTIATE_TEST_CASE_P(Arithm, Compare, Combine(Values(CV_8UC1, CV_32SC1, CV_32 INSTANTIATE_TEST_CASE_P(Arithm, Pow, Combine(Values(CV_32FC1, CV_32FC3, CV_32FC4), Values(false))); // Values(false) is the reserved parameter -INSTANTIATE_TEST_CASE_P(Arithm, MagnitudeSqr, Combine( - Values(CV_32FC1, CV_32FC1), - Values(false))); // Values(false) is the reserved parameter - INSTANTIATE_TEST_CASE_P(Arithm, AddWeighted, Combine( Values(CV_8UC1, CV_32SC1, CV_32FC1), Values(false))); // Values(false) is the reserved parameter From 311a7233c2b66822ab5c8367d4a4c258eb782cd7 Mon Sep 17 00:00:00 2001 From: Ilya Lavrenov Date: Tue, 24 Sep 2013 13:43:12 +0400 Subject: [PATCH 05/17] removed comments from filtering.cpp and imgproc.cpp in ocl module --- modules/ocl/src/filtering.cpp | 79 +---------------------------------- modules/ocl/src/imgproc.cpp | 31 +------------- 2 files changed, 4 insertions(+), 106 deletions(-) diff --git a/modules/ocl/src/filtering.cpp b/modules/ocl/src/filtering.cpp index e252d852c..284dc6163 100644 --- a/modules/ocl/src/filtering.cpp +++ b/modules/ocl/src/filtering.cpp @@ -130,7 +130,7 @@ public: { Size src_size = src.size(); - // Delete those two clause below which exist before, However, the result is alos correct + // Delete those two clause below which exist before, However, the result is also correct // dst.create(src_size, src.type()); // dst = Scalar(0.0); @@ -394,23 +394,8 @@ public: { Filter2DEngine_GPU::apply(src, dst); - //if (iters > 1) - //{ - // Size wholesize; - // Point ofs; - // dst.locateROI(wholesize,ofs); - // int rows = dst.rows, cols = dst.cols; - // dst.adjustROI(ofs.y,-ofs.y-rows+dst.wholerows,ofs.x,-ofs.x-cols+dst.wholecols); - // dst.copyTo(morfBuf); - // dst.adjustROI(-ofs.y,ofs.y+rows-dst.wholerows,-ofs.x,ofs.x+cols-dst.wholecols); - // morfBuf.adjustROI(-ofs.y,ofs.y+rows-dst.wholerows,-ofs.x,ofs.x+cols-dst.wholecols); - // //morfBuf.create(src.size(),src.type()); - // //Filter2DEngine_GPU::apply(dst, morfBuf); - // //morfBuf.copyTo(dst); - //} for (int i = 1; i < iters; ++i) { - //dst.swap(morfBuf); Size wholesize; Point ofs; dst.locateROI(wholesize, ofs); @@ -720,24 +705,16 @@ public: virtual void apply(const oclMat &src, oclMat &dst, Rect roi = Rect(0, 0, -1, -1)) { Size src_size = src.size(); - //int src_type = src.type(); int cn = src.oclchannels(); - //dst.create(src_size, src_type); - //dst = Scalar(0.0); - //dstBuf.create(src_size, src_type); dstBuf.create(src_size.height + ksize.height - 1, src_size.width, CV_MAKETYPE(CV_32F, cn)); - //dstBuf = Scalar(0.0); normalizeROI(roi, ksize, anchor, src_size); srcROI = src(roi); dstROI = dst(roi); - //dstBufROI = 
dstBuf(roi); (*rowFilter)(srcROI, dstBuf); - //Mat rm(dstBufROI); - //std::cout << "rm " << rm << endl; (*columnFilter)(dstBuf, dstROI); } @@ -1324,11 +1301,8 @@ void linearColumnFilter_gpu(const oclMat &src, const oclMat &dst, oclMat mat_ker CV_Assert(src.oclchannels() == dst.oclchannels()); CV_Assert(ksize == (anchor << 1) + 1); int src_pix_per_row, dst_pix_per_row; - //int src_offset_x, src_offset_y; int dst_offset_in_pixel; src_pix_per_row = src.step / src.elemSize(); - //src_offset_x = (src.offset % src.step) / src.elemSize(); - //src_offset_y = src.offset / src.step; dst_pix_per_row = dst.step / dst.elemSize(); dst_offset_in_pixel = dst.offset / dst.elemSize(); @@ -1340,8 +1314,6 @@ void linearColumnFilter_gpu(const oclMat &src, const oclMat &dst, oclMat mat_ker args.push_back(make_pair(sizeof(cl_int), (void *)&src.wholecols)); args.push_back(make_pair(sizeof(cl_int), (void *)&src.wholerows)); args.push_back(make_pair(sizeof(cl_int), (void *)&src_pix_per_row)); - //args.push_back(make_pair(sizeof(cl_int),(void*)&src_offset_x)); - //args.push_back(make_pair(sizeof(cl_int),(void*)&src_offset_y)); args.push_back(make_pair(sizeof(cl_int), (void *)&dst_pix_per_row)); args.push_back(make_pair(sizeof(cl_int), (void *)&dst_offset_in_pixel)); args.push_back(make_pair(sizeof(cl_mem), (void *)&mat_kernel.data)); @@ -1360,23 +1332,11 @@ Ptr cv::ocl::getLinearColumnFilter_GPU(int /*bufType*/, in linearColumnFilter_gpu, linearColumnFilter_gpu }; - /* - CV_Assert(dstType == CV_8UC4 || dstType == CV_8SC4 || dstType == CV_16UC2 || - dstType == CV_16SC2 || dstType == CV_32SC1 || dstType == CV_32FC1); - CV_Assert(bufType == CV_8UC4 || bufType == CV_8SC4 || bufType == CV_16UC2 || - bufType == CV_16SC2 || bufType == CV_32SC1 || bufType == CV_32FC1); - Mat temp(columnKernel.size(), CV_32SC1); - columnKernel.convertTo(temp, CV_32SC1); - Mat cont_krnl = temp.reshape(1, 1); - */ Mat temp = columnKernel.reshape(1, 1); oclMat mat_kernel(temp); int ksize = temp.cols; - - //CV_Assert(ksize < 16); - normalizeAnchor(anchor, ksize); return Ptr(new GpuLinearColumnFilter(ksize, anchor, mat_kernel, @@ -1414,11 +1374,8 @@ void cv::ocl::sepFilter2D(const oclMat &src, oclMat &dst, int ddepth, const Mat } if (ddepth < 0) - { ddepth = src.depth(); - } - //CV_Assert(ddepth == src.depth()); dst.create(src.size(), CV_MAKETYPE(ddepth, src.channels())); Ptr f = createSeparableLinearFilter_GPU(src.type(), dst.type(), kernelX, kernelY, anchor, delta, bordertype); @@ -1445,19 +1402,11 @@ void cv::ocl::Sobel(const oclMat &src, oclMat &dst, int ddepth, int dx, int dy, // usually the smoothing part is the slowest to compute, // so try to scale it instead of the faster differenciating part if (dx == 0) - { kx *= scale; - } else - { ky *= scale; - } } - // Mat kx_, ky_; - //ky.convertTo(ky_,CV_32S,1<<8); - //kx.convertTo(kx_,CV_32S,1<<8); - sepFilter2D(src, dst, ddepth, kx, ky, Point(-1, -1), delta, borderType); } @@ -1471,19 +1420,11 @@ void cv::ocl::Scharr(const oclMat &src, oclMat &dst, int ddepth, int dx, int dy, // usually the smoothing part is the slowest to compute, // so try to scale it instead of the faster differenciating part if (dx == 0) - { kx *= scale; - } else - { ky *= scale; - } } - // Mat kx_, ky_; - //ky.convertTo(ky_,CV_32S,1<<8); - //kx.convertTo(kx_,CV_32S,1<<8); - sepFilter2D(src, dst, ddepth, kx, ky, Point(-1, -1), delta, bordertype); } @@ -1505,9 +1446,7 @@ void cv::ocl::Laplacian(const oclMat &src, oclMat &dst, int ddepth, int ksize, d Mat kernel(3, 3, CV_32S, (void *)K[ksize == 3]); if (scale != 1) - { kernel 
*= scale; - } filter2D(src, dst, ddepth, kernel, Point(-1, -1)); } @@ -1526,14 +1465,10 @@ Ptr cv::ocl::createGaussianFilter_GPU(int type, Size ksize, do // automatic detection of kernel size from sigma if (ksize.width <= 0 && sigma1 > 0) - { ksize.width = cvRound(sigma1 * (depth == CV_8U ? 3 : 4) * 2 + 1) | 1; - } if (ksize.height <= 0 && sigma2 > 0) - { ksize.height = cvRound(sigma2 * (depth == CV_8U ? 3 : 4) * 2 + 1) | 1; - } CV_Assert(ksize.width > 0 && ksize.width % 2 == 1 && ksize.height > 0 && ksize.height % 2 == 1); @@ -1544,17 +1479,10 @@ Ptr cv::ocl::createGaussianFilter_GPU(int type, Size ksize, do Mat ky; if (ksize.height == ksize.width && std::abs(sigma1 - sigma2) < DBL_EPSILON) - { ky = kx; - } else - { ky = getGaussianKernel(ksize.height, sigma2, std::max(depth, CV_32F)); - } - //Mat kx_, ky_; - //kx.convertTo(kx_,CV_32S,1<<8); - //ky.convertTo(ky_,CV_32S,1<<8); return createSeparableLinearFilter_GPU(type, type, kx, ky, Point(-1, -1), 0.0, bordertype); } @@ -1585,14 +1513,10 @@ void cv::ocl::GaussianBlur(const oclMat &src, oclMat &dst, Size ksize, double si if (bordertype != BORDER_CONSTANT) { if (src.rows == 1) - { ksize.height = 1; - } if (src.cols == 1) - { ksize.width = 1; - } } Ptr f = createGaussianFilter_GPU(src.type(), ksize, sigma1, sigma2, bordertype); @@ -1618,6 +1542,7 @@ void cv::ocl::adaptiveBilateralFilter(const oclMat& src, oclMat& dst, Size ksize { lut.at(idx++) = sigma2 / (sigma2 + x * x + y * y); } + oclMat dlut(lut); int depth = src.depth(); int cn = src.oclchannels(); diff --git a/modules/ocl/src/imgproc.cpp b/modules/ocl/src/imgproc.cpp index 2ed786fe4..7d0d941df 100644 --- a/modules/ocl/src/imgproc.cpp +++ b/modules/ocl/src/imgproc.cpp @@ -244,9 +244,6 @@ namespace cv kernelName = "remapNNF1Constant"; } - //int channels = dst.oclchannels(); - //int depth = dst.depth(); - //int type = src.type(); size_t blkSizeX = 16, blkSizeY = 16; size_t glbSizeX; int cols = dst.cols; @@ -499,21 +496,13 @@ namespace cv openCLExecuteKernel(clCxt, &imgproc_median, kernelName, globalThreads, localThreads, args, src.oclchannels(), src.depth()); } else - { CV_Error(CV_StsUnsupportedFormat, "Non-supported filter length"); - //string kernelName = "medianFilter"; - //args.push_back( make_pair( sizeof(cl_int),(void*)&m)); - - //openCLExecuteKernel(clCxt,&imgproc_median,kernelName,globalThreads,localThreads,args,src.oclchannels(),-1); - } - } //////////////////////////////////////////////////////////////////////// // copyMakeBorder void copyMakeBorder(const oclMat &src, oclMat &dst, int top, int bottom, int left, int right, int bordertype, const Scalar &scalar) { - //CV_Assert(src.oclchannels() != 2); CV_Assert(top >= 0 && bottom >= 0 && left >= 0 && right >= 0); if((dst.cols != dst.wholecols) || (dst.rows != dst.wholerows)) //has roi { @@ -529,10 +518,12 @@ namespace cv { CV_Assert((src.cols >= left) && (src.cols >= right) && (src.rows >= top) && (src.rows >= bottom)); } + if(bordertype == cv::BORDER_REFLECT_101) { CV_Assert((src.cols > left) && (src.cols > right) && (src.rows > top) && (src.rows > bottom)); } + dst.create(src.rows + top + bottom, src.cols + left + right, src.type()); int srcStep = src.step1() / src.oclchannels(); int dstStep = dst.step1() / dst.oclchannels(); @@ -732,19 +723,6 @@ namespace cv } openCLExecuteKernel(src.clCxt, &imgproc_copymakeboder, kernelName, globalThreads, localThreads, args, -1, -1, compile_option); - //uchar* cputemp=new uchar[32*dst.wholerows]; - ////int* cpudata=new int[this->step*this->wholerows/sizeof(int)]; - 
//openCLSafeCall(clEnqueueReadBuffer(src.clCxt->impl->clCmdQueue, (cl_mem)dst.data, CL_TRUE, - // 0, 32*dst.wholerows, cputemp, 0, NULL, NULL)); - //for(int i=0;isupportsFeature(Context::CL_DOUBLE)) - // { - // CV_Error( CV_GpuNotSupported, "Selected device doesn't support double, so a deviation exists.\nIf the accuracy is acceptable, the error can be ignored.\n"); - // } - dst.create( src.size(), CV_8UC4 ); if( !(criteria.type & TermCriteria::MAX_ITER) ) From b20bd470fe97f4e058d114e160c7f95969282531 Mon Sep 17 00:00:00 2001 From: Ilya Lavrenov Date: Tue, 24 Sep 2013 13:49:38 +0400 Subject: [PATCH 06/17] refactored and extended ocl::LUT --- modules/ocl/src/arithm.cpp | 121 +++++------------- modules/ocl/src/opencl/arithm_LUT.cl | 175 +++++++++------------------ 2 files changed, 91 insertions(+), 205 deletions(-) diff --git a/modules/ocl/src/arithm.cpp b/modules/ocl/src/arithm.cpp index 5794f1316..66180ba4d 100644 --- a/modules/ocl/src/arithm.cpp +++ b/modules/ocl/src/arithm.cpp @@ -793,100 +793,45 @@ void cv::ocl::flip(const oclMat &src, oclMat &dst, int flipCode) ////////////////////////////////////////////////////////////////////////////// ////////////////////////////////// LUT ////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////// -static void arithmetic_lut_run(const oclMat &src1, const oclMat &src2, oclMat &dst, string kernelName) -{ - Context *clCxt = src1.clCxt; - int channels = src1.oclchannels(); - int rows = src1.rows; - int cols = src1.cols; - //int step = src1.step; - int src_step = src1.step / src1.elemSize(); - int dst_step = dst.step / dst.elemSize(); - int whole_rows = src1.wholerows; - int whole_cols = src1.wholecols; - int src_offset = src1.offset / src1.elemSize(); - int dst_offset = dst.offset / dst.elemSize(); - int lut_offset = src2.offset / src2.elemSize(); - int left_col = 0, right_col = 0; - size_t localSize[] = {16, 16, 1}; - //cl_kernel kernel = openCLGetKernelFromSource(clCxt,&arithm_LUT,kernelName); - size_t globalSize[] = {(cols + localSize[0] - 1) / localSize[0] *localSize[0], (rows + localSize[1] - 1) / localSize[1] *localSize[1], 1}; - if(channels == 1 && cols > 6) - { - left_col = 4 - (dst_offset & 3); - left_col &= 3; - dst_offset += left_col; - src_offset += left_col; - cols -= left_col; - right_col = cols & 3; - cols -= right_col; - globalSize[0] = (cols / 4 + localSize[0] - 1) / localSize[0] * localSize[0]; - } - else if(channels == 1) - { - left_col = cols; - right_col = 0; - cols = 0; - globalSize[0] = 0; - } - CV_Assert(clCxt == dst.clCxt); - CV_Assert(src1.cols == dst.cols); - CV_Assert(src1.rows == dst.rows); - CV_Assert(src1.oclchannels() == dst.oclchannels()); - // CV_Assert(src1.step == dst.step); - vector > args; - if(globalSize[0] != 0) - { - args.push_back( make_pair( sizeof(cl_mem), (void *)&dst.data )); - args.push_back( make_pair( sizeof(cl_mem), (void *)&src1.data )); - args.push_back( make_pair( sizeof(cl_mem), (void *)&src2.data )); - args.push_back( make_pair( sizeof(cl_int), (void *)&rows )); - args.push_back( make_pair( sizeof(cl_int), (void *)&cols )); - args.push_back( make_pair( sizeof(cl_int), (void *)&channels )); - args.push_back( make_pair( sizeof(cl_int), (void *)&whole_rows )); - args.push_back( make_pair( sizeof(cl_int), (void *)&whole_cols )); - args.push_back( make_pair( sizeof(cl_int), (void *)&src_offset )); - args.push_back( make_pair( sizeof(cl_int), (void *)&dst_offset )); - args.push_back( make_pair( sizeof(cl_int), (void *)&lut_offset )); - 
args.push_back( make_pair( sizeof(cl_int), (void *)&src_step )); - args.push_back( make_pair( sizeof(cl_int), (void *)&dst_step )); - openCLExecuteKernel(clCxt, &arithm_LUT, kernelName, globalSize, localSize, args, src1.oclchannels(), src1.depth()); - } - if(channels == 1 && (left_col != 0 || right_col != 0)) - { - src_offset = src1.offset; - dst_offset = dst.offset; - localSize[0] = 1; - localSize[1] = 256; - globalSize[0] = left_col + right_col; - globalSize[1] = (rows + localSize[1] - 1) / localSize[1] * localSize[1]; - //kernel = openCLGetKernelFromSource(clCxt,&arithm_LUT,"LUT2"); - args.clear(); - args.push_back( make_pair( sizeof(cl_mem), (void *)&dst.data )); - args.push_back( make_pair( sizeof(cl_mem), (void *)&src1.data )); - args.push_back( make_pair( sizeof(cl_mem), (void *)&src2.data )); - args.push_back( make_pair( sizeof(cl_int), (void *)&rows )); - args.push_back( make_pair( sizeof(cl_int), (void *)&left_col )); - args.push_back( make_pair( sizeof(cl_int), (void *)&channels )); - args.push_back( make_pair( sizeof(cl_int), (void *)&whole_rows )); - args.push_back( make_pair( sizeof(cl_int), (void *)&cols )); - args.push_back( make_pair( sizeof(cl_int), (void *)&src_offset )); - args.push_back( make_pair( sizeof(cl_int), (void *)&dst_offset )); - args.push_back( make_pair( sizeof(cl_int), (void *)&lut_offset )); - args.push_back( make_pair( sizeof(cl_int), (void *)&src_step )); - args.push_back( make_pair( sizeof(cl_int), (void *)&dst_step )); - openCLExecuteKernel(clCxt, &arithm_LUT, "LUT2", globalSize, localSize, args, src1.oclchannels(), src1.depth()); - } +static void arithmetic_lut_run(const oclMat &src, const oclMat &lut, oclMat &dst, string kernelName) +{ + Context *clCxt = src.clCxt; + int sdepth = src.depth(); + int src_step1 = src.step1(), dst_step1 = dst.step1(); + int src_offset1 = src.offset / src.elemSize1(), dst_offset1 = dst.offset / dst.elemSize1(); + int lut_offset1 = lut.offset / lut.elemSize1() + (sdepth == CV_8U ? 0 : 128) * lut.channels(); + int cols1 = src.cols * src.oclchannels(); + + size_t localSize[] = { 16, 16, 1 }; + size_t globalSize[] = { lut.channels() == 1 ? 
cols1 : src.cols, src.rows, 1 }; + + const char * const typeMap[] = { "uchar", "char", "ushort", "short", "int", "float", "double" }; + std::string buildOptions = format("-D srcT=%s -D dstT=%s", typeMap[sdepth], typeMap[dst.depth()]); + + vector > args; + args.push_back( make_pair( sizeof(cl_mem), (void *)&src.data )); + args.push_back( make_pair( sizeof(cl_mem), (void *)&lut.data )); + args.push_back( make_pair( sizeof(cl_mem), (void *)&dst.data )); + args.push_back( make_pair( sizeof(cl_int), (void *)&cols1)); + args.push_back( make_pair( sizeof(cl_int), (void *)&src.rows )); + args.push_back( make_pair( sizeof(cl_int), (void *)&src_offset1 )); + args.push_back( make_pair( sizeof(cl_int), (void *)&lut_offset1 )); + args.push_back( make_pair( sizeof(cl_int), (void *)&dst_offset1 )); + args.push_back( make_pair( sizeof(cl_int), (void *)&src_step1 )); + args.push_back( make_pair( sizeof(cl_int), (void *)&dst_step1 )); + + openCLExecuteKernel(clCxt, &arithm_LUT, kernelName, globalSize, localSize, + args, lut.oclchannels(), -1, buildOptions.c_str()); } void cv::ocl::LUT(const oclMat &src, const oclMat &lut, oclMat &dst) { - int cn = src.channels(); - CV_Assert(src.depth() == CV_8U); - CV_Assert((lut.oclchannels() == 1 || lut.oclchannels() == cn) && lut.rows == 1 && lut.cols == 256); + int cn = src.channels(), depth = src.depth(); + CV_Assert(depth == CV_8U || depth == CV_8S); + CV_Assert(lut.channels() == 1 || lut.channels() == src.channels()); + CV_Assert(lut.rows == 1 && lut.cols == 256); dst.create(src.size(), CV_MAKETYPE(lut.depth(), cn)); - //oclMat _lut(lut); string kernelName = "LUT"; arithmetic_lut_run(src, lut, dst, kernelName); } diff --git a/modules/ocl/src/opencl/arithm_LUT.cl b/modules/ocl/src/opencl/arithm_LUT.cl index 624da0008..ff21e9a31 100644 --- a/modules/ocl/src/opencl/arithm_LUT.cl +++ b/modules/ocl/src/opencl/arithm_LUT.cl @@ -38,125 +38,66 @@ #pragma OPENCL EXTENSION cl_khr_fp64:enable #endif -__kernel -void LUT_C1_D0( __global uchar *dst, - __global const uchar *src, - __constant uchar *table, - int rows, - int cols, - int channels, - int whole_rows, - int whole_cols, - int src_offset, - int dst_offset, - int lut_offset, - int src_step, - int dst_step) +__kernel void LUT_C1( __global const srcT * src, __global const dstT *lut, + __global dstT *dst, + int cols1, int rows, + int src_offset1, + int lut_offset1, + int dst_offset1, + int src_step1, int dst_step1) { - int gidx = get_global_id(0)<<2; - int gidy = get_global_id(1); - int lidx = get_local_id(0); - int lidy = get_local_id(1); + int x1 = get_global_id(0); + int y = get_global_id(1); - __local uchar l[256]; - l[(lidy<<4)+lidx] = table[(lidy<<4)+lidx+lut_offset]; - //mem_fence(CLK_LOCAL_MEM_FENCE); - - - //clamp(gidx,mask,cols-1); - gidx = gidx >= cols-4?cols-4:gidx; - gidy = gidy >= rows?rows-1:gidy; - - int src_index = src_offset + mad24(gidy,src_step,gidx); - int dst_index = dst_offset + mad24(gidy,dst_step,gidx); - uchar4 p,q; - barrier(CLK_LOCAL_MEM_FENCE); - p.x = src[src_index]; - p.y = src[src_index+1]; - p.z = src[src_index+2]; - p.w = src[src_index+3]; - - q.x = l[p.x]; - q.y = l[p.y]; - q.z = l[p.z]; - q.w = l[p.w]; - *(__global uchar4*)(dst + dst_index) = q; -} - -__kernel -void LUT2_C1_D0( __global uchar *dst, - __global const uchar *src, - __constant uchar *table, - int rows, - int precols, - int channels, - int whole_rows, - int cols, - int src_offset, - int dst_offset, - int lut_offset, - int src_step, - int dst_step) -{ - int gidx = get_global_id(0); - int gidy = get_global_id(1); - //int lidx = 
get_local_id(0); - int lidy = get_local_id(1); - - __local uchar l[256]; - l[lidy] = table[lidy+lut_offset]; - //mem_fence(CLK_LOCAL_MEM_FENCE); - - - //clamp(gidx,mask,cols-1); - gidx = gidx >= precols ? cols+gidx : gidx; - gidy = gidy >= rows?rows-1:gidy; - - int src_index = src_offset + mad24(gidy,src_step,gidx); - int dst_index = dst_offset + mad24(gidy,dst_step,gidx); - //uchar4 p,q; - barrier(CLK_LOCAL_MEM_FENCE); - uchar p = src[src_index]; - uchar q = l[p]; - dst[dst_index] = q; -} - -__kernel -void LUT_C4_D0( __global uchar4 *dst, - __global uchar4 *src, - __constant uchar *table, - int rows, - int cols, - int channels, - int whole_rows, - int whole_cols, - int src_offset, - int dst_offset, - int lut_offset, - int src_step, - int dst_step) -{ - int gidx = get_global_id(0); - int gidy = get_global_id(1); - - int lidx = get_local_id(0); - int lidy = get_local_id(1); - - int src_index = mad24(gidy,src_step,gidx+src_offset); - int dst_index = mad24(gidy,dst_step,gidx+dst_offset); - __local uchar l[256]; - l[lidy*16+lidx] = table[lidy*16+lidx+lut_offset]; - //mem_fence(CLK_LOCAL_MEM_FENCE); - barrier(CLK_LOCAL_MEM_FENCE); - - if(gidx Date: Tue, 24 Sep 2013 13:51:37 +0400 Subject: [PATCH 07/17] refactored and extended ocl::addWeighted --- modules/ocl/src/arithm.cpp | 78 ++-- modules/ocl/src/opencl/arithm_addWeighted.cl | 384 +------------------ 2 files changed, 53 insertions(+), 409 deletions(-) diff --git a/modules/ocl/src/arithm.cpp b/modules/ocl/src/arithm.cpp index 66180ba4d..8441d8e4b 100644 --- a/modules/ocl/src/arithm.cpp +++ b/modules/ocl/src/arithm.cpp @@ -1795,64 +1795,66 @@ void cv::ocl::transpose(const oclMat &src, oclMat &dst) void cv::ocl::addWeighted(const oclMat &src1, double alpha, const oclMat &src2, double beta, double gama, oclMat &dst) { - dst.create(src1.size(), src1.type()); - CV_Assert(src1.cols == src2.cols && src2.cols == dst.cols && - src1.rows == src2.rows && src2.rows == dst.rows); - CV_Assert(src1.type() == src2.type() && src1.type() == dst.type()); - Context *clCxt = src1.clCxt; + bool hasDouble = clCxt->supportsFeature(Context::CL_DOUBLE); + if (!hasDouble && src1.depth() == CV_64F) + { + CV_Error(CV_GpuNotSupported, "Selected device doesn't support double\r\n"); + return; + } + + CV_Assert(src1.size() == src2.size() && src1.type() == src2.type()); + dst.create(src1.size(), src1.type()); + int channels = dst.oclchannels(); int depth = dst.depth(); + int cols1 = src1.cols * channels; + int src1step1 = src1.step1(), src1offset1 = src1.offset / src1.elemSize1(); + int src2step1 = src2.step1(), src2offset1 = src2.offset / src1.elemSize1(); + int dststep1 = dst.step1(), dstoffset1 = dst.offset / dst.elemSize1(); - int vector_lengths[4][7] = {{4, 0, 4, 4, 4, 4, 4}, - {4, 0, 4, 4, 4, 4, 4}, - {4, 0, 4, 4, 4, 4, 4}, - {4, 0, 4, 4, 4, 4, 4} - }; - - - size_t vector_length = vector_lengths[channels - 1][depth]; - int offset_cols = (dst.offset / dst.elemSize1()) & (vector_length - 1); - int cols = divUp(dst.cols * channels + offset_cols, vector_length); + const char * const typeMap[] = { "uchar", "char", "ushort", "short", "int", "float", "double" }; + std::string buildOptions = format("-D T=%s -D WT=%s -D convertToT=convert_%s%s", + typeMap[depth], hasDouble ? "double" : "float", typeMap[depth], + depth >= CV_32F ? 
"" : "_sat_rte"); size_t localThreads[3] = { 256, 1, 1 }; - size_t globalThreads[3] = { cols, dst.rows, 1}; + size_t globalThreads[3] = { cols1, dst.rows, 1}; + + float alpha_f = static_cast(alpha), + beta_f = static_cast(beta), + gama_f = static_cast(gama); - int dst_step1 = dst.cols * dst.elemSize(); - int src1_step = (int) src1.step; - int src2_step = (int) src2.step; - int dst_step = (int) dst.step; - float alpha_f = alpha, beta_f = beta, gama_f = gama; vector > args; args.push_back( make_pair( sizeof(cl_mem), (void *)&src1.data )); - args.push_back( make_pair( sizeof(cl_int), (void *)&src1_step )); - args.push_back( make_pair( sizeof(cl_int), (void *)&src1.offset)); + args.push_back( make_pair( sizeof(cl_int), (void *)&src1step1 )); + args.push_back( make_pair( sizeof(cl_int), (void *)&src1offset1)); args.push_back( make_pair( sizeof(cl_mem), (void *)&src2.data )); - args.push_back( make_pair( sizeof(cl_int), (void *)&src2_step )); - args.push_back( make_pair( sizeof(cl_int), (void *)&src2.offset)); + args.push_back( make_pair( sizeof(cl_int), (void *)&src2step1 )); + args.push_back( make_pair( sizeof(cl_int), (void *)&src2offset1)); + args.push_back( make_pair( sizeof(cl_mem), (void *)&dst.data )); + args.push_back( make_pair( sizeof(cl_int), (void *)&dststep1 )); + args.push_back( make_pair( sizeof(cl_int), (void *)&dstoffset1)); - if(src1.clCxt->supportsFeature(Context::CL_DOUBLE)) - { - args.push_back( make_pair( sizeof(cl_double), (void *)&alpha )); - args.push_back( make_pair( sizeof(cl_double), (void *)&beta )); - args.push_back( make_pair( sizeof(cl_double), (void *)&gama )); - } - else + if (!hasDouble) { args.push_back( make_pair( sizeof(cl_float), (void *)&alpha_f )); args.push_back( make_pair( sizeof(cl_float), (void *)&beta_f )); args.push_back( make_pair( sizeof(cl_float), (void *)&gama_f )); } + else + { + args.push_back( make_pair( sizeof(cl_double), (void *)&alpha )); + args.push_back( make_pair( sizeof(cl_double), (void *)&beta )); + args.push_back( make_pair( sizeof(cl_double), (void *)&gama )); + } - args.push_back( make_pair( sizeof(cl_mem), (void *)&dst.data )); - args.push_back( make_pair( sizeof(cl_int), (void *)&dst_step )); - args.push_back( make_pair( sizeof(cl_int), (void *)&dst.offset)); + args.push_back( make_pair( sizeof(cl_int), (void *)&cols1 )); args.push_back( make_pair( sizeof(cl_int), (void *)&src1.rows )); - args.push_back( make_pair( sizeof(cl_int), (void *)&cols )); - args.push_back( make_pair( sizeof(cl_int), (void *)&dst_step1 )); - openCLExecuteKernel(clCxt, &arithm_addWeighted, "addWeighted", globalThreads, localThreads, args, -1, depth); + openCLExecuteKernel(clCxt, &arithm_addWeighted, "addWeighted", globalThreads, localThreads, + args, -1, -1, buildOptions.c_str()); } static void arithmetic_pow_run(const oclMat &src1, double p, oclMat &dst, string kernelName, const char **kernelString) diff --git a/modules/ocl/src/opencl/arithm_addWeighted.cl b/modules/ocl/src/opencl/arithm_addWeighted.cl index e7ed28928..159a970db 100644 --- a/modules/ocl/src/opencl/arithm_addWeighted.cl +++ b/modules/ocl/src/opencl/arithm_addWeighted.cl @@ -42,392 +42,34 @@ // the use of this software, even if advised of the possibility of such damage. 
// //M*/ + #if defined (DOUBLE_SUPPORT) #ifdef cl_khr_fp64 #pragma OPENCL EXTENSION cl_khr_fp64:enable #elif defined (cl_amd_fp64) #pragma OPENCL EXTENSION cl_amd_fp64:enable #endif -typedef double F; -#else -typedef float F; #endif + ////////////////////////////////////////////////////////////////////////////////////////////////////// /////////////////////////////////////////////addWeighted////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////////////////////////////////////// -__kernel void addWeighted_D0 (__global uchar *src1,int src1_step,int src1_offset, - __global uchar *src2, int src2_step,int src2_offset, - F alpha,F beta,F gama, - __global uchar *dst, int dst_step,int dst_offset, - int rows, int cols,int dst_step1) + +__kernel void addWeighted(__global T * src1, int src1_step1, int src1_offset1, + __global T * src2, int src2_step1, int src2_offset1, + __global T * dst, int dst_step1, int dst_offset1, + WT alpha, WT beta, WT gama, + int cols1, int rows) { int x = get_global_id(0); int y = get_global_id(1); - if (x < cols && y < rows) - + if (x < cols1 && y < rows) { + int src1_index = mad24(y, src1_step1, x + src1_offset1); + int src2_index = mad24(y, src2_step1, x + src2_offset1); + int dst_index = mad24(y, dst_step1, x + dst_offset1); - x = x << 2; -#ifdef dst_align -#undef dst_align -#endif -#define dst_align (dst_offset & 3) - int src1_index = mad24(y, src1_step, x + src1_offset - dst_align); - int src2_index = mad24(y, src2_step, x + src2_offset - dst_align); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc); - - uchar4 src1_data ,src2_data; - - src1_data.x= src1_index+0 >= 0 ? src1[src1_index+0] : 0; - src1_data.y= src1_index+1 >= 0 ? src1[src1_index+1] : 0; - src1_data.z= src1_index+2 >= 0 ? src1[src1_index+2] : 0; - src1_data.w= src1_index+3 >= 0 ? src1[src1_index+3] : 0; - - src2_data.x= src2_index+0 >= 0 ? src2[src2_index+0] : 0; - src2_data.y= src2_index+1 >= 0 ? src2[src2_index+1] : 0; - src2_data.z= src2_index+2 >= 0 ? src2[src2_index+2] : 0; - src2_data.w= src2_index+3 >= 0 ? src2[src2_index+3] : 0; - - uchar4 dst_data = *((__global uchar4 *)(dst + dst_index)); -// short4 tmp = convert_short4_sat(src1_data) * alpha + convert_short4_sat(src2_data) * beta + gama; - short4 tmp; - tmp.x = src1_data.x * alpha + src2_data.x * beta + gama; - tmp.y = src1_data.y * alpha + src2_data.y * beta + gama; - tmp.z = src1_data.z * alpha + src2_data.z * beta + gama; - tmp.w = src1_data.w * alpha + src2_data.w * beta + gama; - uchar4 tmp_data = convert_uchar4_sat(tmp); - - dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x; - dst_data.y = ((dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : dst_data.y; - dst_data.z = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : dst_data.z; - dst_data.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? 
tmp_data.w : dst_data.w; - - *((__global uchar4 *)(dst + dst_index)) = dst_data; - // dst[x + y * dst_step] = src1[x + y * src1_step] * alpha + src2[x + y * src2_step] * beta + gama; + dst[dst_index] = convertToT(src1[src1_index]*alpha + src2[src2_index]*beta + gama); } - } - - - -__kernel void addWeighted_D2 (__global ushort *src1, int src1_step,int src1_offset, - __global ushort *src2, int src2_step,int src2_offset, - F alpha,F beta,F gama, - __global ushort *dst, int dst_step,int dst_offset, - int rows, int cols,int dst_step1) -{ - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - - { - - x = x << 2; - -#ifdef dst_align -#undef dst_align -#endif -#define dst_align ((dst_offset >> 1) & 3) - int src1_index = mad24(y, src1_step, (x << 1) + src1_offset -( dst_align << 1)); - int src2_index = mad24(y, src2_step, (x << 1) + src2_offset -( dst_align << 1)); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset +( x<< 1) & (int)0xfffffff8); - int src1_index_fix = src1_index < 0 ? 0 : src1_index; - int src2_index_fix = src2_index < 0 ? 0 : src2_index; - ushort4 src1_data = vload4(0, (__global ushort *)((__global char *)src1 + src1_index_fix)); - ushort4 src2_data = vload4(0, (__global ushort *)((__global char *)src2 + src2_index_fix)); - if(src1_index < 0) - { - ushort4 tmp; - tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx; - src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw; - } - if(src2_index < 0) - { - ushort4 tmp; - tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx; - src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw; - } - - - ushort4 dst_data = *((__global ushort4 *)((__global char *)dst + dst_index)); - // int4 tmp = convert_int4_sat(src1_data) * alpha + convert_int4_sat(src2_data) * beta + gama; - int4 tmp; - tmp.x = src1_data.x * alpha + src2_data.x * beta + gama; - tmp.y = src1_data.y * alpha + src2_data.y * beta + gama; - tmp.z = src1_data.z * alpha + src2_data.z * beta + gama; - tmp.w = src1_data.w * alpha + src2_data.w * beta + gama; - ushort4 tmp_data = convert_ushort4_sat(tmp); - dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x; - dst_data.y = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.y : dst_data.y; - dst_data.z = ((dst_index + 4 >= dst_start) && (dst_index + 4 < dst_end)) ? tmp_data.z : dst_data.z; - dst_data.w = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) ? 
tmp_data.w : dst_data.w; - - *((__global ushort4 *)((__global char *)dst + dst_index)) = dst_data; - } - - -} - - -__kernel void addWeighted_D3 (__global short *src1, int src1_step,int src1_offset, - __global short *src2, int src2_step,int src2_offset, - F alpha,F beta,F gama, - __global short *dst, int dst_step,int dst_offset, - int rows, int cols,int dst_step1) -{ - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - - { - - x = x << 2; - -#ifdef dst_align -#undef dst_align -#endif -#define dst_align ((dst_offset >> 1) & 3) - int src1_index = mad24(y, src1_step, (x << 1) + src1_offset -( dst_align << 1)); - int src2_index = mad24(y, src2_step, (x << 1) + src2_offset -( dst_align << 1)); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset +( x<< 1) - (dst_align << 1 )); - - int src1_index_fix = src1_index < 0 ? 0 : src1_index; - int src2_index_fix = src2_index < 0 ? 0 : src2_index; - short4 src1_data = vload4(0, (__global short *)((__global char *)src1 + src1_index_fix)); - short4 src2_data = vload4(0, (__global short *)((__global char *)src2 + src2_index_fix)); - - if(src1_index < 0) - { - short4 tmp; - tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx; - src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw; - } - if(src2_index < 0) - { - short4 tmp; - tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx; - src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw; - } - short4 dst_data = *((__global short4 *)((__global char *)dst + dst_index)); - // int4 tmp = convert_int4_sat(src1_data) * alpha + convert_int4_sat(src2_data) * beta + gama; - int4 tmp; - tmp.x = src1_data.x * alpha + src2_data.x * beta + gama; - tmp.y = src1_data.y * alpha + src2_data.y * beta + gama; - tmp.z = src1_data.z * alpha + src2_data.z * beta + gama; - tmp.w = src1_data.w * alpha + src2_data.w * beta + gama; - short4 tmp_data = convert_short4_sat(tmp); - dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x; - dst_data.y = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.y : dst_data.y; - dst_data.z = ((dst_index + 4 >= dst_start) && (dst_index + 4 < dst_end)) ? tmp_data.z : dst_data.z; - dst_data.w = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) ? tmp_data.w : dst_data.w; - - *((__global short4 *)((__global char *)dst + dst_index)) = dst_data; - } - -} - - -__kernel void addWeighted_D4 (__global int *src1, int src1_step,int src1_offset, - __global int *src2, int src2_step,int src2_offset, - F alpha,F beta, F gama, - __global int *dst, int dst_step,int dst_offset, - int rows, int cols,int dst_step1) -{ - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - - { - - x = x << 2; - -#define bitOfInt (sizeof(int)== 4 ? 2: 3) - -#ifdef dst_align -#undef dst_align -#endif -#define dst_align ((dst_offset >> bitOfInt) & 3) - - int src1_index = mad24(y, src1_step, (x << bitOfInt) + src1_offset - (dst_align << bitOfInt)); - int src2_index = mad24(y, src2_step, (x << bitOfInt) + src2_offset - (dst_align << bitOfInt)); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + (x << bitOfInt) -(dst_align << bitOfInt)); - - int src1_index_fix = src1_index < 0 ? 0 : src1_index; - int src2_index_fix = src2_index < 0 ? 
0 : src2_index; - int4 src1_data = vload4(0, (__global int *)((__global char *)src1 + src1_index_fix)); - int4 src2_data = vload4(0, (__global int *)((__global char *)src2 + src2_index_fix)); - - if(src1_index < 0) - { - int4 tmp; - tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx; - src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw; - } - if(src2_index < 0) - { - int4 tmp; - tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx; - src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw; - } - int4 dst_data = *((__global int4 *)((__global char *)dst + dst_index)); - // double4 tmp = convert_double4(src1_data) * alpha + convert_double4(src2_data) * beta + gama ; - float4 tmp; - tmp.x = src1_data.x * alpha + src2_data.x * beta + gama; - tmp.y = src1_data.y * alpha + src2_data.y * beta + gama; - tmp.z = src1_data.z * alpha + src2_data.z * beta + gama; - tmp.w = src1_data.w * alpha + src2_data.w * beta + gama; - int4 tmp_data = convert_int4_sat(tmp); - - dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x; - dst_data.y = ((dst_index + 4 >= dst_start) && (dst_index + 4 < dst_end)) ? tmp_data.y : dst_data.y; - dst_data.z = ((dst_index + 8 >= dst_start) && (dst_index + 8 < dst_end)) ? tmp_data.z : dst_data.z; - dst_data.w = ((dst_index + 12 >= dst_start) && (dst_index + 12 < dst_end)) ? tmp_data.w : dst_data.w; - - *((__global int4 *)((__global char *)dst + dst_index)) = dst_data; - } - -} - - -__kernel void addWeighted_D5 (__global float *src1,int src1_step,int src1_offset, - __global float *src2, int src2_step,int src2_offset, - F alpha,F beta, F gama, - __global float *dst, int dst_step,int dst_offset, - int rows, int cols,int dst_step1) -{ - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - - { - - x = x << 2; - -#ifdef dst_align -#undef dst_align -#endif -#define dst_align ((dst_offset >> 2) & 3) - - int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2)); - int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2)); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + (x << 2) -(dst_align << 2)); - - int src1_index_fix = src1_index < 0 ? 0 : src1_index; - int src2_index_fix = src2_index < 0 ? 0 : src2_index; - float4 src1_data = vload4(0, (__global float *)((__global char *)src1 + src1_index_fix)); - float4 src2_data = vload4(0, (__global float *)((__global char *)src2 + src2_index_fix)); - float4 dst_data = *((__global float4 *)((__global char *)dst + dst_index)); - if(src1_index < 0) - { - float4 tmp; - tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx; - src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw; - } - if(src2_index < 0) - { - float4 tmp; - tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx; - src2_data.xyzw = (src2_index == -1) ? 
src2_data.wxyz:tmp.xyzw; - } - // double4 tmp = convert_double4(src1_data) * alpha + convert_double4(src2_data) * beta + gama ; - - // float4 tmp_data =(src1_data) * alpha + (src2_data) * beta + gama ; - float4 tmp_data; - tmp_data.x = src1_data.x * alpha + src2_data.x * beta + gama; - tmp_data.y = src1_data.y * alpha + src2_data.y * beta + gama; - tmp_data.z = src1_data.z * alpha + src2_data.z * beta + gama; - tmp_data.w = src1_data.w * alpha + src2_data.w * beta + gama; - // float4 tmp_data = convert_float4(tmp); - - dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x; - dst_data.y = ((dst_index + 4 >= dst_start) && (dst_index + 4 < dst_end)) ? tmp_data.y : dst_data.y; - dst_data.z = ((dst_index + 8 >= dst_start) && (dst_index + 8 < dst_end)) ? tmp_data.z : dst_data.z; - dst_data.w = ((dst_index + 12 >= dst_start) && (dst_index + 12 < dst_end)) ? tmp_data.w : dst_data.w; - - *((__global float4 *)((__global char *)dst + dst_index)) = dst_data; - } - -} - -#if defined (DOUBLE_SUPPORT) -__kernel void addWeighted_D6 (__global double *src1, int src1_step,int src1_offset, - __global double *src2, int src2_step,int src2_offset, - F alpha,F beta, F gama, - __global double *dst, int dst_step,int dst_offset, - int rows, int cols,int dst_step1) -{ - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - - { - - x = x << 2; - -#ifdef dst_align -#undef dst_align -#endif -#define dst_align ((dst_offset >> 3) & 3) - - int src1_index = mad24(y, src1_step, (x << 3) + src1_offset - (dst_align << 3)); - int src2_index = mad24(y, src2_step, (x << 3) + src2_offset - (dst_align << 3)); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + (x << 3) -(dst_align << 3)); - - int src1_index_fix = src1_index < 0 ? 0 : src1_index; - int src2_index_fix = src2_index < 0 ? 0 : src2_index; - double4 src1_data = vload4(0, (__global double *)((__global char *)src1 + src1_index_fix)); - double4 src2_data = vload4(0, (__global double *)((__global char *)src2 + src2_index_fix)); - double4 dst_data = *((__global double4 *)((__global char *)dst + dst_index)); - if(src1_index < 0) - { - double4 tmp; - tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx; - src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw; - } - if(src2_index < 0) - { - double4 tmp; - tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx; - src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw; - } - // double4 tmp_data = (src1_data) * alpha + (src2_data) * beta + gama ; - double4 tmp_data; - tmp_data.x = src1_data.x * alpha + src2_data.x * beta + gama; - tmp_data.y = src1_data.y * alpha + src2_data.y * beta + gama; - tmp_data.z = src1_data.z * alpha + src2_data.z * beta + gama; - tmp_data.w = src1_data.w * alpha + src2_data.w * beta + gama; - - dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x; - dst_data.y = ((dst_index + 8 >= dst_start) && (dst_index + 8 < dst_end)) ? tmp_data.y : dst_data.y; - dst_data.z = ((dst_index + 16 >= dst_start) && (dst_index + 16 < dst_end)) ? tmp_data.z : dst_data.z; - dst_data.w = ((dst_index + 24 >= dst_start) && (dst_index + 24 < dst_end)) ? 
tmp_data.w : dst_data.w; - - *((__global double4 *)((__global char *)dst + dst_index)) = dst_data; - } - -} -#endif From 8e0e352d778e25f3e6a851d15f985ff133ad1597 Mon Sep 17 00:00:00 2001 From: Ilya Lavrenov Date: Tue, 24 Sep 2013 13:54:46 +0400 Subject: [PATCH 08/17] refactored and extended binary bitwise operations --- modules/ocl/src/arithm.cpp | 347 ++------ .../ocl/src/opencl/arithm_bitwise_binary.cl | 298 +------ .../src/opencl/arithm_bitwise_binary_mask.cl | 766 +----------------- .../opencl/arithm_bitwise_binary_scalar.cl | 592 +------------- .../arithm_bitwise_binary_scalar_mask.cl | 694 +--------------- 5 files changed, 119 insertions(+), 2578 deletions(-) diff --git a/modules/ocl/src/arithm.cpp b/modules/ocl/src/arithm.cpp index 8441d8e4b..4f6737e82 100644 --- a/modules/ocl/src/arithm.cpp +++ b/modules/ocl/src/arithm.cpp @@ -1290,7 +1290,8 @@ int cv::ocl::countNonZero(const oclMat &src) ////////////////////////////////////////////////////////////////////////////// ////////////////////////////////bitwise_op//////////////////////////////////// ////////////////////////////////////////////////////////////////////////////// -static void bitwise_run(const oclMat &src1, oclMat &dst, string kernelName, const char **kernelString) + +static void bitwise_unary_run(const oclMat &src1, oclMat &dst, string kernelName, const char **kernelString) { dst.create(src1.size(), src1.type()); @@ -1327,331 +1328,123 @@ static void bitwise_run(const oclMat &src1, oclMat &dst, string kernelName, cons openCLExecuteKernel(clCxt, kernelString, kernelName, globalThreads, localThreads, args, -1, depth); } +enum { AND = 0, OR, XOR }; -template -void bitwise_run(const oclMat &src1, const oclMat &src2, oclMat &dst, string kernelName, - const char **kernelString, void *_scalar, const char* _opt = NULL) +static void bitwise_binary_run(const oclMat &src1, const oclMat &src2, const Scalar& src3, const oclMat &mask, + oclMat &dst, int operationType) { - dst.create(src1.size(), src1.type()); - CV_Assert(src1.cols == src2.cols && src2.cols == dst.cols && - src1.rows == src2.rows && src2.rows == dst.rows); - - CV_Assert(src1.type() == src2.type() && src1.type() == dst.type()); - Context *clCxt = src1.clCxt; - int channels = dst.oclchannels(); - int depth = dst.depth(); + if (!clCxt->supportsFeature(Context::CL_DOUBLE) && src1.depth() == CV_64F) + { + cout << "Selected device does not support double" << endl; + return; + } - int vector_lengths[4][7] = {{4, 4, 4, 4, 1, 1, 1}, - {4, 4, 4, 4, 1, 1, 1}, - {4, 4, 4, 4, 1, 1, 1}, - {4, 4, 4, 4, 1, 1, 1} - }; + CV_Assert(operationType >= AND && operationType <= XOR); + CV_Assert(src2.empty() || (!src2.empty() && src1.type() == src2.type() && src1.size() == src2.size())); + CV_Assert(mask.empty() || (!mask.empty() && mask.type() == CV_8UC1 && mask.size() == src1.size())); - size_t vector_length = vector_lengths[channels - 1][depth]; - int offset_cols = (dst.offset / dst.elemSize1()) & (vector_length - 1); - int cols = divUp(dst.cols * channels + offset_cols, vector_length); + dst.create(src1.size(), src1.type()); - size_t localThreads[3] = { 64, 4, 1 }; - size_t globalThreads[3] = { cols, dst.rows, 1 }; + int elemSize = dst.elemSize(); + int cols1 = dst.cols * elemSize; + oclMat m; + + const char operationMap[] = { '&', '|', '^' }; + std::string kernelName("arithm_bitwise_binary"); + std::string buildOptions = format("-D Operation=%c", operationMap[operationType]); + + size_t localThreads[3] = { 16, 16, 1 }; + size_t globalThreads[3] = { cols1, dst.rows, 1 }; - int dst_step1 = 
dst.cols * dst.elemSize(); vector > args; args.push_back( make_pair( sizeof(cl_mem), (void *)&src1.data )); args.push_back( make_pair( sizeof(cl_int), (void *)&src1.step )); args.push_back( make_pair( sizeof(cl_int), (void *)&src1.offset )); - args.push_back( make_pair( sizeof(cl_mem), (void *)&src2.data )); - args.push_back( make_pair( sizeof(cl_int), (void *)&src2.step )); - args.push_back( make_pair( sizeof(cl_int), (void *)&src2.offset )); + + if (src2.empty()) + { + m.create(1, 1, dst.type()); + m.setTo(src3); + + args.push_back( make_pair( sizeof(cl_mem), (void *)&m.data )); + args.push_back( make_pair( sizeof(cl_int), (void *)&elemSize ) ); + + kernelName += "_scalar"; + } + else + { + args.push_back( make_pair( sizeof(cl_mem), (void *)&src2.data )); + args.push_back( make_pair( sizeof(cl_int), (void *)&src2.step )); + args.push_back( make_pair( sizeof(cl_int), (void *)&src2.offset )); + } + + if (!mask.empty()) + { + args.push_back( make_pair( sizeof(cl_mem), (void *)&mask.data )); + args.push_back( make_pair( sizeof(cl_int), (void *)&mask.step )); + args.push_back( make_pair( sizeof(cl_int), (void *)&mask.offset )); + + if (!src2.empty()) + args.push_back( make_pair( sizeof(cl_int), (void *)&elemSize )); + + kernelName += "_mask"; + } + args.push_back( make_pair( sizeof(cl_mem), (void *)&dst.data )); args.push_back( make_pair( sizeof(cl_int), (void *)&dst.step )); args.push_back( make_pair( sizeof(cl_int), (void *)&dst.offset )); + + args.push_back( make_pair( sizeof(cl_int), (void *)&cols1 )); args.push_back( make_pair( sizeof(cl_int), (void *)&src1.rows )); - args.push_back( make_pair( sizeof(cl_int), (void *)&cols )); - args.push_back( make_pair( sizeof(cl_int), (void *)&dst_step1 )); - T scalar; - if(_scalar != NULL) - { - double scalar1 = *((double *)_scalar); - scalar = (T)scalar1; - args.push_back( make_pair( sizeof(T), (void *)&scalar )); - } - - openCLExecuteKernel(clCxt, kernelString, kernelName, globalThreads, localThreads, args, -1, depth, _opt); -} -static void bitwise_run(const oclMat &src1, const oclMat &src2, oclMat &dst, - string kernelName, const char **kernelString, const char* _opt = NULL) -{ - bitwise_run(src1, src2, dst, kernelName, kernelString, (void *)NULL, _opt); -} -static void bitwise_run(const oclMat &src1, const oclMat &src2, oclMat &dst, - const oclMat &mask, string kernelName, const char **kernelString, const char* _opt = NULL) -{ - dst.create(src1.size(), src1.type()); - CV_Assert(src1.cols == src2.cols && src2.cols == dst.cols && - src1.rows == src2.rows && src2.rows == dst.rows && - src1.rows == mask.rows && src1.cols == mask.cols); - - CV_Assert(src1.type() == src2.type() && src1.type() == dst.type()); - CV_Assert(mask.type() == CV_8U); - - Context *clCxt = src1.clCxt; - int channels = dst.oclchannels(); - int depth = dst.depth(); - - int vector_lengths[4][7] = {{4, 4, 2, 2, 1, 1, 1}, - {2, 2, 1, 1, 1, 1, 1}, - {4, 4, 2, 2 , 1, 1, 1}, - {1, 1, 1, 1, 1, 1, 1} - }; - - size_t vector_length = vector_lengths[channels - 1][depth]; - int offset_cols = ((dst.offset % dst.step) / dst.elemSize()) & (vector_length - 1); - int cols = divUp(dst.cols + offset_cols, vector_length); - - size_t localThreads[3] = { 64, 4, 1 }; - size_t globalThreads[3] = { cols, dst.rows, 1 }; - - int dst_step1 = dst.cols * dst.elemSize(); - vector > args; - args.push_back( make_pair( sizeof(cl_mem), (void *)&src1.data )); - args.push_back( make_pair( sizeof(cl_int), (void *)&src1.step )); - args.push_back( make_pair( sizeof(cl_int), (void *)&src1.offset )); - args.push_back( 
make_pair( sizeof(cl_mem), (void *)&src2.data )); - args.push_back( make_pair( sizeof(cl_int), (void *)&src2.step )); - args.push_back( make_pair( sizeof(cl_int), (void *)&src2.offset )); - args.push_back( make_pair( sizeof(cl_mem), (void *)&mask.data )); - args.push_back( make_pair( sizeof(cl_int), (void *)&mask.step )); - args.push_back( make_pair( sizeof(cl_int), (void *)&mask.offset )); - args.push_back( make_pair( sizeof(cl_mem), (void *)&dst.data )); - args.push_back( make_pair( sizeof(cl_int), (void *)&dst.step )); - args.push_back( make_pair( sizeof(cl_int), (void *)&dst.offset )); - args.push_back( make_pair( sizeof(cl_int), (void *)&src1.rows )); - args.push_back( make_pair( sizeof(cl_int), (void *)&cols )); - args.push_back( make_pair( sizeof(cl_int), (void *)&dst_step1 )); - - openCLExecuteKernel(clCxt, kernelString, kernelName, globalThreads, localThreads, args, channels, depth, _opt); -} - - -template -void bitwise_scalar_run(const oclMat &src1, const Scalar &src2, oclMat &dst, - const oclMat &mask, string kernelName, const char **kernelString, int isMatSubScalar, const char* opt = NULL) -{ - dst.create(src1.size(), src1.type()); - - CV_Assert(src1.cols == dst.cols && src1.rows == dst.rows && - src1.type() == dst.type()); - - - if(mask.data) - { - CV_Assert(mask.type() == CV_8U && src1.rows == mask.rows && src1.cols == mask.cols); - } - - Context *clCxt = src1.clCxt; - int channels = dst.oclchannels(); - int depth = dst.depth(); - - WT s[4] = { saturate_cast(src2.val[0]), saturate_cast(src2.val[1]), - saturate_cast(src2.val[2]), saturate_cast(src2.val[3]) - }; - - int vector_lengths[4][7] = {{4, 4, 2, 2, 1, 1, 1}, - {2, 2, 1, 1, 1, 1, 1}, - {4, 4, 2, 2 , 1, 1, 1}, - {1, 1, 1, 1, 1, 1, 1} - }; - - size_t vector_length = vector_lengths[channels - 1][depth]; - int offset_cols = ((dst.offset % dst.step) / dst.elemSize()) & (vector_length - 1); - int cols = divUp(dst.cols + offset_cols, vector_length); - - size_t localThreads[3] = { 64, 4, 1 }; - size_t globalThreads[3] = { cols, dst.rows, 1 }; - - int dst_step1 = dst.cols * dst.elemSize(); - vector > args; - args.push_back( make_pair( sizeof(cl_mem) , (void *)&src1.data )); - args.push_back( make_pair( sizeof(cl_int) , (void *)&src1.step )); - args.push_back( make_pair( sizeof(cl_int) , (void *)&src1.offset)); - args.push_back( make_pair( sizeof(cl_mem) , (void *)&dst.data )); - args.push_back( make_pair( sizeof(cl_int) , (void *)&dst.step )); - args.push_back( make_pair( sizeof(cl_int) , (void *)&dst.offset)); - - if(mask.data) - { - args.push_back( make_pair( sizeof(cl_mem) , (void *)&mask.data )); - args.push_back( make_pair( sizeof(cl_int) , (void *)&mask.step )); - args.push_back( make_pair( sizeof(cl_int) , (void *)&mask.offset)); - } - args.push_back( make_pair( sizeof(CL_WT) , (void *)&s )); - args.push_back( make_pair( sizeof(cl_int) , (void *)&src1.rows )); - args.push_back( make_pair( sizeof(cl_int) , (void *)&cols )); - args.push_back( make_pair( sizeof(cl_int) , (void *)&dst_step1 )); - if(isMatSubScalar != 0) - { - isMatSubScalar = isMatSubScalar > 0 ? 
1 : 0; - args.push_back( make_pair( sizeof(cl_int) , (void *)&isMatSubScalar)); - } - - openCLExecuteKernel(clCxt, kernelString, kernelName, globalThreads, localThreads, args, channels, depth, opt); -} - - -typedef void (*BitwiseFuncS)(const oclMat &src1, const Scalar &src2, oclMat &dst, - const oclMat &mask, string kernelName, const char **kernelString, int isMatSubScalar, const char* opt); - - -static void bitwise_scalar(const oclMat &src1, const Scalar &src2, oclMat &dst, - const oclMat &mask, string kernelName, const char **kernelString, int isMatSubScalar, const char* opt) -{ - static BitwiseFuncS tab[8] = - { -#if 0 - bitwise_scalar_run, - bitwise_scalar_run, - bitwise_scalar_run, - bitwise_scalar_run, - bitwise_scalar_run, - bitwise_scalar_run, - bitwise_scalar_run, - 0 -#else - - bitwise_scalar_run, - bitwise_scalar_run, - bitwise_scalar_run, - bitwise_scalar_run, - bitwise_scalar_run, - bitwise_scalar_run, - bitwise_scalar_run, - 0 -#endif - }; - BitwiseFuncS func = tab[src1.depth()]; - if(func == 0) - cv::ocl::error("Unsupported arithmetic operation", __FILE__, __LINE__); - func(src1, src2, dst, mask, kernelName, kernelString, isMatSubScalar, opt); -} -static void bitwise_scalar(const oclMat &src1, const Scalar &src2, oclMat &dst, - const oclMat &mask, string kernelName, const char **kernelString, const char * opt = NULL) -{ - bitwise_scalar(src1, src2, dst, mask, kernelName, kernelString, 0, opt); + openCLExecuteKernel(clCxt, mask.empty() ? (!src2.empty() ? &arithm_bitwise_binary : &arithm_bitwise_binary_scalar) : + (!src2.empty() ? &arithm_bitwise_binary_mask : &arithm_bitwise_binary_scalar_mask), + kernelName, globalThreads, localThreads, + args, -1, -1, buildOptions.c_str()); } void cv::ocl::bitwise_not(const oclMat &src, oclMat &dst) { - if(!src.clCxt->supportsFeature(Context::CL_DOUBLE) && src.type() == CV_64F) + if (!src.clCxt->supportsFeature(Context::CL_DOUBLE) && src.type() == CV_64F) { - cout << "Selected device do not support double" << endl; + cout << "Selected device does not support double" << endl; return; } + dst.create(src.size(), src.type()); string kernelName = "arithm_bitwise_not"; - bitwise_run(src, dst, kernelName, &arithm_bitwise_not); + bitwise_unary_run(src, dst, kernelName, &arithm_bitwise_not); } void cv::ocl::bitwise_or(const oclMat &src1, const oclMat &src2, oclMat &dst, const oclMat &mask) { - // dst.create(src1.size(),src1.type()); - if(!src1.clCxt->supportsFeature(Context::CL_DOUBLE) && src1.type() == CV_64F) - { - cout << "Selected device do not support double" << endl; - return; - } - - string kernelName = mask.empty() ? "arithm_bitwise_binary" : "arithm_bitwise_binary_with_mask"; - static const char opt [] = "-D OP_BINARY=|"; - if (mask.empty()) - bitwise_run(src1, src2, dst, kernelName, &arithm_bitwise_binary, opt); - else - bitwise_run(src1, src2, dst, mask, kernelName, &arithm_bitwise_binary_mask, opt); + bitwise_binary_run(src1, src2, Scalar(), mask, dst, OR); } - void cv::ocl::bitwise_or(const oclMat &src1, const Scalar &src2, oclMat &dst, const oclMat &mask) { - if(!src1.clCxt->supportsFeature(Context::CL_DOUBLE) && src1.type() == CV_64F) - { - cout << "Selected device do not support double" << endl; - return; - } - static const char opt [] = "-D OP_BINARY=|"; - string kernelName = mask.data ? 
"arithm_s_bitwise_binary_with_mask" : "arithm_s_bitwise_binary"; - if (mask.data) - bitwise_scalar( src1, src2, dst, mask, kernelName, &arithm_bitwise_binary_scalar_mask, opt); - else - bitwise_scalar( src1, src2, dst, mask, kernelName, &arithm_bitwise_binary_scalar, opt); + bitwise_binary_run(src1, oclMat(), src2, mask, dst, OR); } void cv::ocl::bitwise_and(const oclMat &src1, const oclMat &src2, oclMat &dst, const oclMat &mask) { - // dst.create(src1.size(),src1.type()); - if(!src1.clCxt->supportsFeature(Context::CL_DOUBLE) && src1.type() == CV_64F) - { - cout << "Selected device do not support double" << endl; - return; - } - oclMat emptyMat; - - string kernelName = mask.empty() ? "arithm_bitwise_binary" : "arithm_bitwise_binary_with_mask"; - - static const char opt [] = "-D OP_BINARY=&"; - if (mask.empty()) - bitwise_run(src1, src2, dst, kernelName, &arithm_bitwise_binary, opt); - else - bitwise_run(src1, src2, dst, mask, kernelName, &arithm_bitwise_binary_mask, opt); + bitwise_binary_run(src1, src2, Scalar(), mask, dst, AND); } void cv::ocl::bitwise_and(const oclMat &src1, const Scalar &src2, oclMat &dst, const oclMat &mask) { - if(!src1.clCxt->supportsFeature(Context::CL_DOUBLE) && src1.type() == CV_64F) - { - cout << "Selected device do not support double" << endl; - return; - } - static const char opt [] = "-D OP_BINARY=&"; - string kernelName = mask.data ? "arithm_s_bitwise_binary_with_mask" : "arithm_s_bitwise_binary"; - if (mask.data) - bitwise_scalar(src1, src2, dst, mask, kernelName, &arithm_bitwise_binary_scalar_mask, opt); - else - bitwise_scalar(src1, src2, dst, mask, kernelName, &arithm_bitwise_binary_scalar, opt); + bitwise_binary_run(src1, oclMat(), src2, mask, dst, AND); } void cv::ocl::bitwise_xor(const oclMat &src1, const oclMat &src2, oclMat &dst, const oclMat &mask) { - if(!src1.clCxt->supportsFeature(Context::CL_DOUBLE) && src1.type() == CV_64F) - { - cout << "Selected device do not support double" << endl; - return; - } - string kernelName = mask.empty() ? "arithm_bitwise_binary" : "arithm_bitwise_binary_with_mask"; - - static const char opt [] = "-D OP_BINARY=^"; - - if (mask.empty()) - bitwise_run(src1, src2, dst, kernelName, &arithm_bitwise_binary, opt); - else - bitwise_run(src1, src2, dst, mask, kernelName, &arithm_bitwise_binary_mask, opt); + bitwise_binary_run(src1, src2, Scalar(), mask, dst, XOR); } - void cv::ocl::bitwise_xor(const oclMat &src1, const Scalar &src2, oclMat &dst, const oclMat &mask) { - - if(!src1.clCxt->supportsFeature(Context::CL_DOUBLE) && src1.type() == CV_64F) - { - cout << "Selected device do not support double" << endl; - return; - } - string kernelName = mask.data ? "arithm_s_bitwise_binary_with_mask" : "arithm_s_bitwise_binary"; - static const char opt [] = "-D OP_BINARY=^"; - if (mask.data) - bitwise_scalar( src1, src2, dst, mask, kernelName, &arithm_bitwise_binary_scalar_mask, opt); - else - bitwise_scalar( src1, src2, dst, mask, kernelName, &arithm_bitwise_binary_scalar, opt); + bitwise_binary_run(src1, oclMat(), src2, mask, dst, XOR); } oclMat cv::ocl::operator ~ (const oclMat &src) diff --git a/modules/ocl/src/opencl/arithm_bitwise_binary.cl b/modules/ocl/src/opencl/arithm_bitwise_binary.cl index 8bdd23c17..898b40a9e 100644 --- a/modules/ocl/src/opencl/arithm_bitwise_binary.cl +++ b/modules/ocl/src/opencl/arithm_bitwise_binary.cl @@ -43,303 +43,25 @@ // the use of this software, even if advised of the possibility of such damage. 
// //M*/ -#if defined (DOUBLE_SUPPORT) -#ifdef cl_khr_fp64 -#pragma OPENCL EXTENSION cl_khr_fp64:enable -#elif defined (cl_amd_fp64) -#pragma OPENCL EXTENSION cl_amd_fp64:enable -#endif -#endif - -//bitwise_binary without mask for and, or, xor operators ///////////////////////////////////////////////////////////////////////////////////////////////////// -////////////////////////////////////////////bitwise_binary/////////////////////////////////////////// +/////////////////////////////////////////// bitwise_binary ////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////////////////////////// -#ifndef OP_BINARY -#define OP_BINARY & -#endif - -__kernel void arithm_bitwise_binary_D0 (__global uchar *src1, int src1_step, int src1_offset, - __global uchar *src2, int src2_step, int src2_offset, - __global uchar *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) +__kernel void arithm_bitwise_binary(__global uchar * src1, int src1_step, int src1_offset, + __global uchar * src2, int src2_step, int src2_offset, + __global uchar * dst, int dst_step, int dst_offset, + int cols1, int rows) { int x = get_global_id(0); int y = get_global_id(1); - if (x < cols && y < rows) + if (x < cols1 && y < rows) { - x = x << 2; + int src1_index = mad24(y, src1_step, x + src1_offset); + int src2_index = mad24(y, src2_step, x + src2_offset); + int dst_index = mad24(y, dst_step, dst_offset + x); -#ifdef dst_align -#undef dst_align -#endif -#define dst_align (dst_offset & 3) - int src1_index = mad24(y, src1_step, x + src1_offset - dst_align); - int src2_index = mad24(y, src2_step, x + src2_offset - dst_align); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc); - int src1_index_fix = src1_index < 0 ? 0 : src1_index; - int src2_index_fix = src2_index < 0 ? 0 : src2_index; - uchar4 src1_data = vload4(0, src1 + src1_index_fix); - uchar4 src2_data = vload4(0, src2 + src2_index_fix); - - if(src1_index < 0) - { - uchar4 tmp; - tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx; - src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw; - } - if(src2_index < 0) - { - uchar4 tmp; - tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx; - src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw; - } - - uchar4 dst_data = *((__global uchar4 *)(dst + dst_index)); - uchar4 tmp_data = src1_data OP_BINARY src2_data; - - dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x; - dst_data.y = ((dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : dst_data.y; - dst_data.z = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : dst_data.z; - dst_data.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? 
tmp_data.w : dst_data.w; - - *((__global uchar4 *)(dst + dst_index)) = dst_data; + dst[dst_index] = src1[src1_index] Operation src2[src2_index]; } } - - -__kernel void arithm_bitwise_binary_D1 (__global char *src1, int src1_step, int src1_offset, - __global char *src2, int src2_step, int src2_offset, - __global char *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 2; - -#ifdef dst_align -#undef dst_align -#endif -#define dst_align (dst_offset & 3) - int src1_index = mad24(y, src1_step, x + src1_offset - dst_align); - int src2_index = mad24(y, src2_step, x + src2_offset - dst_align); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc); - - int src1_index_fix = src1_index < 0 ? 0 : src1_index; - int src2_index_fix = src2_index < 0 ? 0 : src2_index; - char4 src1_data = vload4(0, src1 + src1_index_fix); - char4 src2_data = vload4(0, src2 + src2_index_fix); - - if(src1_index < 0) - { - char4 tmp; - tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx; - src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw; - } - if(src2_index < 0) - { - char4 tmp; - tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx; - src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw; - } - char4 dst_data = *((__global char4 *)(dst + dst_index)); - char4 tmp_data = src1_data OP_BINARY src2_data; - - dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x; - dst_data.y = ((dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : dst_data.y; - dst_data.z = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : dst_data.z; - dst_data.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? tmp_data.w : dst_data.w; - - *((__global char4 *)(dst + dst_index)) = dst_data; - } -} - - -__kernel void arithm_bitwise_binary_D2 (__global ushort *src1, int src1_step, int src1_offset, - __global ushort *src2, int src2_step, int src2_offset, - __global ushort *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) - -{ - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 2; - -#ifdef dst_align -#undef dst_align -#endif -#define dst_align ((dst_offset >> 1) & 3) - int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); - int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1)); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffff8); - - int src1_index_fix = src1_index < 0 ? 0 : src1_index; - int src2_index_fix = src2_index < 0 ? 0 : src2_index; - ushort4 src1_data = vload4(0, (__global ushort *)((__global char *)src1 + src1_index_fix)); - ushort4 src2_data = vload4(0, (__global ushort *)((__global char *)src2 + src2_index_fix)); - - if(src1_index < 0) - { - ushort4 tmp; - tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx; - src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw; - } - if(src2_index < 0) - { - ushort4 tmp; - tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx; - src2_data.xyzw = (src2_index == -1) ? 
src2_data.wxyz:tmp.xyzw; - } - ushort4 dst_data = *((__global ushort4 *)((__global char *)dst + dst_index)); - ushort4 tmp_data = src1_data OP_BINARY src2_data; - - dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x; - dst_data.y = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.y : dst_data.y; - dst_data.z = ((dst_index + 4 >= dst_start) && (dst_index + 4 < dst_end)) ? tmp_data.z : dst_data.z; - dst_data.w = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) ? tmp_data.w : dst_data.w; - - *((__global ushort4 *)((__global char *)dst + dst_index)) = dst_data; - } -} - - - -__kernel void arithm_bitwise_binary_D3 (__global short *src1, int src1_step, int src1_offset, - __global short *src2, int src2_step, int src2_offset, - __global short *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) - -{ - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 2; - -#ifdef dst_align -#undef dst_align -#endif -#define dst_align ((dst_offset >> 1) & 3) - int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); - int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1)); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffff8); - - int src1_index_fix = src1_index < 0 ? 0 : src1_index; - int src2_index_fix = src2_index < 0 ? 0 : src2_index; - short4 src1_data = vload4(0, (__global short *)((__global char *)src1 + src1_index_fix)); - short4 src2_data = vload4(0, (__global short *)((__global char *)src2 + src2_index_fix)); - - if(src1_index < 0) - { - short4 tmp; - tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx; - src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw; - } - if(src2_index < 0) - { - short4 tmp; - tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx; - src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw; - } - short4 dst_data = *((__global short4 *)((__global char *)dst + dst_index)); - short4 tmp_data = src1_data OP_BINARY src2_data; - - dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x; - dst_data.y = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.y : dst_data.y; - dst_data.z = ((dst_index + 4 >= dst_start) && (dst_index + 4 < dst_end)) ? tmp_data.z : dst_data.z; - dst_data.w = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) ? 
tmp_data.w : dst_data.w; - - *((__global short4 *)((__global char *)dst + dst_index)) = dst_data; - } -} - - - -__kernel void arithm_bitwise_binary_D4 (__global int *src1, int src1_step, int src1_offset, - __global int *src2, int src2_step, int src2_offset, - __global int *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 2) + src1_offset); - int src2_index = mad24(y, src2_step, (x << 2) + src2_offset); - int dst_index = mad24(y, dst_step, (x << 2) + dst_offset); - - int data1 = *((__global int *)((__global char *)src1 + src1_index)); - int data2 = *((__global int *)((__global char *)src2 + src2_index)); - int tmp = data1 OP_BINARY data2; - - *((__global int *)((__global char *)dst + dst_index)) = tmp; - } -} - -__kernel void arithm_bitwise_binary_D5 (__global char *src1, int src1_step, int src1_offset, - __global char *src2, int src2_step, int src2_offset, - __global char *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 2) + src1_offset); - int src2_index = mad24(y, src2_step, (x << 2) + src2_offset); - int dst_index = mad24(y, dst_step, (x << 2) + dst_offset); - - char4 data1 = *((__global char4 *)((__global char *)src1 + src1_index)); - char4 data2 = *((__global char4 *)((__global char *)src2 + src2_index)); - char4 tmp = data1 OP_BINARY data2; - - *((__global char4 *)((__global char *)dst + dst_index)) = tmp; - } -} - -#if defined (DOUBLE_SUPPORT) -__kernel void arithm_bitwise_binary_D6 (__global char *src1, int src1_step, int src1_offset, - __global char *src2, int src2_step, int src2_offset, - __global char *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 3) + src1_offset); - int src2_index = mad24(y, src2_step, (x << 3) + src2_offset); - int dst_index = mad24(y, dst_step, (x << 3) + dst_offset); - - char8 data1 = *((__global char8 *)((__global char *)src1 + src1_index)); - char8 data2 = *((__global char8 *)((__global char *)src2 + src2_index)); - - *((__global char8 *)((__global char *)dst + dst_index)) = data1 OP_BINARY data2; - } -} -#endif diff --git a/modules/ocl/src/opencl/arithm_bitwise_binary_mask.cl b/modules/ocl/src/opencl/arithm_bitwise_binary_mask.cl index 60cd18820..622ab5b11 100644 --- a/modules/ocl/src/opencl/arithm_bitwise_binary_mask.cl +++ b/modules/ocl/src/opencl/arithm_bitwise_binary_mask.cl @@ -43,767 +43,31 @@ // the use of this software, even if advised of the possibility of such damage. 
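
The mask kernel that follows addresses data byte-wise: the host passes cols1 = dst.cols * elemSize, and one mask value gates all elemSize bytes of a pixel. A minimal host-side sketch of the same index arithmetic (hypothetical helper, for illustration only):

    // Mirrors mad24(y, mask_step, mask_offset + (x / elemSize)) in the kernel below;
    // e.g. for CV_32SC2 (elemSize == 8) bytes 0..7 of a pixel share one mask entry.
    int maskIndexForByte(int y, int x, int mask_step, int mask_offset, int elemSize)
    {
        return y * mask_step + mask_offset + x / elemSize;
    }
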
// //M*/ -#if defined (DOUBLE_SUPPORT) -#ifdef cl_khr_fp64 -#pragma OPENCL EXTENSION cl_khr_fp64:enable -#elif defined (cl_amd_fp64) -#pragma OPENCL EXTENSION cl_amd_fp64:enable -#endif -#endif - -#ifndef OP_BINARY -#define OP_BINARY & -#endif ////////////////////////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////bitwise_binary//////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////////////////////// -/**************************************bitwise_binary with mask**************************************/ -__kernel void arithm_bitwise_binary_with_mask_C1_D0 ( - __global uchar *src1, int src1_step, int src1_offset, - __global uchar *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global uchar *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ +__kernel void arithm_bitwise_binary_mask(__global uchar * src1, int src1_step, int src1_offset, + __global uchar * src2, int src2_step, int src2_offset, + __global uchar * mask, int mask_step, int mask_offset, int elemSize, + __global uchar * dst, int dst_step, int dst_offset, + int cols1, int rows) +{ int x = get_global_id(0); int y = get_global_id(1); - if (x < cols && y < rows) + if (x < cols1 && y < rows) { - x = x << 2; + int mask_index = mad24(y, mask_step, mask_offset + (x / elemSize)); -#ifdef dst_align -#undef dst_align -#endif -#define dst_align (dst_offset & 3) - int src1_index = mad24(y, src1_step, x + src1_offset - dst_align); - int src2_index = mad24(y, src2_step, x + src2_offset - dst_align); - int mask_index = mad24(y, mask_step, x + mask_offset - dst_align); + if (mask[mask_index]) + { + int src1_index = mad24(y, src1_step, x + src1_offset); + int src2_index = mad24(y, src2_step, x + src2_offset); + int dst_index = mad24(y, dst_step, x + dst_offset); - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc); - - uchar4 src1_data = vload4(0, src1 + src1_index); - uchar4 src2_data = vload4(0, src2 + src2_index); - uchar4 mask_data = vload4(0, mask + mask_index); - - uchar4 data = *((__global uchar4 *)(dst + dst_index)); - uchar4 tmp_data = src1_data OP_BINARY src2_data; - - data.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : data.x; - data.y = ((mask_data.y) && (dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : data.y; - data.z = ((mask_data.z) && (dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : data.z; - data.w = ((mask_data.w) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? 
tmp_data.w : data.w; - - *((__global uchar4 *)(dst + dst_index)) = data; + dst[dst_index] = src1[src1_index] Operation src2[src2_index]; + } } } - - - -__kernel void arithm_bitwise_binary_with_mask_C1_D1 ( - __global char *src1, int src1_step, int src1_offset, - __global char *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global char *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 2; - -#ifdef dst_align -#undef dst_align -#endif -#define dst_align (dst_offset & 3) - int src1_index = mad24(y, src1_step, x + src1_offset - dst_align); - int src2_index = mad24(y, src2_step, x + src2_offset - dst_align); - int mask_index = mad24(y, mask_step, x + mask_offset - dst_align); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc); - - char4 src1_data = vload4(0, src1 + src1_index); - char4 src2_data = vload4(0, src2 + src2_index); - uchar4 mask_data = vload4(0, mask + mask_index); - - char4 data = *((__global char4 *)(dst + dst_index)); - char4 tmp_data = src1_data OP_BINARY src2_data; - - data.x = convert_char((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : data.x; - data.y = convert_char((mask_data.y) && (dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : data.y; - data.z = convert_char((mask_data.z) && (dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : data.z; - data.w = convert_char((mask_data.w) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? tmp_data.w : data.w; - - *((__global char4 *)(dst + dst_index)) = data; - } -} - - - -__kernel void arithm_bitwise_binary_with_mask_C1_D2 ( - __global ushort *src1, int src1_step, int src1_offset, - __global ushort *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global ushort *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 1; - -#ifdef dst_align -#undef dst_align -#endif -#define dst_align ((dst_offset / 2) & 1) - int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); - int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1)); - int mask_index = mad24(y, mask_step, x + mask_offset - dst_align); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffffc); - - ushort2 src1_data = vload2(0, (__global ushort *)((__global char *)src1 + src1_index)); - ushort2 src2_data = vload2(0, (__global ushort *)((__global char *)src2 + src2_index)); - uchar2 mask_data = vload2(0, mask + mask_index); - - ushort2 data = *((__global ushort2 *)((__global uchar *)dst + dst_index)); - ushort2 tmp_data = src1_data OP_BINARY src2_data; - - data.x = convert_ushort((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : data.x; - data.y = convert_ushort((mask_data.y) && (dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? 
tmp_data.y : data.y; - - *((__global ushort2 *)((__global uchar *)dst + dst_index)) = data; - } -} - - - -__kernel void arithm_bitwise_binary_with_mask_C1_D3 ( - __global short *src1, int src1_step, int src1_offset, - __global short *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global short *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 1; - -#ifdef dst_align -#undef dst_align -#endif -#define dst_align ((dst_offset / 2) & 1) - int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); - int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1)); - int mask_index = mad24(y, mask_step, x + mask_offset - dst_align); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffffc); - - short2 src1_data = vload2(0, (__global short *)((__global char *)src1 + src1_index)); - short2 src2_data = vload2(0, (__global short *)((__global char *)src2 + src2_index)); - uchar2 mask_data = vload2(0, mask + mask_index); - - short2 data = *((__global short2 *)((__global uchar *)dst + dst_index)); - short2 tmp_data = src1_data OP_BINARY src2_data; - - data.x = convert_short((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : data.x; - data.y = convert_short((mask_data.y) && (dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.y : data.y; - - *((__global short2 *)((__global uchar *)dst + dst_index)) = data; - } -} - - - -__kernel void arithm_bitwise_binary_with_mask_C1_D4 ( - __global int *src1, int src1_step, int src1_offset, - __global int *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global int *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 2) + src1_offset); - int src2_index = mad24(y, src2_step, (x << 2) + src2_offset); - int mask_index = mad24(y, mask_step, x + mask_offset); - int dst_index = mad24(y, dst_step, (x << 2) + dst_offset); - - uchar mask_data = *(mask + mask_index); - - int src_data1 = *((__global int *)((__global char *)src1 + src1_index)); - int src_data2 = *((__global int *)((__global char *)src2 + src2_index)); - int dst_data = *((__global int *)((__global char *)dst + dst_index)); - - int data = src_data1 OP_BINARY src_data2; - data = mask_data ? 
data : dst_data; - - *((__global int *)((__global char *)dst + dst_index)) = data; - } -} - - - -__kernel void arithm_bitwise_binary_with_mask_C1_D5 ( - __global char *src1, int src1_step, int src1_offset, - __global char *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global char *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 2) + src1_offset); - int src2_index = mad24(y, src2_step, (x << 2) + src2_offset); - int mask_index = mad24(y, mask_step, x + mask_offset); - int dst_index = mad24(y, dst_step, (x << 2) + dst_offset); - - uchar mask_data = *(mask + mask_index); - - char4 src_data1 = *((__global char4 *)((__global char *)src1 + src1_index)); - char4 src_data2 = *((__global char4 *)((__global char *)src2 + src2_index)); - char4 dst_data = *((__global char4 *)((__global char *)dst + dst_index)); - - char4 data = src_data1 OP_BINARY src_data2; - data = mask_data ? data : dst_data; - - *((__global char4 *)((__global char *)dst + dst_index)) = data; - } -} - - - -__kernel void arithm_bitwise_binary_with_mask_C1_D6 ( - __global char *src1, int src1_step, int src1_offset, - __global char *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global char *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 3) + src1_offset); - int src2_index = mad24(y, src2_step, (x << 3) + src2_offset); - int mask_index = mad24(y, mask_step, x + mask_offset); - int dst_index = mad24(y, dst_step, (x << 3) + dst_offset); - - uchar mask_data = *(mask + mask_index); - - char8 src_data1 = *((__global char8 *)((__global char *)src1 + src1_index)); - char8 src_data2 = *((__global char8 *)((__global char *)src2 + src2_index)); - char8 dst_data = *((__global char8 *)((__global char *)dst + dst_index)); - - char8 data = src_data1 OP_BINARY src_data2; - data = mask_data ? 
data : dst_data; - - *((__global char8 *)((__global char *)dst + dst_index)) = data; - } - -} - - - -__kernel void arithm_bitwise_binary_with_mask_C2_D0 ( - __global uchar *src1, int src1_step, int src1_offset, - __global uchar *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global uchar *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 1; - -#ifdef dst_align -#undef dst_align -#endif -#define dst_align ((dst_offset / 2) & 1) - int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); - int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1)); - int mask_index = mad24(y, mask_step, x + mask_offset - dst_align); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffffc); - - uchar4 src1_data = vload4(0, src1 + src1_index); - uchar4 src2_data = vload4(0, src2 + src2_index); - uchar2 mask_data = vload2(0, mask + mask_index); - - uchar4 data = *((__global uchar4 *)(dst + dst_index)); - uchar4 tmp_data = src1_data OP_BINARY src2_data; - - data.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data.xy : data.xy; - data.zw = ((mask_data.y) && (dst_index + 2 < dst_end )) ? tmp_data.zw : data.zw; - - *((__global uchar4 *)(dst + dst_index)) = data; - } -} - - -__kernel void arithm_bitwise_binary_with_mask_C2_D1 ( - __global char *src1, int src1_step, int src1_offset, - __global char *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global char *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 1; - -#ifdef dst_align -#undef dst_align -#endif -#define dst_align ((dst_offset / 2) & 1) - int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); - int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1)); - int mask_index = mad24(y, mask_step, x + mask_offset - dst_align); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffffc); - - char4 src1_data = vload4(0, src1 + src1_index); - char4 src2_data = vload4(0, src2 + src2_index); - uchar2 mask_data = vload2(0, mask + mask_index); - - char4 data = *((__global char4 *)(dst + dst_index)); - char4 tmp_data = src1_data OP_BINARY src2_data; - - data.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data.xy : data.xy; - data.zw = ((mask_data.y) && (dst_index + 2 < dst_end )) ? 
tmp_data.zw : data.zw; - - *((__global char4 *)(dst + dst_index)) = data; - } -} - -__kernel void arithm_bitwise_binary_with_mask_C2_D2 ( - __global ushort *src1, int src1_step, int src1_offset, - __global ushort *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global ushort *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 2) + src1_offset); - int src2_index = mad24(y, src2_step, (x << 2) + src2_offset); - int mask_index = mad24(y, mask_step, x + mask_offset); - int dst_index = mad24(y, dst_step, (x << 2) + dst_offset); - - uchar mask_data = *(mask + mask_index); - - ushort2 src_data1 = *((__global ushort2 *)((__global char *)src1 + src1_index)); - ushort2 src_data2 = *((__global ushort2 *)((__global char *)src2 + src2_index)); - ushort2 dst_data = *((__global ushort2 *)((__global char *)dst + dst_index)); - - ushort2 data = src_data1 OP_BINARY src_data2; - data = mask_data ? data : dst_data; - - *((__global ushort2 *)((__global char *)dst + dst_index)) = data; - } -} -__kernel void arithm_bitwise_binary_with_mask_C2_D3 ( - __global short *src1, int src1_step, int src1_offset, - __global short *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global short *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 2) + src1_offset); - int src2_index = mad24(y, src2_step, (x << 2) + src2_offset); - int mask_index = mad24(y, mask_step, x + mask_offset); - int dst_index = mad24(y, dst_step, (x << 2) + dst_offset); - - uchar mask_data = *(mask + mask_index); - - short2 src_data1 = *((__global short2 *)((__global char *)src1 + src1_index)); - short2 src_data2 = *((__global short2 *)((__global char *)src2 + src2_index)); - short2 dst_data = *((__global short2 *)((__global char *)dst + dst_index)); - - short2 data = src_data1 OP_BINARY src_data2; - data = mask_data ? data : dst_data; - - *((__global short2 *)((__global char *)dst + dst_index)) = data; - } -} -__kernel void arithm_bitwise_binary_with_mask_C2_D4 ( - __global int *src1, int src1_step, int src1_offset, - __global int *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global int *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 3) + src1_offset); - int src2_index = mad24(y, src2_step, (x << 3) + src2_offset); - int mask_index = mad24(y, mask_step, x + mask_offset); - int dst_index = mad24(y, dst_step, (x << 3) + dst_offset); - - uchar mask_data = *(mask + mask_index); - - int2 src_data1 = *((__global int2 *)((__global char *)src1 + src1_index)); - int2 src_data2 = *((__global int2 *)((__global char *)src2 + src2_index)); - int2 dst_data = *((__global int2 *)((__global char *)dst + dst_index)); - - int2 data = src_data1 OP_BINARY src_data2; - data = mask_data ? 
data : dst_data; - - *((__global int2 *)((__global char *)dst + dst_index)) = data; - } -} -__kernel void arithm_bitwise_binary_with_mask_C2_D5 ( - __global char *src1, int src1_step, int src1_offset, - __global char *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global char *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 3) + src1_offset); - int src2_index = mad24(y, src2_step, (x << 3) + src2_offset); - int mask_index = mad24(y, mask_step, x + mask_offset); - int dst_index = mad24(y, dst_step, (x << 3) + dst_offset); - - uchar mask_data = *(mask + mask_index); - - char8 src_data1 = *((__global char8 *)((__global char *)src1 + src1_index)); - char8 src_data2 = *((__global char8 *)((__global char *)src2 + src2_index)); - char8 dst_data = *((__global char8 *)((__global char *)dst + dst_index)); - - char8 data = src_data1 OP_BINARY src_data2; - data = mask_data ? data : dst_data; - - *((__global char8 *)((__global char *)dst + dst_index)) = data; - } -} - -__kernel void arithm_bitwise_binary_with_mask_C2_D6 ( - __global char *src1, int src1_step, int src1_offset, - __global char *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global char *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 4) + src1_offset); - int src2_index = mad24(y, src2_step, (x << 4) + src2_offset); - int mask_index = mad24(y, mask_step, x + mask_offset); - int dst_index = mad24(y, dst_step, (x << 4) + dst_offset); - - uchar mask_data = *(mask + mask_index); - - char16 src_data1 = *((__global char16 *)((__global char *)src1 + src1_index)); - char16 src_data2 = *((__global char16 *)((__global char *)src2 + src2_index)); - char16 dst_data = *((__global char16 *)((__global char *)dst + dst_index)); - - char16 data = src_data1 OP_BINARY src_data2; - data = mask_data ? data : dst_data; - - *((__global char16 *)((__global char *)dst + dst_index)) = data; - } -} - - -__kernel void arithm_bitwise_binary_with_mask_C4_D0 ( - __global uchar *src1, int src1_step, int src1_offset, - __global uchar *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global uchar *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 2) + src1_offset); - int src2_index = mad24(y, src2_step, (x << 2) + src2_offset); - int mask_index = mad24(y, mask_step, x + mask_offset); - int dst_index = mad24(y, dst_step, (x << 2) + dst_offset); - - uchar mask_data = *(mask + mask_index); - - uchar4 src_data1 = *((__global uchar4 *)(src1 + src1_index)); - uchar4 src_data2 = *((__global uchar4 *)(src2 + src2_index)); - uchar4 dst_data = *((__global uchar4 *)(dst + dst_index)); - - uchar4 data = src_data1 OP_BINARY src_data2; - data = mask_data ? 
data : dst_data; - - *((__global uchar4 *)(dst + dst_index)) = data; - } -} - - -__kernel void arithm_bitwise_binary_with_mask_C4_D1 ( - __global char *src1, int src1_step, int src1_offset, - __global char *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global char *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 2) + src1_offset); - int src2_index = mad24(y, src2_step, (x << 2) + src2_offset); - int mask_index = mad24(y, mask_step, x + mask_offset); - int dst_index = mad24(y, dst_step, (x << 2) + dst_offset); - - uchar mask_data = *(mask + mask_index); - - char4 src_data1 = *((__global char4 *)(src1 + src1_index)); - char4 src_data2 = *((__global char4 *)(src2 + src2_index)); - char4 dst_data = *((__global char4 *)(dst + dst_index)); - - char4 data = src_data1 OP_BINARY src_data2; - data = mask_data ? data : dst_data; - - *((__global char4 *)(dst + dst_index)) = data; - } -} - -__kernel void arithm_bitwise_binary_with_mask_C4_D2 ( - __global ushort *src1, int src1_step, int src1_offset, - __global ushort *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global ushort *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 3) + src1_offset); - int src2_index = mad24(y, src2_step, (x << 3) + src2_offset); - int mask_index = mad24(y, mask_step, x + mask_offset); - int dst_index = mad24(y, dst_step, (x << 3) + dst_offset); - - uchar mask_data = *(mask + mask_index); - - ushort4 src_data1 = *((__global ushort4 *)((__global char *)src1 + src1_index)); - ushort4 src_data2 = *((__global ushort4 *)((__global char *)src2 + src2_index)); - ushort4 dst_data = *((__global ushort4 *)((__global char *)dst + dst_index)); - - ushort4 data = src_data1 OP_BINARY src_data2; - data = mask_data ? data : dst_data; - - *((__global ushort4 *)((__global char *)dst + dst_index)) = data; - } -} -__kernel void arithm_bitwise_binary_with_mask_C4_D3 ( - __global short *src1, int src1_step, int src1_offset, - __global short *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global short *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 3) + src1_offset); - int src2_index = mad24(y, src2_step, (x << 3) + src2_offset); - int mask_index = mad24(y, mask_step, x + mask_offset); - int dst_index = mad24(y, dst_step, (x << 3) + dst_offset); - - uchar mask_data = *(mask + mask_index); - - short4 src_data1 = *((__global short4 *)((__global char *)src1 + src1_index)); - short4 src_data2 = *((__global short4 *)((__global char *)src2 + src2_index)); - short4 dst_data = *((__global short4 *)((__global char *)dst + dst_index)); - - short4 data = src_data1 OP_BINARY src_data2; - data = mask_data ? 
data : dst_data; - - *((__global short4 *)((__global char *)dst + dst_index)) = data; - } -} -__kernel void arithm_bitwise_binary_with_mask_C4_D4 ( - __global int *src1, int src1_step, int src1_offset, - __global int *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global int *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 4) + src1_offset); - int src2_index = mad24(y, src2_step, (x << 4) + src2_offset); - int mask_index = mad24(y, mask_step, x + mask_offset); - int dst_index = mad24(y, dst_step, (x << 4) + dst_offset); - - uchar mask_data = *(mask + mask_index); - - int4 src_data1 = *((__global int4 *)((__global char *)src1 + src1_index)); - int4 src_data2 = *((__global int4 *)((__global char *)src2 + src2_index)); - int4 dst_data = *((__global int4 *)((__global char *)dst + dst_index)); - - int4 data = src_data1 OP_BINARY src_data2; - data = mask_data ? data : dst_data; - - *((__global int4 *)((__global char *)dst + dst_index)) = data; - } -} -__kernel void arithm_bitwise_binary_with_mask_C4_D5 ( - __global char *src1, int src1_step, int src1_offset, - __global char *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global char *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 4) + src1_offset); - int src2_index = mad24(y, src2_step, (x << 4) + src2_offset); - int mask_index = mad24(y, mask_step, x + mask_offset); - int dst_index = mad24(y, dst_step, (x << 4) + dst_offset); - - uchar mask_data = *(mask + mask_index); - - char16 src_data1 = *((__global char16 *)((__global char *)src1 + src1_index)); - char16 src_data2 = *((__global char16 *)((__global char *)src2 + src2_index)); - char16 dst_data = *((__global char16 *)((__global char *)dst + dst_index)); - - char16 data = src_data1 OP_BINARY src_data2; - data = mask_data ? 
data : dst_data; - - *((__global char16 *)((__global char *)dst + dst_index)) = data; - } -} -#if defined (DOUBLE_SUPPORT) -__kernel void arithm_bitwise_binary_with_mask_C4_D6 ( - __global char *src1, int src1_step, int src1_offset, - __global char *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global char *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 5) + src1_offset); - int src2_index = mad24(y, src2_step, (x << 5) + src2_offset); - int mask_index = mad24(y, mask_step, x + mask_offset); - int dst_index = mad24(y, dst_step, (x << 5) + dst_offset); - - uchar mask_data = *(mask + mask_index); - - char8 src_data1_0 = *((__global char8 *)((__global char *)src1 + src1_index + 0)); - char8 src_data1_1 = *((__global char8 *)((__global char *)src1 + src1_index + 8)); - char8 src_data1_2 = *((__global char8 *)((__global char *)src1 + src1_index + 16)); - char8 src_data1_3 = *((__global char8 *)((__global char *)src1 + src1_index + 24)); - - char8 src_data2_0 = *((__global char8 *)((__global char *)src2 + src2_index + 0)); - char8 src_data2_1 = *((__global char8 *)((__global char *)src2 + src2_index + 8)); - char8 src_data2_2 = *((__global char8 *)((__global char *)src2 + src2_index + 16)); - char8 src_data2_3 = *((__global char8 *)((__global char *)src2 + src2_index + 24)); - - char8 dst_data_0 = *((__global char8 *)((__global char *)dst + dst_index + 0)); - char8 dst_data_1 = *((__global char8 *)((__global char *)dst + dst_index + 8)); - char8 dst_data_2 = *((__global char8 *)((__global char *)dst + dst_index + 16)); - char8 dst_data_3 = *((__global char8 *)((__global char *)dst + dst_index + 24)); - - char8 data_0 = src_data1_0 OP_BINARY src_data2_0; - char8 data_1 = src_data1_1 OP_BINARY src_data2_1; - char8 data_2 = src_data1_2 OP_BINARY src_data2_2; - char8 data_3 = src_data1_3 OP_BINARY src_data2_3; - - data_0 = mask_data ? data_0 : dst_data_0; - data_1 = mask_data ? data_1 : dst_data_1; - data_2 = mask_data ? data_2 : dst_data_2; - data_3 = mask_data ? data_3 : dst_data_3; - - *((__global char8 *)((__global char *)dst + dst_index + 0)) = data_0; - *((__global char8 *)((__global char *)dst + dst_index + 8)) = data_1; - *((__global char8 *)((__global char *)dst + dst_index + 16)) = data_2; - *((__global char8 *)((__global char *)dst + dst_index + 24)) = data_3; - } -} -#endif diff --git a/modules/ocl/src/opencl/arithm_bitwise_binary_scalar.cl b/modules/ocl/src/opencl/arithm_bitwise_binary_scalar.cl index 5fa25004d..c17b412a6 100644 --- a/modules/ocl/src/opencl/arithm_bitwise_binary_scalar.cl +++ b/modules/ocl/src/opencl/arithm_bitwise_binary_scalar.cl @@ -43,596 +43,26 @@ // the use of this software, even if advised of the possibility of such damage. 
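The hunk that follows replaces the per-depth, per-channel scalar kernels of arithm_bitwise_binary_scalar.cl with a single byte-wise kernel that is specialized at build time through the Operation token. As a rough orientation only (not part of the patch), here is a minimal CPU sketch of the same addressing scheme, assuming cols1 is the ROI width in bytes, elemSize is the per-element size in bytes, and src2 holds the scalar packed into elemSize bytes so that it repeats once per element; the Operation macro stands in for whatever bitwise operator the host would pass via build options.

/* Illustrative CPU sketch of the byte-wise scalar kernel's indexing.
 * Assumptions (not taken from the patch): cols1 = ROI width in bytes,
 * elemSize = element size in bytes, src2 = scalar packed into elemSize
 * bytes, Operation = bitwise operator supplied at build time. */
#include <stddef.h>

#define Operation &   /* stand-in for the build-time operator */

static void bitwise_binary_scalar_ref(const unsigned char *src1, size_t src1_step, size_t src1_offset,
                                      const unsigned char *src2, size_t elemSize,
                                      unsigned char *dst, size_t dst_step, size_t dst_offset,
                                      size_t cols1, size_t rows)
{
    for (size_t y = 0; y < rows; ++y)
        for (size_t x = 0; x < cols1; ++x)
        {
            size_t src1_index = y * src1_step + src1_offset + x; /* mirrors mad24(y, src1_step, src1_offset + x) */
            size_t src2_index = x % elemSize;                    /* packed scalar repeats per element */
            size_t dst_index  = y * dst_step + dst_offset + x;

            dst[dst_index] = src1[src1_index] Operation src2[src2_index];
        }
}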
// // -#if defined (DOUBLE_SUPPORT) -#ifdef cl_khr_fp64 -#pragma OPENCL EXTENSION cl_khr_fp64:enable -#elif defined (cl_amd_fp64) -#pragma OPENCL EXTENSION cl_amd_fp64:enable -#endif -#endif - -#ifndef OP_BINARY -#define OP_BINARY & -#endif /////////////////////////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////bitwise_binary///////////////////////////////////////////// /////////////////////////////////////////////////////////////////////////////////////////////////////// -/******************************bitwise binary with scalar without mask********************************/ -__kernel void arithm_s_bitwise_binary_C1_D0 ( - __global uchar *src1, int src1_step, int src1_offset, - __global uchar *dst, int dst_step, int dst_offset, - uchar4 src2, int rows, int cols, int dst_step1) + +__kernel void arithm_bitwise_binary_scalar( + __global uchar *src1, int src1_step, int src1_offset, + __global uchar *src2, int elemSize, + __global uchar *dst, int dst_step, int dst_offset, + int cols1, int rows) { int x = get_global_id(0); int y = get_global_id(1); - if (x < cols && y < rows) + if (x < cols1 && y < rows) { - x = x << 2; + int src1_index = mad24(y, src1_step, src1_offset + x); + int src2_index = x % elemSize; + int dst_index = mad24(y, dst_step, dst_offset + x); -#ifdef dst_align -#undef dst_align -#endif -#define dst_align (dst_offset & 3) - int src1_index = mad24(y, src1_step, x + src1_offset - dst_align); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc); - - uchar4 src1_data = vload4(0, src1 + src1_index); - uchar4 src2_data = (uchar4)(src2.x, src2.x, src2.x, src2.x); - - uchar4 data = *((__global uchar4 *)(dst + dst_index)); - uchar4 tmp_data = src1_data OP_BINARY src2_data; - - data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : data.x; - data.y = ((dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : data.y; - data.z = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : data.z; - data.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? tmp_data.w : data.w; - - *((__global uchar4 *)(dst + dst_index)) = data; + dst[dst_index] = src1[src1_index] Operation src2[src2_index]; } } - - -__kernel void arithm_s_bitwise_binary_C1_D1 ( - __global char *src1, int src1_step, int src1_offset, - __global char *dst, int dst_step, int dst_offset, - char4 src2, int rows, int cols, int dst_step1) -{ - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 2; - -#ifdef dst_align -#undef dst_align -#endif -#define dst_align (dst_offset & 3) - int src1_index = mad24(y, src1_step, x + src1_offset - dst_align); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc); - - char4 src1_data = vload4(0, src1 + src1_index); - char4 src2_data = (char4)(src2.x, src2.x, src2.x, src2.x); - - char4 data = *((__global char4 *)(dst + dst_index)); - char4 tmp_data = src1_data OP_BINARY src2_data; - - data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : data.x; - data.y = ((dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : data.y; - data.z = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? 
tmp_data.z : data.z; - data.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? tmp_data.w : data.w; - - *((__global char4 *)(dst + dst_index)) = data; - } -} - -__kernel void arithm_s_bitwise_binary_C1_D2 ( - __global ushort *src1, int src1_step, int src1_offset, - __global ushort *dst, int dst_step, int dst_offset, - ushort4 src2, int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 1; - -#ifdef dst_align -#undef dst_align -#endif -#define dst_align ((dst_offset >> 1) & 1) - int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffffc); - - ushort2 src1_data = vload2(0, (__global ushort *)((__global char *)src1 + src1_index)); - ushort2 src2_data = (ushort2)(src2.x, src2.x); - - ushort2 data = *((__global ushort2 *)((__global uchar *)dst + dst_index)); - ushort2 tmp_data = src1_data OP_BINARY src2_data; - - data.x = (dst_index + 0 >= dst_start) ? tmp_data.x : data.x; - data.y = (dst_index + 2 < dst_end ) ? tmp_data.y : data.y; - - *((__global ushort2 *)((__global uchar *)dst + dst_index)) = data; - } -} -__kernel void arithm_s_bitwise_binary_C1_D3 ( - __global short *src1, int src1_step, int src1_offset, - __global short *dst, int dst_step, int dst_offset, - short4 src2, int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 1; - -#ifdef dst_align -#undef dst_align -#endif -#define dst_align ((dst_offset >> 1) & 1) - int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffffc); - - short2 src1_data = vload2(0, (__global short *)((__global char *)src1 + src1_index)); - short2 src2_data = (short2)(src2.x, src2.x); - short2 data = *((__global short2 *)((__global uchar *)dst + dst_index)); - - short2 tmp_data = src1_data OP_BINARY src2_data; - - data.x = (dst_index + 0 >= dst_start) ? tmp_data.x : data.x; - data.y = (dst_index + 2 < dst_end ) ? 
tmp_data.y : data.y; - - *((__global short2 *)((__global uchar *)dst + dst_index)) = data; - } -} -__kernel void arithm_s_bitwise_binary_C1_D4 ( - __global int *src1, int src1_step, int src1_offset, - __global int *dst, int dst_step, int dst_offset, - int4 src2, int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 2) + src1_offset); - int dst_index = mad24(y, dst_step, (x << 2) + dst_offset); - - int src_data1 = *((__global int *)((__global char *)src1 + src1_index)); - int src_data2 = src2.x; - - int data = src_data1 OP_BINARY src_data2; - - *((__global int *)((__global char *)dst + dst_index)) = data; - } -} -__kernel void arithm_s_bitwise_binary_C1_D5 ( - __global char *src1, int src1_step, int src1_offset, - __global char *dst, int dst_step, int dst_offset, - char16 src2, int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 2) + src1_offset); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, (x << 2) + dst_offset); - - char4 src1_data = *((__global char4 *)((__global char *)src1 + src1_index)); - char4 src2_data = (char4)(src2.s0, src2.s1, src2.s2, src2.s3); - - char4 data = *((__global char4 *)((__global char *)dst + dst_index)); - char4 tmp_data = src1_data OP_BINARY src2_data; - - data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : data.x; - data.y = ((dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : data.y; - data.z = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : data.z; - data.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? 
tmp_data.w : data.w; - - *((__global char4 *)((__global char *)dst + dst_index)) = data; - } -} -#if defined (DOUBLE_SUPPORT) -__kernel void arithm_s_bitwise_binary_C1_D6 ( - __global short *src1, int src1_step, int src1_offset, - __global short *dst, int dst_step, int dst_offset, - short16 src2, int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 3) + src1_offset); - int dst_index = mad24(y, dst_step, (x << 3) + dst_offset); - - short4 src1_data = *((__global short4 *)((__global char *)src1 + src1_index)); - short4 src2_data = (short4)(src2.s0, src2.s1, src2.s2, src2.s3); - - short4 tmp_data = src1_data OP_BINARY src2_data; - - *((__global short4 *)((__global char *)dst + dst_index)) = tmp_data; - } -} -#endif -__kernel void arithm_s_bitwise_binary_C2_D0 ( - __global uchar *src1, int src1_step, int src1_offset, - __global uchar *dst, int dst_step, int dst_offset, - uchar4 src2, int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 1; - -#ifdef dst_align -#undef dst_align -#endif -#define dst_align ((dst_offset >> 1) & 1) - int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffffc); - - uchar4 src1_data = vload4(0, src1 + src1_index); - uchar4 src2_data = (uchar4)(src2.x, src2.y, src2.x, src2.y); - - uchar4 data = *((__global uchar4 *)(dst + dst_index)); - uchar4 tmp_data = src1_data OP_BINARY src2_data; - - - data.xy = (dst_index + 0 >= dst_start) ? tmp_data.xy : data.xy; - data.zw = (dst_index + 2 < dst_end ) ? tmp_data.zw : data.zw; - - *((__global uchar4 *)(dst + dst_index)) = data; - } -} - - -__kernel void arithm_s_bitwise_binary_C2_D1 ( - __global char *src1, int src1_step, int src1_offset, - __global char *dst, int dst_step, int dst_offset, - char4 src2, int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 1; - -#ifdef dst_align -#undef dst_align -#endif -#define dst_align ((dst_offset >> 1) & 1) - int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffffc); - - char4 src1_data = vload4(0, src1 + src1_index); - char4 src2_data = (char4)(src2.x, src2.y, src2.x, src2.y); - - char4 data = *((__global char4 *)(dst + dst_index)); - char4 tmp_data = src1_data OP_BINARY src2_data; - - data.xy = (dst_index + 0 >= dst_start) ? tmp_data.xy : data.xy; - data.zw = (dst_index + 2 < dst_end ) ? 
tmp_data.zw : data.zw; - - *((__global char4 *)(dst + dst_index)) = data; - } -} - -__kernel void arithm_s_bitwise_binary_C2_D2 ( - __global ushort *src1, int src1_step, int src1_offset, - __global ushort *dst, int dst_step, int dst_offset, - ushort4 src2, int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 2) + src1_offset); - int dst_index = mad24(y, dst_step, (x << 2) + dst_offset); - - ushort2 src_data1 = *((__global ushort2 *)((__global char *)src1 + src1_index)); - ushort2 src_data2 = (ushort2)(src2.x, src2.y); - - ushort2 data = src_data1 OP_BINARY src_data2; - - *((__global ushort2 *)((__global char *)dst + dst_index)) = data; - } -} -__kernel void arithm_s_bitwise_binary_C2_D3 ( - __global short *src1, int src1_step, int src1_offset, - __global short *dst, int dst_step, int dst_offset, - short4 src2, int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 2) + src1_offset); - int dst_index = mad24(y, dst_step, (x << 2) + dst_offset); - - short2 src_data1 = *((__global short2 *)((__global char *)src1 + src1_index)); - short2 src_data2 = (short2)(src2.x, src2.y); - - short2 data = src_data1 OP_BINARY src_data2; - - *((__global short2 *)((__global char *)dst + dst_index)) = data; - } -} -__kernel void arithm_s_bitwise_binary_C2_D4 ( - __global int *src1, int src1_step, int src1_offset, - __global int *dst, int dst_step, int dst_offset, - int4 src2, int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 3) + src1_offset); - int dst_index = mad24(y, dst_step, (x << 3) + dst_offset); - - int2 src_data1 = *((__global int2 *)((__global char *)src1 + src1_index)); - int2 src_data2 = (int2)(src2.x, src2.y); - - int2 data = src_data1 OP_BINARY src_data2; - *((__global int2 *)((__global char *)dst + dst_index)) = data; - } -} -__kernel void arithm_s_bitwise_binary_C2_D5 ( - __global char *src1, int src1_step, int src1_offset, - __global char *dst, int dst_step, int dst_offset, - char16 src2, int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 3) + src1_offset); - int dst_index = mad24(y, dst_step, (x << 3) + dst_offset); - - char8 src1_data = *((__global char8 *)((__global char *)src1 + src1_index)); - char8 src2_data = (char8)(src2.s0, src2.s1, src2.s2, src2.s3, src2.s4, src2.s5, src2.s6, src2.s7); - - char8 tmp_data = src1_data OP_BINARY src2_data; - - *((__global char8 *)((__global char *)dst + dst_index)) = tmp_data; - } -} -#if defined (DOUBLE_SUPPORT) -__kernel void arithm_s_bitwise_binary_C2_D6 ( - __global short *src1, int src1_step, int src1_offset, - __global short *dst, int dst_step, int dst_offset, - short16 src2, int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 4) + src1_offset); - int dst_index = mad24(y, dst_step, (x << 4) + dst_offset); - - short8 src1_data = *((__global short8 *)((__global char *)src1 + src1_index)); - short8 src2_data = (short8)(src2.s0, src2.s1, src2.s2, src2.s3, src2.s4, src2.s5, src2.s6, src2.s7); - - short8 tmp_data = src1_data OP_BINARY src2_data; - - 
*((__global short8 *)((__global char *)dst + dst_index)) = tmp_data; - } -} -#endif - -__kernel void arithm_s_bitwise_binary_C4_D0 ( - __global uchar *src1, int src1_step, int src1_offset, - __global uchar *dst, int dst_step, int dst_offset, - uchar4 src2, int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 2) + src1_offset); - int dst_index = mad24(y, dst_step, (x << 2) + dst_offset); - - uchar4 src_data1 = *((__global uchar4 *)(src1 + src1_index)); - - uchar4 data = src_data1 OP_BINARY src2; - - *((__global uchar4 *)(dst + dst_index)) = data; - } -} - - -__kernel void arithm_s_bitwise_binary_C4_D1 ( - __global char *src1, int src1_step, int src1_offset, - __global char *dst, int dst_step, int dst_offset, - char4 src2, int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 2) + src1_offset); - int dst_index = mad24(y, dst_step, (x << 2) + dst_offset); - - char4 src_data1 = *((__global char4 *)(src1 + src1_index)); - - char4 data = src_data1 OP_BINARY src2; - - *((__global char4 *)(dst + dst_index)) = data; - } -} - -__kernel void arithm_s_bitwise_binary_C4_D2 ( - __global ushort *src1, int src1_step, int src1_offset, - __global ushort *dst, int dst_step, int dst_offset, - ushort4 src2, int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 3) + src1_offset); - int dst_index = mad24(y, dst_step, (x << 3) + dst_offset); - - ushort4 src_data1 = *((__global ushort4 *)((__global char *)src1 + src1_index)); - - ushort4 data = src_data1 OP_BINARY src2; - - *((__global ushort4 *)((__global char *)dst + dst_index)) = data; - } -} -__kernel void arithm_s_bitwise_binary_C4_D3 ( - __global short *src1, int src1_step, int src1_offset, - __global short *dst, int dst_step, int dst_offset, - short4 src2, int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 3) + src1_offset); - int dst_index = mad24(y, dst_step, (x << 3) + dst_offset); - - short4 src_data1 = *((__global short4 *)((__global char *)src1 + src1_index)); - - short4 data = src_data1 OP_BINARY src2; - - *((__global short4 *)((__global char *)dst + dst_index)) = data; - } -} -__kernel void arithm_s_bitwise_binary_C4_D4 ( - __global int *src1, int src1_step, int src1_offset, - __global int *dst, int dst_step, int dst_offset, - int4 src2, int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 4) + src1_offset); - int dst_index = mad24(y, dst_step, (x << 4) + dst_offset); - - int4 src_data1 = *((__global int4 *)((__global char *)src1 + src1_index)); - - int4 data = src_data1 OP_BINARY src2; - - *((__global int4 *)((__global char *)dst + dst_index)) = data; - } -} -__kernel void arithm_s_bitwise_binary_C4_D5 ( - __global char *src1, int src1_step, int src1_offset, - __global char *dst, int dst_step, int dst_offset, - char16 src2, int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 4) + src1_offset); - int dst_index = mad24(y, dst_step, 
(x << 4) + dst_offset); - - char16 src1_data = *((__global char16 *)((__global char *)src1 + src1_index)); - char16 src2_data = (char16)(src2.s0, src2.s1, src2.s2, src2.s3, src2.s4, src2.s5, src2.s6, src2.s7, - src2.s8, src2.s9, src2.sa, src2.sb, src2.sc, src2.sd, src2.se, src2.sf); - - char16 tmp_data = src1_data OP_BINARY src2_data; - - *((__global char16 *)((__global char *)dst + dst_index)) = tmp_data; - } -} -#if defined (DOUBLE_SUPPORT) -__kernel void arithm_s_bitwise_binary_C4_D6 ( - __global short *src1, int src1_step, int src1_offset, - __global short *dst, int dst_step, int dst_offset, - short16 src2, int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 5) + src1_offset); - int dst_index = mad24(y, dst_step, (x << 5) + dst_offset); - - short4 src1_data_0 = *((__global short4 *)((__global char *)src1 + src1_index + 0)); - short4 src1_data_1 = *((__global short4 *)((__global char *)src1 + src1_index + 8)); - short4 src1_data_2 = *((__global short4 *)((__global char *)src1 + src1_index + 16)); - short4 src1_data_3 = *((__global short4 *)((__global char *)src1 + src1_index + 24)); - - short4 src2_data_0 = (short4)(src2.s0, src2.s1, src2.s2, src2.s3); - short4 src2_data_1 = (short4)(src2.s4, src2.s5, src2.s6, src2.s7); - short4 src2_data_2 = (short4)(src2.s8, src2.s9, src2.sa, src2.sb); - short4 src2_data_3 = (short4)(src2.sc, src2.sd, src2.se, src2.sf); - - short4 tmp_data_0 = src1_data_0 OP_BINARY src2_data_0; - short4 tmp_data_1 = src1_data_1 OP_BINARY src2_data_1; - short4 tmp_data_2 = src1_data_2 OP_BINARY src2_data_2; - short4 tmp_data_3 = src1_data_3 OP_BINARY src2_data_3; - - *((__global short4 *)((__global char *)dst + dst_index + 0 ))= tmp_data_0; - *((__global short4 *)((__global char *)dst + dst_index + 8 ))= tmp_data_1; - *((__global short4 *)((__global char *)dst + dst_index + 16))= tmp_data_2; - *((__global short4 *)((__global char *)dst + dst_index + 24))= tmp_data_3; - - } -} -#endif diff --git a/modules/ocl/src/opencl/arithm_bitwise_binary_scalar_mask.cl b/modules/ocl/src/opencl/arithm_bitwise_binary_scalar_mask.cl index 9af6589ad..bae1699a3 100644 --- a/modules/ocl/src/opencl/arithm_bitwise_binary_scalar_mask.cl +++ b/modules/ocl/src/opencl/arithm_bitwise_binary_scalar_mask.cl @@ -42,6 +42,7 @@ // the use of this software, even if advised of the possibility of such damage. 
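The masked scalar kernels in arithm_bitwise_binary_scalar_mask.cl are collapsed the same way below: the replacement kernel reads one mask byte per element (x / elemSize) and only then applies the byte-wise operation. For orientation only, a minimal CPU sketch under the same assumptions as above (cols in bytes, one mask byte per element, hypothetical names):

/* Illustrative CPU sketch of the masked byte-wise scalar kernel's indexing.
 * Same assumptions as the previous sketch; mask holds one byte per element. */
#include <stddef.h>

#define Operation &   /* stand-in for the build-time operator */

static void bitwise_binary_scalar_mask_ref(const unsigned char *src1, size_t src1_step, size_t src1_offset,
                                           const unsigned char *src2, size_t elemSize,
                                           const unsigned char *mask, size_t mask_step, size_t mask_offset,
                                           unsigned char *dst, size_t dst_step, size_t dst_offset,
                                           size_t cols, size_t rows)
{
    for (size_t y = 0; y < rows; ++y)
        for (size_t x = 0; x < cols; ++x)
        {
            size_t mask_index = y * mask_step + mask_offset + x / elemSize; /* one mask byte per element */
            if (mask[mask_index])
            {
                size_t src1_index = y * src1_step + src1_offset + x;
                size_t src2_index = x % elemSize;   /* packed scalar repeats per element */
                size_t dst_index  = y * dst_step + dst_offset + x;

                dst[dst_index] = src1[src1_index] Operation src2[src2_index];
            }
        }
}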
// //M*/ + #if defined (DOUBLE_SUPPORT) #ifdef cl_khr_fp64 #pragma OPENCL EXTENSION cl_khr_fp64:enable @@ -50,698 +51,29 @@ #endif #endif -#ifndef OP_BINARY -#define OP_BINARY & -#endif - ////////////////////////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////bitwise_binary//////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////////////////////// -/**************************************bitwise_binary with scalar with mask**************************************/ -__kernel void arithm_s_bitwise_binary_with_mask_C1_D0 ( - __global uchar *src1, int src1_step, int src1_offset, - __global uchar *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - uchar4 src2, int rows, int cols, int dst_step1) -{ - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 2; - -#ifdef dst_align -#undef dst_align -#endif -#define dst_align (dst_offset & 3) - int src1_index = mad24(y, src1_step, x + src1_offset - dst_align); - int mask_index = mad24(y, mask_step, x + mask_offset - dst_align); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc); - - uchar4 src1_data = vload4(0, src1 + src1_index); - uchar4 src2_data = (uchar4)(src2.x, src2.x, src2.x, src2.x); - uchar4 mask_data = vload4(0, mask + mask_index); - - uchar4 data = *((__global uchar4 *)(dst + dst_index)); - uchar4 tmp_data = src1_data OP_BINARY src2_data; - - data.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : data.x; - data.y = ((mask_data.y) && (dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : data.y; - data.z = ((mask_data.z) && (dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : data.z; - data.w = ((mask_data.w) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? tmp_data.w : data.w; - - *((__global uchar4 *)(dst + dst_index)) = data; - } -} - - -__kernel void arithm_s_bitwise_binary_with_mask_C1_D1 ( - __global char *src1, int src1_step, int src1_offset, - __global char *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - char4 src2, int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 2; - -#ifdef dst_align -#undef dst_align -#endif -#define dst_align (dst_offset & 3) - int src1_index = mad24(y, src1_step, x + src1_offset - dst_align); - int mask_index = mad24(y, mask_step, x + mask_offset - dst_align); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc); - - char4 src1_data = vload4(0, src1 + src1_index); - char4 src2_data = (char4)(src2.x, src2.x, src2.x, src2.x); - uchar4 mask_data = vload4(0, mask + mask_index); - - char4 data = *((__global char4 *)(dst + dst_index)); - char4 tmp_data = src1_data OP_BINARY src2_data; - - data.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : data.x; - data.y = ((mask_data.y) && (dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : data.y; - data.z = ((mask_data.z) && (dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? 
tmp_data.z : data.z; - data.w = ((mask_data.w) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? tmp_data.w : data.w; - - *((__global char4 *)(dst + dst_index)) = data; - } -} - -__kernel void arithm_s_bitwise_binary_with_mask_C1_D2 ( - __global ushort *src1, int src1_step, int src1_offset, - __global ushort *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - ushort4 src2, int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 1; - -#ifdef dst_align -#undef dst_align -#endif -#define dst_align ((dst_offset / 2) & 1) - int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); - int mask_index = mad24(y, mask_step, x + mask_offset - dst_align); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffffc); - - ushort2 src1_data = vload2(0, (__global ushort *)((__global char *)src1 + src1_index)); - ushort2 src2_data = (ushort2)(src2.x, src2.x); - uchar2 mask_data = vload2(0, mask + mask_index); - - ushort2 data = *((__global ushort2 *)((__global uchar *)dst + dst_index)); - ushort2 tmp_data = src1_data OP_BINARY src2_data; - - data.x = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data.x : data.x; - data.y = ((mask_data.y) && (dst_index + 2 < dst_end )) ? tmp_data.y : data.y; - - *((__global ushort2 *)((__global uchar *)dst + dst_index)) = data; - } -} -__kernel void arithm_s_bitwise_binary_with_mask_C1_D3 ( - __global short *src1, int src1_step, int src1_offset, - __global short *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - short4 src2, int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 1; - -#ifdef dst_align -#undef dst_align -#endif -#define dst_align ((dst_offset / 2) & 1) - int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); - int mask_index = mad24(y, mask_step, x + mask_offset - dst_align); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffffc); - - short2 src1_data = vload2(0, (__global short *)((__global char *)src1 + src1_index)); - short2 src2_data = (short2)(src2.x, src2.x); - uchar2 mask_data = vload2(0, mask + mask_index); - - short2 data = *((__global short2 *)((__global uchar *)dst + dst_index)); - short2 tmp_data = src1_data OP_BINARY src2_data; - - data.x = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data.x : data.x; - data.y = ((mask_data.y) && (dst_index + 2 < dst_end )) ? 
tmp_data.y : data.y; - - *((__global short2 *)((__global uchar *)dst + dst_index)) = data; - } -} -__kernel void arithm_s_bitwise_binary_with_mask_C1_D4 ( - __global int *src1, int src1_step, int src1_offset, - __global int *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - int4 src2, int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 2) + src1_offset); - int mask_index = mad24(y, mask_step, x + mask_offset); - int dst_index = mad24(y, dst_step, (x << 2) + dst_offset); - - uchar mask_data = *(mask + mask_index); - - int src_data1 = *((__global int *)((__global char *)src1 + src1_index)); - int src_data2 = src2.x; - int dst_data = *((__global int *)((__global char *)dst + dst_index)); - - int data = src_data1 OP_BINARY src_data2; - data = mask_data ? data : dst_data; - - *((__global int *)((__global char *)dst + dst_index)) = data; - } -} - -__kernel void arithm_s_bitwise_binary_with_mask_C1_D5 ( - __global char *src1, int src1_step, int src1_offset, - __global char *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - char16 src2, int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 2) + src1_offset); - int mask_index = mad24(y, mask_step, x + mask_offset); - int dst_index = mad24(y, dst_step, (x << 2) + dst_offset); - - uchar mask_data = *(mask + mask_index); - - char4 src1_data = *((__global char4 *)((__global char *)src1 + src1_index)); - char4 src2_data = (char4)(src2.s0, src2.s1, src2.s2, src2.s3); - char4 dst_data = *((__global char4 *)((__global char *)dst + dst_index)); - - char4 data = src1_data OP_BINARY src2_data; - data = mask_data ? data : dst_data; - - *((__global char4 *)((__global char *)dst + dst_index)) = data; - } -} - -#if defined (DOUBLE_SUPPORT) -__kernel void arithm_s_bitwise_binary_with_mask_C1_D6 ( - __global short *src1, int src1_step, int src1_offset, - __global short *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - short16 src2, int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 3) + src1_offset); - int mask_index = mad24(y, mask_step, x + mask_offset); - int dst_index = mad24(y, dst_step, (x << 3) + dst_offset); - - uchar mask_data = *(mask + mask_index); - - short4 src1_data = *((__global short4 *)((__global char *)src1 + src1_index)); - short4 src2_data = (short4)(src2.s0, src2.s1, src2.s2, src2.s3); - short4 dst_data = *((__global short4 *)((__global char *)dst + dst_index)); - - short4 data = src1_data OP_BINARY src2_data; - data = mask_data ? 
data : dst_data; - - *((__global short4 *)((__global char *)dst + dst_index)) = data; - } -} -#endif -__kernel void arithm_s_bitwise_binary_with_mask_C2_D0 ( - __global uchar *src1, int src1_step, int src1_offset, - __global uchar *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - uchar4 src2, int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 1; - -#ifdef dst_align -#undef dst_align -#endif -#define dst_align ((dst_offset / 2) & 1) - int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); - int mask_index = mad24(y, mask_step, x + mask_offset - dst_align); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffffc); - - uchar4 src1_data = vload4(0, src1 + src1_index); - uchar4 src2_data = (uchar4)(src2.x, src2.y, src2.x, src2.y); - uchar2 mask_data = vload2(0, mask + mask_index); - - uchar4 data = *((__global uchar4 *)(dst + dst_index)); - uchar4 tmp_data = src1_data OP_BINARY src2_data; - - data.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data.xy : data.xy; - data.zw = ((mask_data.y) && (dst_index + 2 < dst_end )) ? tmp_data.zw : data.zw; - - *((__global uchar4 *)(dst + dst_index)) = data; - } -} - - -__kernel void arithm_s_bitwise_binary_with_mask_C2_D1 ( - __global char *src1, int src1_step, int src1_offset, - __global char *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - char4 src2, int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 1; - -#ifdef dst_align -#undef dst_align -#endif -#define dst_align ((dst_offset / 2) & 1) - int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); - int mask_index = mad24(y, mask_step, x + mask_offset - dst_align); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffffc); - - char4 src1_data = vload4(0, src1 + src1_index); - char4 src2_data = (char4)(src2.x, src2.y, src2.x, src2.y); - uchar2 mask_data = vload2(0, mask + mask_index); - - char4 data = *((__global char4 *)(dst + dst_index)); - char4 tmp_data = src1_data OP_BINARY src2_data; - - data.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data.xy : data.xy; - data.zw = ((mask_data.y) && (dst_index + 2 < dst_end )) ? 
tmp_data.zw : data.zw; - - *((__global char4 *)(dst + dst_index)) = data; - } -} - -__kernel void arithm_s_bitwise_binary_with_mask_C2_D2 ( - __global ushort *src1, int src1_step, int src1_offset, - __global ushort *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - ushort4 src2, int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 2) + src1_offset); - int mask_index = mad24(y, mask_step, x + mask_offset); - int dst_index = mad24(y, dst_step, (x << 2) + dst_offset); - - uchar mask_data = *(mask + mask_index); - - ushort2 src_data1 = *((__global ushort2 *)((__global char *)src1 + src1_index)); - ushort2 src_data2 = (ushort2)(src2.x, src2.y); - ushort2 dst_data = *((__global ushort2 *)((__global char *)dst + dst_index)); - - ushort2 data = src_data1 OP_BINARY src_data2; - data = mask_data ? data : dst_data; - - *((__global ushort2 *)((__global char *)dst + dst_index)) = data; - } -} -__kernel void arithm_s_bitwise_binary_with_mask_C2_D3 ( - __global short *src1, int src1_step, int src1_offset, - __global short *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - short4 src2, int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 2) + src1_offset); - int mask_index = mad24(y, mask_step, x + mask_offset); - int dst_index = mad24(y, dst_step, (x << 2) + dst_offset); - - uchar mask_data = *(mask + mask_index); - - short2 src_data1 = *((__global short2 *)((__global char *)src1 + src1_index)); - short2 src_data2 = (short2)(src2.x, src2.y); - short2 dst_data = *((__global short2 *)((__global char *)dst + dst_index)); - - short2 data = src_data1 OP_BINARY src_data2; - data = mask_data ? data : dst_data; - - *((__global short2 *)((__global char *)dst + dst_index)) = data; - } -} -__kernel void arithm_s_bitwise_binary_with_mask_C2_D4 ( - __global int *src1, int src1_step, int src1_offset, - __global int *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - int4 src2, int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 3) + src1_offset); - int mask_index = mad24(y, mask_step, x + mask_offset); - int dst_index = mad24(y, dst_step, (x << 3) + dst_offset); - - uchar mask_data = *(mask + mask_index); - - int2 src_data1 = *((__global int2 *)((__global char *)src1 + src1_index)); - int2 src_data2 = (int2)(src2.x, src2.y); - int2 dst_data = *((__global int2 *)((__global char *)dst + dst_index)); - - int2 data = src_data1 OP_BINARY src_data2; - data = mask_data ? 
data : dst_data; - - *((__global int2 *)((__global char *)dst + dst_index)) = data; - } -} -__kernel void arithm_s_bitwise_binary_with_mask_C2_D5 ( - __global char *src1, int src1_step, int src1_offset, - __global char *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - char16 src2, int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 3) + src1_offset); - int mask_index = mad24(y, mask_step, x + mask_offset); - int dst_index = mad24(y, dst_step, (x << 3) + dst_offset); - - uchar mask_data = *(mask + mask_index); - - char8 src1_data = *((__global char8 *)((__global char *)src1 + src1_index)); - char8 src2_data = (char8)(src2.s0, src2.s1, src2.s2, src2.s3, src2.s4, src2.s5, src2.s6, src2.s7); - char8 dst_data = *((__global char8 *)((__global char *)dst + dst_index)); - - char8 data = src1_data OP_BINARY src2_data; - - data = mask_data ? data : dst_data; - - *((__global char8 *)((__global char *)dst + dst_index)) = data; - } -} -#if defined (DOUBLE_SUPPORT) -__kernel void arithm_s_bitwise_binary_with_mask_C2_D6 ( - __global short *src1, int src1_step, int src1_offset, - __global short *dst, int dst_step, int dst_offset, +__kernel void arithm_bitwise_binary_scalar_mask(__global uchar *src1, int src1_step, int src1_offset, + __global uchar *src2, int elemSize, __global uchar *mask, int mask_step, int mask_offset, - short16 src2, int rows, int cols, int dst_step1) + __global uchar *dst, int dst_step, int dst_offset, + int cols, int rows) { - int x = get_global_id(0); int y = get_global_id(1); if (x < cols && y < rows) { - int src1_index = mad24(y, src1_step, (x << 4) + src1_offset); - int mask_index = mad24(y, mask_step, x + mask_offset); - int dst_index = mad24(y, dst_step, (x << 4) + dst_offset); + int mask_index = mad24(y, mask_step, (x / elemSize) + mask_offset); + if (mask[mask_index]) + { + int src1_index = mad24(y, src1_step, x + src1_offset); + int src2_index = x % elemSize; + int dst_index = mad24(y, dst_step, x + dst_offset); - uchar mask_data = *(mask + mask_index); - - short8 src1_data = *((__global short8 *)((__global char *)src1 + src1_index)); - short8 src2_data = (short8)(src2.s0, src2.s1, src2.s2, src2.s3, src2.s4, src2.s5, src2.s6, src2.s7); - short8 dst_data = *((__global short8 *)((__global char *)dst + dst_index)); - - short8 data = src1_data OP_BINARY src2_data; - data = mask_data ? data : dst_data; - - *((__global short8 *)((__global char *)dst + dst_index)) = data; + dst[dst_index] = src1[src1_index] Operation src2[src2_index]; + } } } -#endif -__kernel void arithm_s_bitwise_binary_with_mask_C4_D0 ( - __global uchar *src1, int src1_step, int src1_offset, - __global uchar *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - uchar4 src2, int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 2) + src1_offset); - int mask_index = mad24(y, mask_step, x + mask_offset); - int dst_index = mad24(y, dst_step, (x << 2) + dst_offset); - - uchar mask_data = *(mask + mask_index); - - uchar4 src_data1 = *((__global uchar4 *)(src1 + src1_index)); - uchar4 dst_data = *((__global uchar4 *)(dst + dst_index)); - - uchar4 data = src_data1 OP_BINARY src2; - data = mask_data ? 
data : dst_data; - - *((__global uchar4 *)(dst + dst_index)) = data; - } -} - - -__kernel void arithm_s_bitwise_binary_with_mask_C4_D1 ( - __global char *src1, int src1_step, int src1_offset, - __global char *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - char4 src2, int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 2) + src1_offset); - int mask_index = mad24(y, mask_step, x + mask_offset); - int dst_index = mad24(y, dst_step, (x << 2) + dst_offset); - - uchar mask_data = *(mask + mask_index); - - char4 src_data1 = *((__global char4 *)(src1 + src1_index)); - char4 dst_data = *((__global char4 *)(dst + dst_index)); - - char4 data = src_data1 OP_BINARY src2; - data = mask_data ? data : dst_data; - - *((__global char4 *)(dst + dst_index)) = data; - } -} - -__kernel void arithm_s_bitwise_binary_with_mask_C4_D2 ( - __global ushort *src1, int src1_step, int src1_offset, - __global ushort *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - ushort4 src2, int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 3) + src1_offset); - int mask_index = mad24(y, mask_step, x + mask_offset); - int dst_index = mad24(y, dst_step, (x << 3) + dst_offset); - - uchar mask_data = *(mask + mask_index); - - ushort4 src_data1 = *((__global ushort4 *)((__global char *)src1 + src1_index)); - ushort4 dst_data = *((__global ushort4 *)((__global char *)dst + dst_index)); - - ushort4 data = src_data1 OP_BINARY src2; - data = mask_data ? data : dst_data; - - *((__global ushort4 *)((__global char *)dst + dst_index)) = data; - } -} -__kernel void arithm_s_bitwise_binary_with_mask_C4_D3 ( - __global short *src1, int src1_step, int src1_offset, - __global short *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - short4 src2, int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 3) + src1_offset); - int mask_index = mad24(y, mask_step, x + mask_offset); - int dst_index = mad24(y, dst_step, (x << 3) + dst_offset); - - uchar mask_data = *(mask + mask_index); - - short4 src_data1 = *((__global short4 *)((__global char *)src1 + src1_index)); - short4 dst_data = *((__global short4 *)((__global char *)dst + dst_index)); - - short4 data = src_data1 OP_BINARY src2; - data = mask_data ? 
data : dst_data; - - *((__global short4 *)((__global char *)dst + dst_index)) = data; - } -} -__kernel void arithm_s_bitwise_binary_with_mask_C4_D4 ( - __global int *src1, int src1_step, int src1_offset, - __global int *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - int4 src2, int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 4) + src1_offset); - int mask_index = mad24(y, mask_step, x + mask_offset); - int dst_index = mad24(y, dst_step, (x << 4) + dst_offset); - - uchar mask_data = *(mask + mask_index); - - int4 src_data1 = *((__global int4 *)((__global char *)src1 + src1_index)); - int4 dst_data = *((__global int4 *)((__global char *)dst + dst_index)); - - int4 data = src_data1 OP_BINARY src2; - data = mask_data ? data : dst_data; - - *((__global int4 *)((__global char *)dst + dst_index)) = data; - } -} -__kernel void arithm_s_bitwise_binary_with_mask_C4_D5 ( - __global char *src1, int src1_step, int src1_offset, - __global char *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - char16 src2, int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 4) + src1_offset); - int mask_index = mad24(y, mask_step, x + mask_offset); - int dst_index = mad24(y, dst_step, (x << 4) + dst_offset); - - uchar mask_data = *(mask + mask_index); - - char16 src1_data = *((__global char16 *)((__global char *)src1 + src1_index)); - char16 src2_data = (char16)(src2.s0, src2.s1, src2.s2, src2.s3, src2.s4, src2.s5, src2.s6, src2.s7, - src2.s8, src2.s9, src2.sa, src2.sb, src2.sc, src2.sd, src2.se, src2.sf); - char16 dst_data = *((__global char16 *)((__global char *)dst + dst_index)); - - char16 data = src1_data OP_BINARY src2_data; - data = mask_data ? 
data : dst_data; - - *((__global char16 *)((__global char *)dst + dst_index)) = data; - } -} -#if defined (DOUBLE_SUPPORT) -__kernel void arithm_s_bitwise_binary_with_mask_C4_D6 ( - __global short *src1, int src1_step, int src1_offset, - __global short *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - short16 src2, int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 5) + src1_offset); - int mask_index = mad24(y, mask_step, x + mask_offset); - int dst_index = mad24(y, dst_step, (x << 5) + dst_offset); - - uchar mask_data = *(mask + mask_index); - - short4 src1_data_0 = *((__global short4 *)((__global char *)src1 + src1_index + 0)); - short4 src1_data_1 = *((__global short4 *)((__global char *)src1 + src1_index + 8)); - short4 src1_data_2 = *((__global short4 *)((__global char *)src1 + src1_index + 16)); - short4 src1_data_3 = *((__global short4 *)((__global char *)src1 + src1_index + 24)); - - short4 src2_data_0 = (short4)(src2.s0, src2.s1, src2.s2, src2.s3); - short4 src2_data_1 = (short4)(src2.s4, src2.s5, src2.s6, src2.s7); - short4 src2_data_2 = (short4)(src2.s8, src2.s9, src2.sa, src2.sb); - short4 src2_data_3 = (short4)(src2.sc, src2.sd, src2.se, src2.sf); - - short4 dst_data_0 = *((__global short4 *)((__global char *)dst + dst_index + 0)); - short4 dst_data_1 = *((__global short4 *)((__global char *)dst + dst_index + 8)); - short4 dst_data_2 = *((__global short4 *)((__global char *)dst + dst_index + 16)); - short4 dst_data_3 = *((__global short4 *)((__global char *)dst + dst_index + 24)); - - short4 data_0 = src1_data_0 OP_BINARY src2_data_0; - short4 data_1 = src1_data_1 OP_BINARY src2_data_1; - short4 data_2 = src1_data_2 OP_BINARY src2_data_2; - short4 data_3 = src1_data_3 OP_BINARY src2_data_3; - - data_0 = mask_data ? data_0 : dst_data_0; - data_1 = mask_data ? data_1 : dst_data_1; - data_2 = mask_data ? data_2 : dst_data_2; - data_3 = mask_data ? 
data_3 : dst_data_3; - - *((__global short4 *)((__global char *)dst + dst_index + 0)) = data_0; - *((__global short4 *)((__global char *)dst + dst_index + 8)) = data_1; - *((__global short4 *)((__global char *)dst + dst_index + 16)) = data_2; - *((__global short4 *)((__global char *)dst + dst_index + 24)) = data_3; - } -} -#endif From 073096357657ceb251303dd5162f4a7eecf48846 Mon Sep 17 00:00:00 2001 From: Ilya Lavrenov Date: Tue, 24 Sep 2013 13:58:18 +0400 Subject: [PATCH 09/17] refactored and extended ocl::compare --- modules/ocl/src/arithm.cpp | 94 +- modules/ocl/src/opencl/arithm_compare.cl | 74 ++ modules/ocl/src/opencl/arithm_compare_eq.cl | 1016 ------------------- modules/ocl/src/opencl/arithm_compare_ne.cl | 1013 ------------------ 4 files changed, 110 insertions(+), 2087 deletions(-) create mode 100644 modules/ocl/src/opencl/arithm_compare.cl delete mode 100644 modules/ocl/src/opencl/arithm_compare_eq.cl delete mode 100644 modules/ocl/src/opencl/arithm_compare_ne.cl diff --git a/modules/ocl/src/arithm.cpp b/modules/ocl/src/arithm.cpp index 4f6737e82..1effac213 100644 --- a/modules/ocl/src/arithm.cpp +++ b/modules/ocl/src/arithm.cpp @@ -82,8 +82,7 @@ namespace cv extern const char *arithm_bitwise_binary_scalar; extern const char *arithm_bitwise_binary_scalar_mask; extern const char *arithm_bitwise_not; - extern const char *arithm_compare_eq; - extern const char *arithm_compare_ne; + extern const char *arithm_compare; extern const char *arithm_transpose; extern const char *arithm_flip; extern const char *arithm_flip_rc; @@ -268,76 +267,55 @@ void cv::ocl::absdiff(const oclMat &src1, const Scalar &src2, oclMat &dst) ////////////////////////////////////////////////////////////////////////////// ///////////////////////////////// compare /////////////////////////////////// ////////////////////////////////////////////////////////////////////////////// -static void compare_run(const oclMat &src1, const oclMat &src2, oclMat &dst, string kernelName, const char **kernelString) -{ - dst.create(src1.size(), CV_8UC1); - CV_Assert(src1.oclchannels() == 1); - CV_Assert(src1.type() == src2.type()); - Context *clCxt = src1.clCxt; - int depth = src1.depth(); - int vector_lengths[7] = {4, 0, 4, 4, 4, 4, 4}; - size_t vector_length = vector_lengths[depth]; - int offset_cols = (dst.offset / dst.elemSize1()) & (vector_length - 1); - int cols = divUp(dst.cols + offset_cols, vector_length); - size_t localThreads[3] = { 64, 4, 1 }; - size_t globalThreads[3] = { cols, dst.rows, 1 }; - int dst_step1 = dst.cols * dst.elemSize(); +static void compare_run(const oclMat &src1, const oclMat &src2, oclMat &dst, int cmpOp, + string kernelName, const char **kernelString) +{ + CV_Assert(src1.type() == src2.type()); + dst.create(src1.size(), CV_8UC1); + Context *clCxt = src1.clCxt; + + int depth = src1.depth(); + size_t localThreads[3] = { 64, 4, 1 }; + size_t globalThreads[3] = { dst.cols, dst.rows, 1 }; + + int src1step1 = src1.step1(), src1offset1 = src1.offset / src1.elemSize1(); + int src2step1 = src2.step1(), src2offset1 = src2.offset / src2.elemSize1(); + int dststep1 = dst.step1(), dstoffset1 = dst.offset / dst.elemSize1(); + + const char * const typeMap[] = { "uchar", "char", "ushort", "short", "int", "float", "double" }; + const char * operationMap[] = { "==", ">", ">=", "<", "<=", "!=" }; + std::string buildOptions = format("-D T=%s -D Operation=%s", typeMap[depth], operationMap[cmpOp]); + vector > args; args.push_back( make_pair( sizeof(cl_mem), (void *)&src1.data )); - args.push_back( make_pair( 
sizeof(cl_int), (void *)&src1.step )); - args.push_back( make_pair( sizeof(cl_int), (void *)&src1.offset )); + args.push_back( make_pair( sizeof(cl_int), (void *)&src1step1 )); + args.push_back( make_pair( sizeof(cl_int), (void *)&src1offset1 )); args.push_back( make_pair( sizeof(cl_mem), (void *)&src2.data )); - args.push_back( make_pair( sizeof(cl_int), (void *)&src2.step )); - args.push_back( make_pair( sizeof(cl_int), (void *)&src2.offset )); + args.push_back( make_pair( sizeof(cl_int), (void *)&src2step1 )); + args.push_back( make_pair( sizeof(cl_int), (void *)&src2offset1 )); args.push_back( make_pair( sizeof(cl_mem), (void *)&dst.data )); - args.push_back( make_pair( sizeof(cl_int), (void *)&dst.step )); - args.push_back( make_pair( sizeof(cl_int), (void *)&dst.offset )); + args.push_back( make_pair( sizeof(cl_int), (void *)&dststep1 )); + args.push_back( make_pair( sizeof(cl_int), (void *)&dstoffset1 )); + args.push_back( make_pair( sizeof(cl_int), (void *)&src1.cols )); args.push_back( make_pair( sizeof(cl_int), (void *)&src1.rows )); - args.push_back( make_pair( sizeof(cl_int), (void *)&cols )); - args.push_back( make_pair( sizeof(cl_int), (void *)&dst_step1 )); - openCLExecuteKernel(clCxt, kernelString, kernelName, globalThreads, localThreads, args, -1, depth); + + openCLExecuteKernel(clCxt, kernelString, kernelName, globalThreads, localThreads, + args, -1, -1, buildOptions.c_str()); } void cv::ocl::compare(const oclMat &src1, const oclMat &src2, oclMat &dst , int cmpOp) { - if(!src1.clCxt->supportsFeature(Context::CL_DOUBLE) && src1.type() == CV_64F) + if (!src1.clCxt->supportsFeature(Context::CL_DOUBLE) && src1.depth() == CV_64F) { cout << "Selected device do not support double" << endl; return; } - string kernelName; - const char **kernelString = NULL; - switch( cmpOp ) - { - case CMP_EQ: - kernelName = "arithm_compare_eq"; - kernelString = &arithm_compare_eq; - break; - case CMP_GT: - kernelName = "arithm_compare_gt"; - kernelString = &arithm_compare_eq; - break; - case CMP_GE: - kernelName = "arithm_compare_ge"; - kernelString = &arithm_compare_eq; - break; - case CMP_NE: - kernelName = "arithm_compare_ne"; - kernelString = &arithm_compare_ne; - break; - case CMP_LT: - kernelName = "arithm_compare_lt"; - kernelString = &arithm_compare_ne; - break; - case CMP_LE: - kernelName = "arithm_compare_le"; - kernelString = &arithm_compare_ne; - break; - default: - CV_Error(CV_StsBadArg, "Unknown comparison method"); - } - compare_run(src1, src2, dst, kernelName, kernelString); + + CV_Assert(src1.channels() == 1 && src2.channels() == 1); + CV_Assert(cmpOp >= CMP_EQ && cmpOp <= CMP_NE); + + compare_run(src1, src2, dst, cmpOp, "arithm_compare", &arithm_compare); } ////////////////////////////////////////////////////////////////////////////// diff --git a/modules/ocl/src/opencl/arithm_compare.cl b/modules/ocl/src/opencl/arithm_compare.cl new file mode 100644 index 000000000..d0842db18 --- /dev/null +++ b/modules/ocl/src/opencl/arithm_compare.cl @@ -0,0 +1,74 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved. 
+// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// @Authors
+//    Jia Haipeng, jiahaipeng95@gmail.com
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other oclMaterials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors as is and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#if defined (DOUBLE_SUPPORT)
+#ifdef cl_khr_fp64
+#pragma OPENCL EXTENSION cl_khr_fp64:enable
+#elif defined (cl_amd_fp64)
+#pragma OPENCL EXTENSION cl_amd_fp64:enable
+#endif
+#endif
+
+//////////////////////////////////////////////////////////////////////////////////////////////////////
+///////////////////////////////////////////////compare/////////////////////////////////////////////////
+//////////////////////////////////////////////////////////////////////////////////////////////////////
+
+__kernel void arithm_compare(__global T * src1, int src1_step1, int src1_offset1,
+                             __global T * src2, int src2_step1, int src2_offset1,
+                             __global uchar * dst, int dst_step1, int dst_offset1,
+                             int cols1, int rows)
+{
+    int x = get_global_id(0);
+    int y = get_global_id(1);
+
+    if (x < cols1 && y < rows)
+    {
+        int src1_index = mad24(y, src1_step1, x + src1_offset1);
+        int src2_index = mad24(y, src2_step1, x + src2_offset1);
+        int dst_index = mad24(y, dst_step1, x + dst_offset1);
+
+        dst[dst_index] = convert_uchar(src1[src1_index] Operation src2[src2_index] ? 255 : 0);
+    }
+}
diff --git a/modules/ocl/src/opencl/arithm_compare_eq.cl b/modules/ocl/src/opencl/arithm_compare_eq.cl
deleted file mode 100644
index 16a56acef..000000000
--- a/modules/ocl/src/opencl/arithm_compare_eq.cl
+++ /dev/null
@@ -1,1016 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-// -// -// License Agreement -// For Open Source Computer Vision Library -// -// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved. -// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved. -// Third party copyrights are property of their respective owners. -// -// @Authors -// Jiang Liyuan, jlyuan001.good@163.com -// -// Redistribution and use in source and binary forms, with or without modification, -// are permitted provided that the following conditions are met: -// -// * Redistribution's of source code must retain the above copyright notice, -// this list of conditions and the following disclaimer. -// -// * Redistribution's in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other oclMaterials provided with the distribution. -// -// * The name of the copyright holders may not be used to endorse or promote products -// derived from this software without specific prior written permission. -// -// This software is provided by the copyright holders and contributors as is and -// any express or implied warranties, including, but not limited to, the implied -// warranties of merchantability and fitness for a particular purpose are disclaimed. -// In no event shall the Intel Corporation or contributors be liable for any direct, -// indirect, incidental, special, exemplary, or consequential damages -// (including, but not limited to, procurement of substitute goods or services; -// loss of use, data, or profits; or business interruption) however caused -// and on any theory of liability, whether in contract, strict liability, -// or tort (including negligence or otherwise) arising in any way out of -// the use of this software, even if advised of the possibility of such damage. -// -//M*/ -#if defined (DOUBLE_SUPPORT) -#ifdef cl_khr_fp64 -#pragma OPENCL EXTENSION cl_khr_fp64:enable -#elif defined (cl_amd_fp64) -#pragma OPENCL EXTENSION cl_amd_fp64:enable -#endif -#endif - -////////////////////////////////////////////////////////////////////////////////////////////////////// -////////////////////////////////////////////Compare EQ//////////////////////////////////////////////////// -/////////////////////////////////////////////////////////////////////////////////////////////////////// - -__kernel void arithm_compare_eq_D0 (__global uchar *src1, int src1_step, int src1_offset, - __global uchar *src2, int src2_step, int src2_offset, - __global uchar *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 2; - -#ifdef dst_align -#undef dst_align -#endif -#define dst_align (dst_offset & 3) - int src1_index = mad24(y, src1_step, x + src1_offset - dst_align); - int src2_index = mad24(y, src2_step, x + src2_offset - dst_align); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc); - int src1_index_fix = src1_index < 0 ? 0 : src1_index; - int src2_index_fix = src2_index < 0 ? 0 : src2_index; - uchar4 src1_data = vload4(0, src1 + src1_index_fix); - uchar4 src2_data = vload4(0, src2 + src2_index_fix); - if(src1_index < 0) - { - uchar4 tmp; - tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx; - src1_data.xyzw = (src1_index == -1) ? 
src1_data.wxyz:tmp.xyzw; - } - if(src2_index < 0) - { - uchar4 tmp; - tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx; - src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw; - } - - - - uchar4 dst_data = *((__global uchar4 *)(dst + dst_index)); - uchar4 tmp_data = convert_uchar4((src1_data == src2_data)); - - dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x; - dst_data.y = ((dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : dst_data.y; - dst_data.z = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : dst_data.z; - dst_data.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? tmp_data.w : dst_data.w; - - *((__global uchar4 *)(dst + dst_index)) = dst_data; - } -} - - -__kernel void arithm_compare_ne_D2 (__global ushort *src1, int src1_step, int src1_offset, - __global ushort *src2, int src2_step, int src2_offset, - __global uchar *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) - -{ - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 2; - -#ifdef dst_align -#undef dst_align -#endif -#define dst_align ((dst_offset >> 1)& 3) - int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); - int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1)); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc); - - int src1_index_fix = src1_index < 0 ? 0 : src1_index; - int src2_index_fix = src2_index < 0 ? 0 : src2_index; - ushort4 src1_data = vload4(0, (__global ushort *)((__global char *)src1 + src1_index)); - ushort4 src2_data = vload4(0, (__global ushort *)((__global char *)src2 + src2_index)); - if(src1_index < 0) - { - ushort4 tmp; - tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx; - src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw; - } - if(src2_index < 0) - { - ushort4 tmp; - tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx; - src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw; - } - - uchar4 dst_data = *((__global uchar4 *)(dst + dst_index)); - uchar4 tmp_data = convert_uchar4((src1_data == src2_data)); - - dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x; - dst_data.y = ((dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : dst_data.y; - dst_data.z = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : dst_data.z; - dst_data.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? 
tmp_data.w : dst_data.w; - - *((__global uchar4 *)(dst + dst_index)) = dst_data; - } -} - - -__kernel void arithm_compare_eq_D3 (__global short *src1, int src1_step, int src1_offset, - __global short *src2, int src2_step, int src2_offset, - __global uchar *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) - -{ - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 2; - -#ifdef dst_align -#undef dst_align -#endif -#define dst_align ((dst_offset >> 1) & 3) - int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); - int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1)); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc); - int src1_index_fix = src1_index < 0 ? 0 : src1_index; - int src2_index_fix = src2_index < 0 ? 0 : src2_index; - short4 src1_data = vload4(0, (__global short *)((__global char *)src1 + src1_index)); - short4 src2_data = vload4(0, (__global short *)((__global char *)src2 + src2_index)); - if(src1_index < 0) - { - short4 tmp; - tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx; - src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw; - } - if(src2_index < 0) - { - short4 tmp; - tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx; - src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw; - } - - - - - uchar4 dst_data = *((__global uchar4 *)(dst + dst_index)); - uchar4 tmp_data = convert_uchar4((src1_data == src2_data)); - - dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x; - dst_data.y = ((dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : dst_data.y; - dst_data.z = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : dst_data.z; - dst_data.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? tmp_data.w : dst_data.w; - - *((__global uchar4 *)(dst + dst_index)) = dst_data; - } -} - - - -__kernel void arithm_compare_eq_D4 (__global int *src1, int src1_step, int src1_offset, - __global int *src2, int src2_step, int src2_offset, - __global uchar *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 2; -#ifdef dst_align -#undef dst_align -#endif -#define dst_align ((dst_offset >> 2) & 3) - int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2)); - int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2)); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc); - int src1_index_fix = src1_index < 0 ? 0 : src1_index; - int src2_index_fix = src2_index < 0 ? 0 : src2_index; - - int4 src1_data = vload4(0, (__global int *)((__global char *)src1 + src1_index)); - int4 src2_data = vload4(0, (__global int *)((__global char *)src2 + src2_index)); - if(src1_index < 0) - { - int4 tmp; - tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx; - src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw; - } - if(src2_index < 0) - { - int4 tmp; - tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx; - src2_data.xyzw = (src2_index == -1) ? 
src2_data.wxyz:tmp.xyzw; - } - - uchar4 dst_data = *((__global uchar4 *)(dst + dst_index)); - uchar4 tmp_data = convert_uchar4((src1_data == src2_data)); - - dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x; - dst_data.y = ((dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : dst_data.y; - dst_data.z = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : dst_data.z; - dst_data.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? tmp_data.w : dst_data.w; - - *((__global uchar4 *)(dst + dst_index)) = dst_data; - } -} - -__kernel void arithm_compare_eq_D5 (__global float *src1, int src1_step, int src1_offset, - __global float *src2, int src2_step, int src2_offset, - __global uchar *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 2; -#ifdef dst_align -#undef dst_align -#endif -#define dst_align ((dst_offset >> 2) & 3) - int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2)); - int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2)); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc); - int src1_index_fix = src1_index < 0 ? 0 : src1_index; - int src2_index_fix = src2_index < 0 ? 0 : src2_index; - float4 src1_data = vload4(0, (__global float *)((__global char *)src1 + src1_index_fix)); - float4 src2_data = vload4(0, (__global float *)((__global char *)src2 + src2_index_fix)); - if(src2_index < 0) - { - float4 tmp; - tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx; - src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw; - } - - - uchar4 dst_data = *((__global uchar4 *)(dst + dst_index)); - uchar4 tmp_data = convert_uchar4((src1_data == src2_data)); - - dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x; - dst_data.y = ((dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : dst_data.y; - dst_data.z = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : dst_data.z; - dst_data.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? tmp_data.w : dst_data.w; - - *((__global uchar4 *)(dst + dst_index)) = dst_data; - } -} - -#if defined (DOUBLE_SUPPORT) -__kernel void arithm_compare_eq_D6 (__global double *src1, int src1_step, int src1_offset, - __global double *src2, int src2_step, int src2_offset, - __global uchar *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 2; -#ifdef dst_align -#undef dst_align -#endif -#define dst_align ((dst_offset >> 3) & 3) - int src1_index = mad24(y, src1_step, (x << 3) + src1_offset - (dst_align << 3)); - int src2_index = mad24(y, src2_step, (x << 3) + src2_offset - (dst_align << 3)); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc); - int src1_index_fix = src1_index < 0 ? 0 : src1_index; - int src2_index_fix = src2_index < 0 ? 
0 : src2_index; - double4 src1_data = vload4(0, (__global double *)((__global char *)src1 + src1_index_fix)); - double4 src2_data = vload4(0, (__global double *)((__global char *)src2 + src2_index_fix)); - if(src1_index < 0) - { - double4 tmp; - tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx; - src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw; - } - if(src2_index < 0) - { - double4 tmp; - tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx; - src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw; - } - - - uchar4 dst_data = *((__global uchar4 *)(dst + dst_index)); - uchar4 tmp_data = convert_uchar4((src1_data == src2_data)); - - dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x; - dst_data.y = ((dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : dst_data.y; - dst_data.z = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : dst_data.z; - dst_data.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? tmp_data.w : dst_data.w; - - *((__global uchar4 *)(dst + dst_index)) = dst_data; - } -} -#endif - -/***********************************Compare GT**************************/ -__kernel void arithm_compare_gt_D0 (__global uchar *src1, int src1_step, int src1_offset, - __global uchar *src2, int src2_step, int src2_offset, - __global uchar *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 2; - -#ifdef dst_align -#undef dst_align -#endif -#define dst_align (dst_offset & 3) - int src1_index = mad24(y, src1_step, x + src1_offset - dst_align); - int src2_index = mad24(y, src2_step, x + src2_offset - dst_align); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc); - int src1_index_fix = src1_index < 0 ? 0 : src1_index; - int src2_index_fix = src2_index < 0 ? 0 : src2_index; - uchar4 src1_data = vload4(0, src1 + src1_index_fix); - uchar4 src2_data = vload4(0, src2 + src2_index_fix); - if(src1_index < 0) - { - uchar4 tmp; - tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx; - src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw; - } - if(src2_index < 0) - { - uchar4 tmp; - tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx; - src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw; - } - - - - uchar4 dst_data = *((__global uchar4 *)(dst + dst_index)); - uchar4 tmp_data = convert_uchar4((src1_data > src2_data)); - - dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x; - dst_data.y = ((dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : dst_data.y; - dst_data.z = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : dst_data.z; - dst_data.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? 
tmp_data.w : dst_data.w; - - *((__global uchar4 *)(dst + dst_index)) = dst_data; - } -} - -__kernel void arithm_compare_gt_D2 (__global ushort *src1, int src1_step, int src1_offset, - __global ushort *src2, int src2_step, int src2_offset, - __global uchar *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) - -{ - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 2; - -#ifdef dst_align -#undef dst_align -#endif -#define dst_align ((dst_offset >> 1) & 3) - int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); - int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1)); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc); - int src1_index_fix = src1_index < 0 ? 0 : src1_index; - int src2_index_fix = src2_index < 0 ? 0 : src2_index; - ushort4 src1_data = vload4(0, (__global ushort *)((__global char *)src1 + src1_index)); - ushort4 src2_data = vload4(0, (__global ushort *)((__global char *)src2 + src2_index)); - if(src1_index < 0) - { - ushort4 tmp; - tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx; - src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw; - } - if(src2_index < 0) - { - ushort4 tmp; - tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx; - src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw; - } - - - - uchar4 dst_data = *((__global uchar4 *)(dst + dst_index)); - uchar4 tmp_data = convert_uchar4((src1_data > src2_data)); - - dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x; - dst_data.y = ((dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : dst_data.y; - dst_data.z = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : dst_data.z; - dst_data.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? tmp_data.w : dst_data.w; - - *((__global uchar4 *)(dst + dst_index)) = dst_data; - } -} - - - -__kernel void arithm_compare_gt_D3 (__global short *src1, int src1_step, int src1_offset, - __global short *src2, int src2_step, int src2_offset, - __global uchar *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) - -{ - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 2; - -#ifdef dst_align -#undef dst_align -#endif -#define dst_align ((dst_offset >> 1) & 3) - int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); - int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1)); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc); - int src1_index_fix = src1_index < 0 ? 0 : src1_index; - int src2_index_fix = src2_index < 0 ? 0 : src2_index; - short4 src1_data = vload4(0, (__global short *)((__global char *)src1 + src1_index)); - short4 src2_data = vload4(0, (__global short *)((__global char *)src2 + src2_index)); - if(src1_index < 0) - { - short4 tmp; - tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx; - src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw; - } - if(src2_index < 0) - { - short4 tmp; - tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx; - src2_data.xyzw = (src2_index == -1) ? 
src2_data.wxyz:tmp.xyzw; - } - - - - uchar4 dst_data = *((__global uchar4 *)(dst + dst_index)); - uchar4 tmp_data = convert_uchar4((src1_data > src2_data)); - - dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x; - dst_data.y = ((dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : dst_data.y; - dst_data.z = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : dst_data.z; - dst_data.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? tmp_data.w : dst_data.w; - - *((__global uchar4 *)(dst + dst_index)) = dst_data; - } -} - -__kernel void arithm_compare_gt_D4 (__global int *src1, int src1_step, int src1_offset, - __global int *src2, int src2_step, int src2_offset, - __global uchar *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 2; -#ifdef dst_align -#undef dst_align -#endif -#define dst_align ((dst_offset >> 2) & 3) - int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2)); - int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2)); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc); - int src1_index_fix = src1_index < 0 ? 0 : src1_index; - int src2_index_fix = src2_index < 0 ? 0 : src2_index; - - int4 src1_data = vload4(0, (__global int *)((__global char *)src1 + src1_index)); - int4 src2_data = vload4(0, (__global int *)((__global char *)src2 + src2_index)); - if(src1_index < 0) - { - int4 tmp; - tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx; - src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw; - } - if(src2_index < 0) - { - int4 tmp; - tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx; - src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw; - } - - - uchar4 dst_data = *((__global uchar4 *)(dst + dst_index)); - uchar4 tmp_data = convert_uchar4((src1_data > src2_data)); - - dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x; - dst_data.y = ((dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : dst_data.y; - dst_data.z = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : dst_data.z; - dst_data.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? tmp_data.w : dst_data.w; - - *((__global uchar4 *)(dst + dst_index)) = dst_data; - } -} - -__kernel void arithm_compare_gt_D5 (__global float *src1, int src1_step, int src1_offset, - __global float *src2, int src2_step, int src2_offset, - __global uchar *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 2; -#ifdef dst_align -#undef dst_align -#endif -#define dst_align ((dst_offset >> 2) & 3) - int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2)); - int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2)); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc); - int src1_index_fix = src1_index < 0 ? 0 : src1_index; - int src2_index_fix = src2_index < 0 ? 
0 : src2_index; - float4 src1_data = vload4(0, (__global float *)((__global char *)src1 + src1_index_fix)); - float4 src2_data = vload4(0, (__global float *)((__global char *)src2 + src2_index_fix)); - if(src1_index < 0) - { - float4 tmp; - tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx; - src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw; - } - if(src2_index < 0) - { - float4 tmp; - tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx; - src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw; - } - - - uchar4 dst_data = *((__global uchar4 *)(dst + dst_index)); - uchar4 tmp_data = convert_uchar4((src1_data > src2_data)); - - dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x; - dst_data.y = ((dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : dst_data.y; - dst_data.z = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : dst_data.z; - dst_data.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? tmp_data.w : dst_data.w; - - *((__global uchar4 *)(dst + dst_index)) = dst_data; - } -} - -#if defined (DOUBLE_SUPPORT) -__kernel void arithm_compare_gt_D6 (__global double *src1, int src1_step, int src1_offset, - __global double *src2, int src2_step, int src2_offset, - __global uchar *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 2; -#ifdef dst_align -#undef dst_align -#endif -#define dst_align ((dst_offset >> 3) & 3) - int src1_index = mad24(y, src1_step, (x << 3) + src1_offset - (dst_align << 3)); - int src2_index = mad24(y, src2_step, (x << 3) + src2_offset - (dst_align << 3)); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc); - int src1_index_fix = src1_index < 0 ? 0 : src1_index; - int src2_index_fix = src2_index < 0 ? 0 : src2_index; - double4 src1_data = vload4(0, (__global double *)((__global char *)src1 + src1_index_fix)); - double4 src2_data = vload4(0, (__global double *)((__global char *)src2 + src2_index_fix)); - if(src1_index < 0) - { - double4 tmp; - tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx; - src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw; - } - if(src2_index < 0) - { - double4 tmp; - tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx; - src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw; - } - - - uchar4 dst_data = *((__global uchar4 *)(dst + dst_index)); - uchar4 tmp_data = convert_uchar4((src1_data > src2_data)); - - dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x; - dst_data.y = ((dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : dst_data.y; - dst_data.z = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : dst_data.z; - dst_data.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? 
tmp_data.w : dst_data.w; - - *((__global uchar4 *)(dst + dst_index)) = dst_data; - } -} -#endif - -/***********************************Compare GE**************************/ -__kernel void arithm_compare_ge_D0 (__global uchar *src1, int src1_step, int src1_offset, - __global uchar *src2, int src2_step, int src2_offset, - __global uchar *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 2; - -#ifdef dst_align -#undef dst_align -#endif -#define dst_align (dst_offset & 3) - int src1_index = mad24(y, src1_step, x + src1_offset - dst_align); - int src2_index = mad24(y, src2_step, x + src2_offset - dst_align); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc); - - int src1_index_fix = src1_index < 0 ? 0 : src1_index; - int src2_index_fix = src2_index < 0 ? 0 : src2_index; - uchar4 src1_data = vload4(0, src1 + src1_index_fix); - uchar4 src2_data = vload4(0, src2 + src2_index_fix); - if(src1_index < 0) - { - uchar4 tmp; - tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx; - src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw; - } - if(src2_index < 0) - { - uchar4 tmp; - tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx; - src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw; - } - - - - uchar4 dst_data = *((__global uchar4 *)(dst + dst_index)); - uchar4 tmp_data = convert_uchar4((src1_data >= src2_data)); - - dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x; - dst_data.y = ((dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : dst_data.y; - dst_data.z = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : dst_data.z; - dst_data.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? tmp_data.w : dst_data.w; - - *((__global uchar4 *)(dst + dst_index)) = dst_data; - } -} - - - -__kernel void arithm_compare_ge_D2 (__global ushort *src1, int src1_step, int src1_offset, - __global ushort *src2, int src2_step, int src2_offset, - __global uchar *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) - -{ - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 2; - -#ifdef dst_align -#undef dst_align -#endif -#define dst_align ((dst_offset >> 1) & 3) - int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); - int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1)); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc); - - int src1_index_fix = src1_index < 0 ? 0 : src1_index; - int src2_index_fix = src2_index < 0 ? 0 : src2_index; - ushort4 src1_data = vload4(0, (__global ushort *)((__global char *)src1 + src1_index)); - ushort4 src2_data = vload4(0, (__global ushort *)((__global char *)src2 + src2_index)); - if(src1_index < 0) - { - ushort4 tmp; - tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx; - src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw; - } - if(src2_index < 0) - { - ushort4 tmp; - tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx; - src2_data.xyzw = (src2_index == -1) ? 
src2_data.wxyz:tmp.xyzw; - } - - - - - uchar4 dst_data = *((__global uchar4 *)(dst + dst_index)); - uchar4 tmp_data = convert_uchar4((src1_data >= src2_data)); - - dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x; - dst_data.y = ((dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : dst_data.y; - dst_data.z = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : dst_data.z; - dst_data.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? tmp_data.w : dst_data.w; - - *((__global uchar4 *)(dst + dst_index)) = dst_data; - } -} - - - -__kernel void arithm_compare_ge_D3 (__global short *src1, int src1_step, int src1_offset, - __global short *src2, int src2_step, int src2_offset, - __global uchar *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) - -{ - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 2; - -#ifdef dst_align -#undef dst_align -#endif -#define dst_align ((dst_offset >> 1)& 3) - int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); - int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1)); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc); - - int src1_index_fix = src1_index < 0 ? 0 : src1_index; - int src2_index_fix = src2_index < 0 ? 0 : src2_index; - short4 src1_data = vload4(0, (__global short *)((__global char *)src1 + src1_index)); - short4 src2_data = vload4(0, (__global short *)((__global char *)src2 + src2_index)); - if(src1_index < 0) - { - short4 tmp; - tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx; - src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw; - } - if(src2_index < 0) - { - short4 tmp; - tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx; - src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw; - } - - - - uchar4 dst_data = *((__global uchar4 *)(dst + dst_index)); - uchar4 tmp_data = convert_uchar4((src1_data >= src2_data)); - - dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x; - dst_data.y = ((dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : dst_data.y; - dst_data.z = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : dst_data.z; - dst_data.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? tmp_data.w : dst_data.w; - - *((__global uchar4 *)(dst + dst_index)) = dst_data; - } -} - -__kernel void arithm_compare_ge_D4 (__global int *src1, int src1_step, int src1_offset, - __global int *src2, int src2_step, int src2_offset, - __global uchar *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 2; - -#ifdef dst_align -#undef dst_align -#endif -#define dst_align ((dst_offset >> 2)& 3) - int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2)); - int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2)); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc); - - int src1_index_fix = src1_index < 0 ? 0 : src1_index; - int src2_index_fix = src2_index < 0 ? 
0 : src2_index; - - int4 src1_data = vload4(0, (__global int *)((__global char *)src1 + src1_index)); - int4 src2_data = vload4(0, (__global int *)((__global char *)src2 + src2_index)); - if(src1_index < 0) - { - int4 tmp; - tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx; - src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw; - } - if(src2_index < 0) - { - int4 tmp; - tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx; - src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw; - } - uchar4 dst_data = *((__global uchar4 *)(dst + dst_index)); - uchar4 tmp_data = convert_uchar4((src1_data >= src2_data)); - - dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x; - dst_data.y = ((dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : dst_data.y; - dst_data.z = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : dst_data.z; - dst_data.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? tmp_data.w : dst_data.w; - - *((__global uchar4 *)(dst + dst_index)) = dst_data; - } -} - -__kernel void arithm_compare_ge_D5 (__global float *src1, int src1_step, int src1_offset, - __global float *src2, int src2_step, int src2_offset, - __global uchar *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 2; - -#ifdef dst_align -#undef dst_align -#endif -#define dst_align ((dst_offset >> 2)& 3) - int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2)); - int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2)); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc); - - int src1_index_fix = src1_index < 0 ? 0 : src1_index; - int src2_index_fix = src2_index < 0 ? 0 : src2_index; - float4 src1_data = vload4(0, (__global float *)((__global char *)src1 + src1_index_fix)); - float4 src2_data = vload4(0, (__global float *)((__global char *)src2 + src2_index_fix)); - if(src1_index < 0) - { - - float4 tmp; - tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx; - src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw; - } - if(src2_index < 0) - { - float4 tmp; - tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx; - src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw; - } - - uchar4 dst_data = *((__global uchar4 *)(dst + dst_index)); - uchar4 tmp_data = convert_uchar4((src1_data >= src2_data)); - - dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x; - dst_data.y = ((dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : dst_data.y; - dst_data.z = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : dst_data.z; - dst_data.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? 
tmp_data.w : dst_data.w; - - *((__global uchar4 *)(dst + dst_index)) = dst_data; - } -} - -#if defined (DOUBLE_SUPPORT) -__kernel void arithm_compare_ge_D6 (__global double *src1, int src1_step, int src1_offset, - __global double *src2, int src2_step, int src2_offset, - __global uchar *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 2; - -#ifdef dst_align -#undef dst_align -#endif -#define dst_align ((dst_offset >> 3)& 3) - int src1_index = mad24(y, src1_step, (x << 3) + src1_offset - (dst_align << 3)); - int src2_index = mad24(y, src2_step, (x << 3) + src2_offset - (dst_align << 3)); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc); - int src1_index_fix = src1_index < 0 ? 0 : src1_index; - int src2_index_fix = src2_index < 0 ? 0 : src2_index; - double4 src1_data = vload4(0, (__global double *)((__global char *)src1 + src1_index_fix)); - double4 src2_data = vload4(0, (__global double *)((__global char *)src2 + src2_index_fix)); - if(src1_index < 0) - { - double4 tmp; - tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx; - src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw; - } - if(src2_index < 0) - { - double4 tmp; - tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx; - src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw; - } - uchar4 dst_data = *((__global uchar4 *)(dst + dst_index)); - uchar4 tmp_data = convert_uchar4((src1_data >= src2_data)); - - dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x; - dst_data.y = ((dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : dst_data.y; - dst_data.z = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : dst_data.z; - dst_data.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? tmp_data.w : dst_data.w; - - *((__global uchar4 *)(dst + dst_index)) = dst_data; - } -} -#endif diff --git a/modules/ocl/src/opencl/arithm_compare_ne.cl b/modules/ocl/src/opencl/arithm_compare_ne.cl deleted file mode 100644 index fb5859d3b..000000000 --- a/modules/ocl/src/opencl/arithm_compare_ne.cl +++ /dev/null @@ -1,1013 +0,0 @@ -/*M/////////////////////////////////////////////////////////////////////////////////////// -// -// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. -// -// By downloading, copying, installing or using the software you agree to this license. -// If you do not agree to this license, do not download, install, -// copy or use the software. -// -// -// License Agreement -// For Open Source Computer Vision Library -// -// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved. -// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved. -// Third party copyrights are property of their respective owners. -// -// @Authors -// Jiang Liyuan, jlyuan001.good@163.com -// -// Redistribution and use in source and binary forms, with or without modification, -// are permitted provided that the following conditions are met: -// -// * Redistribution's of source code must retain the above copyright notice, -// this list of conditions and the following disclaimer. 
-// -// * Redistribution's in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other oclMaterials provided with the distribution. -// -// * The name of the copyright holders may not be used to endorse or promote products -// derived from this software without specific prior written permission. -// -// This software is provided by the copyright holders and contributors as is and -// any express or implied warranties, including, but not limited to, the implied -// warranties of merchantability and fitness for a particular purpose are disclaimed. -// In no event shall the Intel Corporation or contributors be liable for any direct, -// indirect, incidental, special, exemplary, or consequential damages -// (including, but not limited to, procurement of substitute goods or services; -// loss of use, data, or profits; or business interruption) however caused -// and on any theory of liability, whether in contract, strict liability, -// or tort (including negligence or otherwise) arising in any way out of -// the use of this software, even if advised of the possibility of such damage. -// -//M*/ -#if defined (DOUBLE_SUPPORT) -#ifdef cl_khr_fp64 -#pragma OPENCL EXTENSION cl_khr_fp64:enable -#elif defined (cl_amd_fp64) -#pragma OPENCL EXTENSION cl_amd_fp64:enable -#endif -#endif -/***********************************Compare NE*******************************/ -__kernel void arithm_compare_ne_D0 (__global uchar *src1, int src1_step, int src1_offset, - __global uchar *src2, int src2_step, int src2_offset, - __global uchar *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 2; - -#ifdef dst_align -#undef dst_align -#endif -#define dst_align (dst_offset & 3) - int src1_index = mad24(y, src1_step, x + src1_offset - dst_align); - int src2_index = mad24(y, src2_step, x + src2_offset - dst_align); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc); - int src1_index_fix = src1_index < 0 ? 0 : src1_index; - int src2_index_fix = src2_index < 0 ? 0 : src2_index; - uchar4 src1_data = vload4(0, src1 + src1_index_fix); - uchar4 src2_data = vload4(0, src2 + src2_index_fix); - if(src1_index < 0) - { - uchar4 tmp; - tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx; - src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw; - } - if(src2_index < 0) - { - uchar4 tmp; - tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx; - src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw; - } - - - uchar4 dst_data = *((__global uchar4 *)(dst + dst_index)); - uchar4 tmp_data = convert_uchar4((src1_data != src2_data)); - - dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x; - dst_data.y = ((dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : dst_data.y; - dst_data.z = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : dst_data.z; - dst_data.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? 
tmp_data.w : dst_data.w; - - *((__global uchar4 *)(dst + dst_index)) = dst_data; - } -} - - - -__kernel void arithm_compare_ne_D2 (__global ushort *src1, int src1_step, int src1_offset, - __global ushort *src2, int src2_step, int src2_offset, - __global uchar *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) - -{ - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 2; - -#ifdef dst_align -#undef dst_align -#endif -#define dst_align ((dst_offset >> 1)& 3) - int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); - int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1)); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc); - - int src1_index_fix = src1_index < 0 ? 0 : src1_index; - int src2_index_fix = src2_index < 0 ? 0 : src2_index; - ushort4 src1_data = vload4(0, (__global ushort *)((__global char *)src1 + src1_index)); - ushort4 src2_data = vload4(0, (__global ushort *)((__global char *)src2 + src2_index)); - if(src1_index < 0) - { - ushort4 tmp; - tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx; - src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw; - } - if(src2_index < 0) - { - ushort4 tmp; - tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx; - src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw; - } - - uchar4 dst_data = *((__global uchar4 *)(dst + dst_index)); - uchar4 tmp_data = convert_uchar4((src1_data != src2_data)); - - dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x; - dst_data.y = ((dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : dst_data.y; - dst_data.z = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : dst_data.z; - dst_data.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? tmp_data.w : dst_data.w; - - *((__global uchar4 *)(dst + dst_index)) = dst_data; - } -} - - - -__kernel void arithm_compare_ne_D3 (__global short *src1, int src1_step, int src1_offset, - __global short *src2, int src2_step, int src2_offset, - __global uchar *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) - -{ - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 2; - -#ifdef dst_align -#undef dst_align -#endif -#define dst_align ((dst_offset >> 1)& 3) - int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); - int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1)); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc); - int src1_index_fix = src1_index < 0 ? 0 : src1_index; - int src2_index_fix = src2_index < 0 ? 0 : src2_index; - short4 src1_data = vload4(0, (__global short *)((__global char *)src1 + src1_index)); - short4 src2_data = vload4(0, (__global short *)((__global char *)src2 + src2_index)); - if(src1_index < 0) - { - short4 tmp; - tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx; - src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw; - } - if(src2_index < 0) - { - short4 tmp; - tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx; - src2_data.xyzw = (src2_index == -1) ? 
src2_data.wxyz:tmp.xyzw; - } - - - uchar4 dst_data = *((__global uchar4 *)(dst + dst_index)); - uchar4 tmp_data = convert_uchar4((src1_data != src2_data)); - - dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x; - dst_data.y = ((dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : dst_data.y; - dst_data.z = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : dst_data.z; - dst_data.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? tmp_data.w : dst_data.w; - - *((__global uchar4 *)(dst + dst_index)) = dst_data; - } -} - -__kernel void arithm_compare_ne_D4 (__global int *src1, int src1_step, int src1_offset, - __global int *src2, int src2_step, int src2_offset, - __global uchar *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 2; -#ifdef dst_align -#undef dst_align -#endif -#define dst_align ((dst_offset >> 2)& 3) - int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2)); - int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2)); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc); - - int src1_index_fix = src1_index < 0 ? 0 : src1_index; - int src2_index_fix = src2_index < 0 ? 0 : src2_index; - - int4 src1_data = vload4(0, (__global int *)((__global char *)src1 + src1_index)); - int4 src2_data = vload4(0, (__global int *)((__global char *)src2 + src2_index)); - if(src1_index < 0) - { - int4 tmp; - tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx; - src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw; - } - if(src2_index < 0) - { - int4 tmp; - tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx; - src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw; - } - - uchar4 dst_data = *((__global uchar4 *)(dst + dst_index)); - uchar4 tmp_data = convert_uchar4((src1_data != src2_data)); - - dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x; - dst_data.y = ((dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : dst_data.y; - dst_data.z = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : dst_data.z; - dst_data.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? tmp_data.w : dst_data.w; - - *((__global uchar4 *)(dst + dst_index)) = dst_data; - } -} - -__kernel void arithm_compare_ne_D5 (__global float *src1, int src1_step, int src1_offset, - __global float *src2, int src2_step, int src2_offset, - __global uchar *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 2; -#ifdef dst_align -#undef dst_align -#endif -#define dst_align ((dst_offset >> 2) & 3) - int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2)); - int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2)); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc); - int src1_index_fix = src1_index < 0 ? 0 : src1_index; - int src2_index_fix = src2_index < 0 ? 
0 : src2_index; - float4 src1_data = vload4(0, (__global float *)((__global char *)src1 + src1_index_fix)); - float4 src2_data = vload4(0, (__global float *)((__global char *)src2 + src2_index_fix)); - if(src1_index < 0) - { - float4 tmp; - tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx; - src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw; - } - if(src2_index < 0) - { - float4 tmp; - tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx; - src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw; - } - - uchar4 dst_data = *((__global uchar4 *)(dst + dst_index)); - uchar4 tmp_data = convert_uchar4((src1_data != src2_data)); - - dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x; - dst_data.y = ((dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : dst_data.y; - dst_data.z = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : dst_data.z; - dst_data.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? tmp_data.w : dst_data.w; - - *((__global uchar4 *)(dst + dst_index)) = dst_data; - } -} - -#if defined (DOUBLE_SUPPORT) -__kernel void arithm_compare_ne_D6 (__global double *src1, int src1_step, int src1_offset, - __global double *src2, int src2_step, int src2_offset, - __global uchar *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 2; -#ifdef dst_align -#undef dst_align -#endif -#define dst_align ((dst_offset >> 3) & 3) - int src1_index = mad24(y, src1_step, (x << 3) + src1_offset - (dst_align << 3)); - int src2_index = mad24(y, src2_step, (x << 3) + src2_offset - (dst_align << 3)); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc); - int src1_index_fix = src1_index < 0 ? 0 : src1_index; - int src2_index_fix = src2_index < 0 ? 0 : src2_index; - double4 src1_data = vload4(0, (__global double *)((__global char *)src1 + src1_index_fix)); - double4 src2_data = vload4(0, (__global double *)((__global char *)src2 + src2_index_fix)); - if(src1_index < 0) - { - double4 tmp; - tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx; - src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw; - } - if(src2_index < 0) - { - double4 tmp; - tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx; - src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw; - } - - - uchar4 dst_data = *((__global uchar4 *)(dst + dst_index)); - uchar4 tmp_data = convert_uchar4((src1_data != src2_data)); - - dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x; - dst_data.y = ((dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : dst_data.y; - dst_data.z = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : dst_data.z; - dst_data.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? 
tmp_data.w : dst_data.w; - - *((__global uchar4 *)(dst + dst_index)) = dst_data; - } -} -#endif - - -/***********************************Compare LT*******************************/ -__kernel void arithm_compare_lt_D0 (__global uchar *src1, int src1_step, int src1_offset, - __global uchar *src2, int src2_step, int src2_offset, - __global uchar *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 2; - -#ifdef dst_align -#undef dst_align -#endif -#define dst_align (dst_offset & 3) - int src1_index = mad24(y, src1_step, x + src1_offset - dst_align); - int src2_index = mad24(y, src2_step, x + src2_offset - dst_align); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc); - int src1_index_fix = src1_index < 0 ? 0 : src1_index; - int src2_index_fix = src2_index < 0 ? 0 : src2_index; - uchar4 src1_data = vload4(0, src1 + src1_index_fix); - uchar4 src2_data = vload4(0, src2 + src2_index_fix); - if(src1_index < 0) - { - uchar4 tmp; - tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx; - src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw; - } - if(src2_index < 0) - { - uchar4 tmp; - tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx; - src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw; - } - - - uchar4 dst_data = *((__global uchar4 *)(dst + dst_index)); - uchar4 tmp_data = convert_uchar4((src1_data < src2_data)); - - dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x; - dst_data.y = ((dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : dst_data.y; - dst_data.z = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : dst_data.z; - dst_data.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? tmp_data.w : dst_data.w; - - *((__global uchar4 *)(dst + dst_index)) = dst_data; - } -} - - - -__kernel void arithm_compare_lt_D2 (__global ushort *src1, int src1_step, int src1_offset, - __global ushort *src2, int src2_step, int src2_offset, - __global uchar *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) - -{ - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 2; - -#ifdef dst_align -#undef dst_align -#endif -#define dst_align ((dst_offset >> 1) & 3) - int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); - int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1)); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc); - - int src1_index_fix = src1_index < 0 ? 0 : src1_index; - int src2_index_fix = src2_index < 0 ? 0 : src2_index; - ushort4 src1_data = vload4(0, (__global ushort *)((__global char *)src1 + src1_index)); - ushort4 src2_data = vload4(0, (__global ushort *)((__global char *)src2 + src2_index)); - if(src1_index < 0) - { - ushort4 tmp; - tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx; - src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw; - } - if(src2_index < 0) - { - ushort4 tmp; - tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx; - src2_data.xyzw = (src2_index == -1) ? 
src2_data.wxyz:tmp.xyzw; - } - - - uchar4 dst_data = *((__global uchar4 *)(dst + dst_index)); - uchar4 tmp_data = convert_uchar4((src1_data < src2_data)); - - dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x; - dst_data.y = ((dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : dst_data.y; - dst_data.z = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : dst_data.z; - dst_data.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? tmp_data.w : dst_data.w; - - *((__global uchar4 *)(dst + dst_index)) = dst_data; - } -} - - - -__kernel void arithm_compare_lt_D3 (__global short *src1, int src1_step, int src1_offset, - __global short *src2, int src2_step, int src2_offset, - __global uchar *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) - -{ - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 2; - -#ifdef dst_align -#undef dst_align -#endif -#define dst_align ((dst_offset >> 1) & 3) - int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); - int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1)); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc); - int src1_index_fix = src1_index < 0 ? 0 : src1_index; - int src2_index_fix = src2_index < 0 ? 0 : src2_index; - short4 src1_data = vload4(0, (__global short *)((__global char *)src1 + src1_index)); - short4 src2_data = vload4(0, (__global short *)((__global char *)src2 + src2_index)); - if(src1_index < 0) - { - short4 tmp; - tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx; - src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw; - } - if(src2_index < 0) - { - short4 tmp; - tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx; - src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw; - } - - - - uchar4 dst_data = *((__global uchar4 *)(dst + dst_index)); - uchar4 tmp_data = convert_uchar4((src1_data < src2_data)); - - dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x; - dst_data.y = ((dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : dst_data.y; - dst_data.z = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : dst_data.z; - dst_data.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? tmp_data.w : dst_data.w; - - *((__global uchar4 *)(dst + dst_index)) = dst_data; - } -} - -__kernel void arithm_compare_lt_D4 (__global int *src1, int src1_step, int src1_offset, - __global int *src2, int src2_step, int src2_offset, - __global uchar *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 2; -#ifdef dst_align -#undef dst_align -#endif -#define dst_align ((dst_offset >> 2) & 3) - int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2)); - int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2)); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc); - - int src1_index_fix = src1_index < 0 ? 0 : src1_index; - int src2_index_fix = src2_index < 0 ? 
0 : src2_index; - - int4 src1_data = vload4(0, (__global int *)((__global char *)src1 + src1_index)); - int4 src2_data = vload4(0, (__global int *)((__global char *)src2 + src2_index)); - if(src1_index < 0) - { - int4 tmp; - tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx; - src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw; - } - if(src2_index < 0) - { - int4 tmp; - tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx; - src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw; - } - - - - - uchar4 dst_data = *((__global uchar4 *)(dst + dst_index)); - uchar4 tmp_data = convert_uchar4((src1_data < src2_data)); - - dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x; - dst_data.y = ((dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : dst_data.y; - dst_data.z = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : dst_data.z; - dst_data.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? tmp_data.w : dst_data.w; - - *((__global uchar4 *)(dst + dst_index)) = dst_data; - } -} - -__kernel void arithm_compare_lt_D5 (__global float *src1, int src1_step, int src1_offset, - __global float *src2, int src2_step, int src2_offset, - __global uchar *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 2; -#ifdef dst_align -#undef dst_align -#endif -#define dst_align ((dst_offset >> 2) & 3) - int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2)); - int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2)); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc); - int src1_index_fix = src1_index < 0 ? 0 : src1_index; - int src2_index_fix = src2_index < 0 ? 0 : src2_index; - float4 src1_data = vload4(0, (__global float *)((__global char *)src1 + src1_index_fix)); - float4 src2_data = vload4(0, (__global float *)((__global char *)src2 + src2_index_fix)); - if(src1_index < 0) - { - float4 tmp; - tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx; - src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw; - } - if(src2_index < 0) - { - float4 tmp; - tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx; - src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw; - } - - - uchar4 dst_data = *((__global uchar4 *)(dst + dst_index)); - uchar4 tmp_data = convert_uchar4((src1_data < src2_data)); - - dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x; - dst_data.y = ((dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : dst_data.y; - dst_data.z = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : dst_data.z; - dst_data.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? 
tmp_data.w : dst_data.w; - - *((__global uchar4 *)(dst + dst_index)) = dst_data; - } -} - -#if defined (DOUBLE_SUPPORT) -__kernel void arithm_compare_lt_D6 (__global double *src1, int src1_step, int src1_offset, - __global double *src2, int src2_step, int src2_offset, - __global uchar *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 2; -#ifdef dst_align -#undef dst_align -#endif -#define dst_align ((dst_offset >> 3) & 3) - int src1_index = mad24(y, src1_step, (x << 3) + src1_offset - (dst_align << 3)); - int src2_index = mad24(y, src2_step, (x << 3) + src2_offset - (dst_align << 3)); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc); - int src1_index_fix = src1_index < 0 ? 0 : src1_index; - int src2_index_fix = src2_index < 0 ? 0 : src2_index; - double4 src1_data = vload4(0, (__global double *)((__global char *)src1 + src1_index_fix)); - double4 src2_data = vload4(0, (__global double *)((__global char *)src2 + src2_index_fix)); - if(src1_index < 0) - { - double4 tmp; - tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx; - src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw; - } - if(src2_index < 0) - { - double4 tmp; - tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx; - src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw; - } - - - uchar4 dst_data = *((__global uchar4 *)(dst + dst_index)); - uchar4 tmp_data = convert_uchar4((src1_data < src2_data)); - - dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x; - dst_data.y = ((dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : dst_data.y; - dst_data.z = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : dst_data.z; - dst_data.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? tmp_data.w : dst_data.w; - - *((__global uchar4 *)(dst + dst_index)) = dst_data; - } -} -#endif - -/***********************************Compare LE*******************************/ -__kernel void arithm_compare_le_D0 (__global uchar *src1, int src1_step, int src1_offset, - __global uchar *src2, int src2_step, int src2_offset, - __global uchar *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 2; - -#ifdef dst_align -#undef dst_align -#endif -#define dst_align (dst_offset & 3) - int src1_index = mad24(y, src1_step, x + src1_offset - dst_align); - int src2_index = mad24(y, src2_step, x + src2_offset - dst_align); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc); - int src1_index_fix = src1_index < 0 ? 0 : src1_index; - int src2_index_fix = src2_index < 0 ? 0 : src2_index; - uchar4 src1_data = vload4(0, src1 + src1_index_fix); - uchar4 src2_data = vload4(0, src2 + src2_index_fix); - if(src1_index < 0) - { - uchar4 tmp; - tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx; - src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw; - } - if(src2_index < 0) - { - uchar4 tmp; - tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx; - src2_data.xyzw = (src2_index == -1) ? 
src2_data.wxyz:tmp.xyzw; - } - - - - uchar4 dst_data = *((__global uchar4 *)(dst + dst_index)); - uchar4 tmp_data = convert_uchar4((src1_data <= src2_data)); - - dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x; - dst_data.y = ((dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : dst_data.y; - dst_data.z = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : dst_data.z; - dst_data.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? tmp_data.w : dst_data.w; - - *((__global uchar4 *)(dst + dst_index)) = dst_data; - } -} - - - -__kernel void arithm_compare_le_D2 (__global ushort *src1, int src1_step, int src1_offset, - __global ushort *src2, int src2_step, int src2_offset, - __global uchar *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) - -{ - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 2; - -#ifdef dst_align -#undef dst_align -#endif -#define dst_align ((dst_offset >> 1) & 3) - int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); - int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1)); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc); - int src1_index_fix = src1_index < 0 ? 0 : src1_index; - int src2_index_fix = src2_index < 0 ? 0 : src2_index; - ushort4 src1_data = vload4(0, (__global ushort *)((__global char *)src1 + src1_index)); - ushort4 src2_data = vload4(0, (__global ushort *)((__global char *)src2 + src2_index)); - if(src1_index < 0) - { - ushort4 tmp; - tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx; - src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw; - } - if(src2_index < 0) - { - ushort4 tmp; - tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx; - src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw; - } - - - - uchar4 dst_data = *((__global uchar4 *)(dst + dst_index)); - uchar4 tmp_data = convert_uchar4((src1_data <= src2_data)); - - dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x; - dst_data.y = ((dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : dst_data.y; - dst_data.z = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : dst_data.z; - dst_data.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? tmp_data.w : dst_data.w; - - *((__global uchar4 *)(dst + dst_index)) = dst_data; - } -} - - - -__kernel void arithm_compare_le_D3 (__global short *src1, int src1_step, int src1_offset, - __global short *src2, int src2_step, int src2_offset, - __global uchar *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) - -{ - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 2; - -#ifdef dst_align -#undef dst_align -#endif -#define dst_align ((dst_offset >> 1) & 3) - int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); - int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1)); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc); - int src1_index_fix = src1_index < 0 ? 0 : src1_index; - int src2_index_fix = src2_index < 0 ? 
0 : src2_index; - short4 src1_data = vload4(0, (__global short *)((__global char *)src1 + src1_index)); - short4 src2_data = vload4(0, (__global short *)((__global char *)src2 + src2_index)); - if(src1_index < 0) - { - short4 tmp; - tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx; - src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw; - } - if(src2_index < 0) - { - short4 tmp; - tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx; - src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw; - } - - - - uchar4 dst_data = *((__global uchar4 *)(dst + dst_index)); - uchar4 tmp_data = convert_uchar4((src1_data <= src2_data)); - - dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x; - dst_data.y = ((dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : dst_data.y; - dst_data.z = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : dst_data.z; - dst_data.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? tmp_data.w : dst_data.w; - - *((__global uchar4 *)(dst + dst_index)) = dst_data; - } -} - -__kernel void arithm_compare_le_D4 (__global int *src1, int src1_step, int src1_offset, - __global int *src2, int src2_step, int src2_offset, - __global uchar *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 2; -#ifdef dst_align -#undef dst_align -#endif -#define dst_align ((dst_offset >> 2)& 3) - int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2)); - int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2)); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc); - int src1_index_fix = src1_index < 0 ? 0 : src1_index; - int src2_index_fix = src2_index < 0 ? 0 : src2_index; - - int4 src1_data = vload4(0, (__global int *)((__global char *)src1 + src1_index)); - int4 src2_data = vload4(0, (__global int *)((__global char *)src2 + src2_index)); - if(src1_index < 0) - { - int4 tmp; - tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx; - src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw; - } - if(src2_index < 0) - { - int4 tmp; - tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx; - src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw; - } - - uchar4 dst_data = *((__global uchar4 *)(dst + dst_index)); - uchar4 tmp_data =convert_uchar4((src1_data <= src2_data)); - - dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x; - dst_data.y = ((dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : dst_data.y; - dst_data.z = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : dst_data.z; - dst_data.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? 
tmp_data.w : dst_data.w; - - *((__global uchar4 *)(dst + dst_index)) = dst_data; - } -} - -__kernel void arithm_compare_le_D5 (__global float *src1, int src1_step, int src1_offset, - __global float *src2, int src2_step, int src2_offset, - __global uchar *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 2; -#ifdef dst_align -#undef dst_align -#endif -#define dst_align ((dst_offset >> 2)& 3) - int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2)); - int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2)); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc); - int src1_index_fix = src1_index < 0 ? 0 : src1_index; - int src2_index_fix = src2_index < 0 ? 0 : src2_index; - float4 src1_data = vload4(0, (__global float *)((__global char *)src1 + src1_index_fix)); - float4 src2_data = vload4(0, (__global float *)((__global char *)src2 + src2_index_fix)); - if(src1_index < 0) - { - float4 tmp; - tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx; - src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw; - } - if(src2_index < 0) - { - float4 tmp; - tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx; - src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw; - } - - uchar4 dst_data = *((__global uchar4 *)(dst + dst_index)); - uchar4 tmp_data = convert_uchar4((src1_data <= src2_data)); - - dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x; - dst_data.y = ((dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : dst_data.y; - dst_data.z = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : dst_data.z; - dst_data.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? tmp_data.w : dst_data.w; - - *((__global uchar4 *)(dst + dst_index)) = dst_data; - } -} - -#if defined (DOUBLE_SUPPORT) -__kernel void arithm_compare_le_D6 (__global double *src1, int src1_step, int src1_offset, - __global double *src2, int src2_step, int src2_offset, - __global uchar *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 2; -#ifdef dst_align -#undef dst_align -#endif -#define dst_align ((dst_offset >> 3)& 3) - int src1_index = mad24(y, src1_step, (x << 3) + src1_offset - (dst_align << 3)); - int src2_index = mad24(y, src2_step, (x << 3) + src2_offset - (dst_align << 3)); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc); - int src1_index_fix = src1_index < 0 ? 0 : src1_index; - int src2_index_fix = src2_index < 0 ? 0 : src2_index; - double4 src1_data = vload4(0, (__global double *)((__global char *)src1 + src1_index_fix)); - double4 src2_data = vload4(0, (__global double *)((__global char *)src2 + src2_index_fix)); - if(src1_index < 0) - { - double4 tmp; - tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx; - src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw; - } - if(src2_index < 0) - { - double4 tmp; - tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx; - src2_data.xyzw = (src2_index == -1) ? 
src2_data.wxyz:tmp.xyzw; - } - - - uchar4 dst_data = *((__global uchar4 *)(dst + dst_index)); - uchar4 tmp_data = convert_uchar4((src1_data <= src2_data)); - - dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x; - dst_data.y = ((dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : dst_data.y; - dst_data.z = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : dst_data.z; - dst_data.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? tmp_data.w : dst_data.w; - - *((__global uchar4 *)(dst + dst_index)) = dst_data; - } -} -#endif From 799afab23b2ffe632a4379c2c8cb59aea6523466 Mon Sep 17 00:00:00 2001 From: Ilya Lavrenov Date: Tue, 24 Sep 2013 14:02:01 +0400 Subject: [PATCH 10/17] refactored and extended ocl::exp and ocl::log --- modules/ocl/src/arithm.cpp | 41 ++++++++------- modules/ocl/src/opencl/arithm_exp.cl | 76 +++++++++++++++++----------- modules/ocl/src/opencl/arithm_log.cl | 65 ++++++++++++++---------- 3 files changed, 109 insertions(+), 73 deletions(-) diff --git a/modules/ocl/src/arithm.cpp b/modules/ocl/src/arithm.cpp index 1effac213..97da8c08d 100644 --- a/modules/ocl/src/arithm.cpp +++ b/modules/ocl/src/arithm.cpp @@ -817,39 +817,44 @@ void cv::ocl::LUT(const oclMat &src, const oclMat &lut, oclMat &dst) ////////////////////////////////////////////////////////////////////////////// //////////////////////////////// exp log ///////////////////////////////////// ////////////////////////////////////////////////////////////////////////////// + static void arithmetic_exp_log_run(const oclMat &src, oclMat &dst, string kernelName, const char **kernelString) { - dst.create(src.size(), src.type()); - CV_Assert(src.cols == dst.cols && - src.rows == dst.rows ); - - CV_Assert(src.type() == dst.type()); - CV_Assert( src.type() == CV_32F || src.type() == CV_64F); - Context *clCxt = src.clCxt; - if(!clCxt->supportsFeature(Context::CL_DOUBLE) && src.type() == CV_64F) + if (!clCxt->supportsFeature(Context::CL_DOUBLE) && src.depth() == CV_64F) { - CV_Error(CV_GpuNotSupported, "Selected device don't support double\r\n"); + CV_Error(CV_GpuNotSupported, "Selected device doesn't support double\r\n"); return; } - //int channels = dst.oclchannels(); - int depth = dst.depth(); + + CV_Assert( src.depth() == CV_32F || src.depth() == CV_64F); + dst.create(src.size(), src.type()); + + int ddepth = dst.depth(); + int cols1 = src.cols * src.oclchannels(); + int srcoffset1 = src.offset / src.elemSize1(), dstoffset1 = dst.offset / dst.elemSize1(); + int srcstep1 = src.step1(), dststep1 = dst.step1(); size_t localThreads[3] = { 64, 4, 1 }; size_t globalThreads[3] = { dst.cols, dst.rows, 1 }; + std::string buildOptions = format("-D srcT=%s", + ddepth == CV_32F ? 
"float" : "double"); + vector > args; - args.push_back( make_pair( sizeof(cl_int), (void *)&src.rows )); - args.push_back( make_pair( sizeof(cl_int), (void *)&src.cols )); - args.push_back( make_pair( sizeof(cl_int), (void *)&src.step )); - args.push_back( make_pair( sizeof(cl_int), (void *)&dst.step )); - args.push_back( make_pair( sizeof(cl_int), (void *)&src.offset )); - args.push_back( make_pair( sizeof(cl_int), (void *)&dst.offset )); args.push_back( make_pair( sizeof(cl_mem), (void *)&src.data )); args.push_back( make_pair( sizeof(cl_mem), (void *)&dst.data )); + args.push_back( make_pair( sizeof(cl_int), (void *)&cols1 )); + args.push_back( make_pair( sizeof(cl_int), (void *)&src.rows )); + args.push_back( make_pair( sizeof(cl_int), (void *)&srcoffset1 )); + args.push_back( make_pair( sizeof(cl_int), (void *)&dstoffset1 )); + args.push_back( make_pair( sizeof(cl_int), (void *)&srcstep1 )); + args.push_back( make_pair( sizeof(cl_int), (void *)&dststep1 )); - openCLExecuteKernel(clCxt, kernelString, kernelName, globalThreads, localThreads, args, -1, depth); + openCLExecuteKernel(clCxt, kernelString, kernelName, globalThreads, localThreads, + args, src.oclchannels(), -1, buildOptions.c_str()); } + void cv::ocl::exp(const oclMat &src, oclMat &dst) { arithmetic_exp_log_run(src, dst, "arithm_exp", &arithm_exp); diff --git a/modules/ocl/src/opencl/arithm_exp.cl b/modules/ocl/src/opencl/arithm_exp.cl index 6f537a287..b2143ba14 100644 --- a/modules/ocl/src/opencl/arithm_exp.cl +++ b/modules/ocl/src/opencl/arithm_exp.cl @@ -42,52 +42,70 @@ // the use of this software, even if advised of the possibility of such damage. // //M*/ -#if defined (DOUBLE_SUPPORT) -#pragma OPENCL EXTENSION cl_khr_fp64:enable -#endif +#if defined (DOUBLE_SUPPORT) +#ifdef cl_khr_fp64 +#pragma OPENCL EXTENSION cl_khr_fp64:enable +#elif defined (cl_amd_fp64) +#pragma OPENCL EXTENSION cl_amd_fp64:enable +#endif +#endif ////////////////////////////////////////////////////////////////////////////////////////////////////// /////////////////////////////////////////////EXP////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////////////////////////////////////// -__kernel void arithm_exp_D5(int rows, int cols, int srcStep, int dstStep, int srcOffset, int dstOffset, __global float *src, __global float *dst) +__kernel void arithm_exp_C1(__global srcT *src, __global srcT *dst, + int cols1, int rows, + int srcOffset1, int dstOffset1, + int srcStep1, int dstStep1) { - int x = get_global_id(0); int y = get_global_id(1); - if(x < cols && y < rows) + if(x < cols1 && y < rows) { - x = x << 2; - int srcIdx = mad24( y, srcStep, x + srcOffset); - int dstIdx = mad24( y, dstStep, x + dstOffset); - - float src_data = *((__global float *)((__global char *)src + srcIdx)); - float dst_data = exp(src_data); - - *((__global float *)((__global char *)dst + dstIdx)) = dst_data; + int srcIdx = mad24(y, srcStep1, x + srcOffset1); + int dstIdx = mad24(y, dstStep1, x + dstOffset1); + dst[dstIdx] = exp(src[srcIdx]); } } -#if defined (DOUBLE_SUPPORT) -__kernel void arithm_exp_D6(int rows, int cols, int srcStep, int dstStep, int srcOffset, int dstOffset, __global double *src, __global double *dst) +__kernel void arithm_exp_C2(__global srcT *src, __global srcT *dst, + int cols1, int rows, + int srcOffset1, int dstOffset1, + int srcStep1, int dstStep1) { - int x = get_global_id(0); - int y = get_global_id(1); - if(x < cols && y < rows ) - { - x = x << 3; - int srcIdx = mad24( y, srcStep, x + 
srcOffset); - int dstIdx = mad24( y, dstStep, x + dstOffset); + int x1 = get_global_id(0) << 1; + int y = get_global_id(1); - double src_data = *((__global double *)((__global char *)src + srcIdx)); - double dst_data = exp(src_data); + if(x1 < cols1 && y < rows) + { + int srcIdx = mad24(y, srcStep1, x1 + srcOffset1); + int dstIdx = mad24(y, dstStep1, x1 + dstOffset1); - *((__global double *)((__global char *)dst + dstIdx )) = dst_data; - // dst[dstIdx] = exp(src[srcIdx]); - } + dst[dstIdx] = exp(src[srcIdx]); + dst[dstIdx + 1] = x1 + 1 < cols1 ? exp(src[srcIdx + 1]) : dst[dstIdx + 1]; + } } -#endif +__kernel void arithm_exp_C4(__global srcT *src, __global srcT *dst, + int cols1, int rows, + int srcOffset1, int dstOffset1, + int srcStep1, int dstStep1) +{ + int x1 = get_global_id(0) << 2; + int y = get_global_id(1); + + if(x1 < cols1 && y < rows) + { + int srcIdx = mad24(y, srcStep1, x1 + srcOffset1); + int dstIdx = mad24(y, dstStep1, x1 + dstOffset1); + + dst[dstIdx] = exp(src[srcIdx]); + dst[dstIdx + 1] = x1 + 1 < cols1 ? exp(src[srcIdx + 1]) : dst[dstIdx + 1]; + dst[dstIdx + 2] = x1 + 2 < cols1 ? exp(src[srcIdx + 2]) : dst[dstIdx + 2]; + dst[dstIdx + 3] = x1 + 3 < cols1 ? exp(src[srcIdx + 3]) : dst[dstIdx + 3]; + } +} diff --git a/modules/ocl/src/opencl/arithm_log.cl b/modules/ocl/src/opencl/arithm_log.cl index ea19c9d90..ef8c4dd04 100644 --- a/modules/ocl/src/opencl/arithm_log.cl +++ b/modules/ocl/src/opencl/arithm_log.cl @@ -1,4 +1,3 @@ - /*M/////////////////////////////////////////////////////////////////////////////////////// // // IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. @@ -43,52 +42,66 @@ // the use of this software, even if advised of the possibility of such damage. // //M*/ + #if defined (DOUBLE_SUPPORT) #pragma OPENCL EXTENSION cl_khr_fp64:enable #endif -#define INF_FLOAT -88.029694 -#define INF_DOUBLE -709.0895657128241 - - ////////////////////////////////////////////////////////////////////////////////////////////////////// /////////////////////////////////////////////LOG///////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////////////////////////////////////// -__kernel void arithm_log_D5(int rows, int cols, int srcStep, int dstStep, int srcOffset, int dstOffset, __global float *src, __global float *dst) +__kernel void arithm_log_C1(__global srcT *src, __global srcT *dst, + int cols1, int rows, + int srcOffset1, int dstOffset1, + int srcStep1, int dstStep1) { int x = get_global_id(0); int y = get_global_id(1); - if(x < cols && y < rows ) + if(x < cols1 && y < rows) { - x = x << 2; - int srcIdx = mad24( y, srcStep, x + srcOffset); - int dstIdx = mad24( y, dstStep, x + dstOffset); + int srcIdx = mad24(y, srcStep1, x + srcOffset1); + int dstIdx = mad24(y, dstStep1, x + dstOffset1); - float src_data = *((__global float *)((__global char *)src + srcIdx)); - float dst_data = (src_data == 0) ? 
INF_FLOAT : log(fabs(src_data));
-
-        *((__global float *)((__global char *)dst + dstIdx)) = dst_data;
+        dst[dstIdx] = log(src[srcIdx]);
     }
 }

-#if defined (DOUBLE_SUPPORT)
-__kernel void arithm_log_D6(int rows, int cols, int srcStep, int dstStep, int srcOffset, int dstOffset, __global double *src, __global double *dst)
+__kernel void arithm_log_C2(__global srcT *src, __global srcT *dst,
+                            int cols1, int rows,
+                            int srcOffset1, int dstOffset1,
+                            int srcStep1, int dstStep1)
 {
-    int x = get_global_id(0);
+    int x1 = get_global_id(0) << 1;
     int y = get_global_id(1);

-    if(x < cols && y < rows )
+    if(x1 < cols1 && y < rows)
     {
-        x = x << 3;
-        int srcIdx = mad24( y, srcStep, x + srcOffset);
-        int dstIdx = mad24( y, dstStep, x + dstOffset);
-
-        double src_data = *((__global double *)((__global char *)src + srcIdx));
-        double dst_data = (src_data == 0) ? INF_DOUBLE : log(fabs(src_data));
-        *((__global double *)((__global char *)dst + dstIdx)) = dst_data;
+        int srcIdx = mad24(y, srcStep1, x1 + srcOffset1);
+        int dstIdx = mad24(y, dstStep1, x1 + dstOffset1);
+        dst[dstIdx] = log(src[srcIdx]);
+        dst[dstIdx + 1] = x1 + 1 < cols1 ? log(src[srcIdx + 1]) : dst[dstIdx + 1];
+    }
+}
+
+__kernel void arithm_log_C4(__global srcT *src, __global srcT *dst,
+                            int cols1, int rows,
+                            int srcOffset1, int dstOffset1,
+                            int srcStep1, int dstStep1)
+{
+    int x1 = get_global_id(0) << 2;
+    int y = get_global_id(1);
+
+    if(x1 < cols1 && y < rows)
+    {
+        int srcIdx = mad24(y, srcStep1, x1 + srcOffset1);
+        int dstIdx = mad24(y, dstStep1, x1 + dstOffset1);
+
+        dst[dstIdx] = log(src[srcIdx]);
+        dst[dstIdx + 1] = x1 + 1 < cols1 ? log(src[srcIdx + 1]) : dst[dstIdx + 1];
+        dst[dstIdx + 2] = x1 + 2 < cols1 ? log(src[srcIdx + 2]) : dst[dstIdx + 2];
+        dst[dstIdx + 3] = x1 + 3 < cols1 ? log(src[srcIdx + 3]) : dst[dstIdx + 3];
     }
 }
-#endif

From b4ad12821861c3527ed06755b0959ebbeab956b9 Mon Sep 17 00:00:00 2001
From: Ilya Lavrenov
Date: Tue, 24 Sep 2013 14:07:54 +0400
Subject: [PATCH 11/17] refactored and extended ocl::transpose

---
 modules/ocl/src/arithm.cpp | 72 ++--
 modules/ocl/src/opencl/arithm_transpose.cl | 476 ++-------------------
 2 files changed, 65 insertions(+), 483 deletions(-)

diff --git a/modules/ocl/src/arithm.cpp b/modules/ocl/src/arithm.cpp
index 97da8c08d..b18fa44dc 100644
--- a/modules/ocl/src/arithm.cpp
+++ b/modules/ocl/src/arithm.cpp
@@ -1511,57 +1511,51 @@ oclMatExpr::operator oclMat() const
 //////////////////////////////////////////////////////////////////////////////
 /////////////////////////////// transpose ////////////////////////////////////
 //////////////////////////////////////////////////////////////////////////////
+
 #define TILE_DIM (32)
 #define BLOCK_ROWS (256/TILE_DIM)
+
 static void transpose_run(const oclMat &src, oclMat &dst, string kernelName)
 {
-    if(!src.clCxt->supportsFeature(Context::CL_DOUBLE) && src.type() == CV_64F)
+    Context *clCxt = src.clCxt;
+    if (!clCxt->supportsFeature(Context::CL_DOUBLE) && src.depth() == CV_64F)
     {
-        CV_Error(CV_GpuNotSupported, "Selected device don't support double\r\n");
+        CV_Error(CV_GpuNotSupported, "Selected device doesn't support double\r\n");
         return;
     }

-    CV_Assert(src.cols == dst.rows && src.rows == dst.cols);
-
-    Context *clCxt = src.clCxt;
-    int channels = src.oclchannels();
-    int depth = src.depth();
-
-    int vector_lengths[4][7] = {{1, 0, 0, 0, 1, 1, 0},
-        {0, 0, 1, 1, 0, 0, 0},
-        {0, 0, 0, 0 , 0, 0, 0},
-        {1, 1, 0, 0, 0, 0, 0}
-    };
-
-    size_t vector_length = vector_lengths[channels - 1][depth];
-    int offset_cols = ((dst.offset % dst.step) / dst.elemSize()) &
(vector_length - 1); - int cols = divUp(src.cols + offset_cols, vector_length); + const char * const typeMap[] = { "uchar", "char", "ushort", "short", "int", "float", "double" }; + const char channelsString[] = { ' ', ' ', '2', '4', '4' }; + std::string buildOptions = format("-D T=%s%c", typeMap[src.depth()], + channelsString[src.channels()]); size_t localThreads[3] = { TILE_DIM, BLOCK_ROWS, 1 }; - size_t globalThreads[3] = { cols, src.rows, 1 }; + size_t globalThreads[3] = { src.cols, src.rows, 1 }; + + int srcstep1 = src.step / src.elemSize(), dststep1 = dst.step / dst.elemSize(); + int srcoffset1 = src.offset / src.elemSize(), dstoffset1 = dst.offset / dst.elemSize(); vector > args; args.push_back( make_pair( sizeof(cl_mem), (void *)&src.data )); - args.push_back( make_pair( sizeof(cl_int), (void *)&src.step )); - args.push_back( make_pair( sizeof(cl_int), (void *)&src.offset )); args.push_back( make_pair( sizeof(cl_mem), (void *)&dst.data )); - args.push_back( make_pair( sizeof(cl_int), (void *)&dst.step )); - args.push_back( make_pair( sizeof(cl_int), (void *)&dst.offset )); + args.push_back( make_pair( sizeof(cl_int), (void *)&src.cols )); args.push_back( make_pair( sizeof(cl_int), (void *)&src.rows )); - args.push_back( make_pair( sizeof(cl_int), (void *)&cols )); + args.push_back( make_pair( sizeof(cl_int), (void *)&srcstep1 )); + args.push_back( make_pair( sizeof(cl_int), (void *)&dststep1 )); + args.push_back( make_pair( sizeof(cl_int), (void *)&srcoffset1 )); + args.push_back( make_pair( sizeof(cl_int), (void *)&dstoffset1 )); - openCLExecuteKernel(clCxt, &arithm_transpose, kernelName, globalThreads, localThreads, args, channels, depth); + openCLExecuteKernel(clCxt, &arithm_transpose, kernelName, globalThreads, localThreads, + args, -1, -1, buildOptions.c_str()); } void cv::ocl::transpose(const oclMat &src, oclMat &dst) { - CV_Assert(src.type() == CV_8UC1 || src.type() == CV_8UC3 || src.type() == CV_8UC4 || src.type() == CV_8SC3 || src.type() == CV_8SC4 || - src.type() == CV_16UC2 || src.type() == CV_16SC2 || src.type() == CV_32SC1 || src.type() == CV_32FC1); + CV_Assert(src.depth() <= CV_64F && src.channels() <= 4); - oclMat emptyMat; - - if( src.data == dst.data && dst.cols == dst.rows ) - transpose_run( src, emptyMat, "transposeI_"); + if ( src.data == dst.data && src.cols == src.rows && dst.offset == src.offset + && dst.rows == dst.cols && src.cols == dst.cols) + transpose_run( src, dst, "transpose_inplace"); else { dst.create(src.cols, src.rows, src.type()); @@ -1569,6 +1563,10 @@ void cv::ocl::transpose(const oclMat &src, oclMat &dst) } } +////////////////////////////////////////////////////////////////////////////// +////////////////////////////// addWeighted /////////////////////////////////// +////////////////////////////////////////////////////////////////////////////// + void cv::ocl::addWeighted(const oclMat &src1, double alpha, const oclMat &src2, double beta, double gama, oclMat &dst) { Context *clCxt = src1.clCxt; @@ -1633,6 +1631,10 @@ void cv::ocl::addWeighted(const oclMat &src1, double alpha, const oclMat &src2, args, -1, -1, buildOptions.c_str()); } +////////////////////////////////////////////////////////////////////////////// +/////////////////////////////////// Pow ////////////////////////////////////// +////////////////////////////////////////////////////////////////////////////// + static void arithmetic_pow_run(const oclMat &src1, double p, oclMat &dst, string kernelName, const char **kernelString) { CV_Assert(src1.cols == dst.cols && src1.rows == 
dst.rows); @@ -1671,6 +1673,7 @@ static void arithmetic_pow_run(const oclMat &src1, double p, oclMat &dst, string openCLExecuteKernel(clCxt, kernelString, kernelName, globalThreads, localThreads, args, -1, depth); } + void cv::ocl::pow(const oclMat &x, double p, oclMat &y) { if(!x.clCxt->supportsFeature(Context::CL_DOUBLE) && x.type() == CV_64F) @@ -1685,6 +1688,11 @@ void cv::ocl::pow(const oclMat &x, double p, oclMat &y) arithmetic_pow_run(x, p, y, kernelName, &arithm_pow); } + +////////////////////////////////////////////////////////////////////////////// +/////////////////////////////// setIdentity ////////////////////////////////// +////////////////////////////////////////////////////////////////////////////// + void cv::ocl::setIdentity(oclMat& src, double scalar) { CV_Assert(src.empty() == false && src.rows == src.cols); @@ -1711,7 +1719,6 @@ void cv::ocl::setIdentity(oclMat& src, double scalar) } - vector > args; args.push_back( make_pair( sizeof(cl_mem), (void *)&src.data )); args.push_back( make_pair( sizeof(cl_int), (void *)&src.rows)); @@ -1735,7 +1742,8 @@ void cv::ocl::setIdentity(oclMat& src, double scalar) { scalar_i = (int)scalar; args.push_back(make_pair(sizeof(cl_int), (void*)&scalar_i)); - }else + } + else { scalar_f = (float)scalar; args.push_back(make_pair(sizeof(cl_float), (void*)&scalar_f)); diff --git a/modules/ocl/src/opencl/arithm_transpose.cl b/modules/ocl/src/opencl/arithm_transpose.cl index d0725b017..57f7f1b9d 100644 --- a/modules/ocl/src/opencl/arithm_transpose.cl +++ b/modules/ocl/src/opencl/arithm_transpose.cl @@ -43,468 +43,42 @@ // //M*/ -#define TILE_DIM 32 -#define BLOCK_ROWS 8 -#define LDS_STEP (TILE_DIM + 1) +#if defined (DOUBLE_SUPPORT) +#pragma OPENCL EXTENSION cl_khr_fp64:enable +#endif - -//8UC1 is not unoptimized, as the size of write per thread is 8 -//which will use completepath -__kernel void transpose_C1_D0(__global uchar* src, int src_step, int src_offset, - __global uchar* dst, int dst_step, int dst_offset, - int src_rows, int src_cols) +__kernel void transpose(__global const T* src, __global T* dst, + int src_cols, int src_rows, + int src_step, int dst_step, + int src_offset, int dst_offset) { + int x = get_global_id(0); + int y = get_global_id(1); - int gp_x = get_group_id(0), gp_y = get_group_id(1); - int gs_x = get_num_groups(0), gs_y = get_num_groups(1); - - int groupId_x, groupId_y; - - if(src_rows == src_cols) + if (x < src_cols && y < src_rows) { - groupId_y = gp_x; - groupId_x = (gp_x + gp_y) % gs_x; - } - else - { - int bid = gp_x + gs_x * gp_y; - groupId_y = bid % gs_y; - groupId_x = ((bid / gs_y) + groupId_y) % gs_x; - } + int srcIdx = mad24(y, src_step, src_offset + x); + int dstIdx = mad24(x, dst_step, dst_offset + y); - int lx = get_local_id(0); - int ly = get_local_id(1); - - int x = groupId_x * TILE_DIM + lx; - int y = groupId_y * TILE_DIM + ly; - - int x_index = groupId_y * TILE_DIM + lx; - int y_index = groupId_x * TILE_DIM + ly; - - __local uchar title[TILE_DIM * LDS_STEP]; - - if(x < src_cols && y < src_rows) - { - int index_src = mad24(y, src_step, x); - - #pragma unroll - for(int i = 0; i < TILE_DIM; i += BLOCK_ROWS) - { - if(y + i < src_rows) - { - title[(ly + i) * LDS_STEP + lx] =*(src + src_offset + index_src); - index_src = mad24(BLOCK_ROWS, src_step, index_src); - } - } - } - - barrier(CLK_LOCAL_MEM_FENCE); - - if(x_index < src_rows && y_index < src_cols) - { - int index_dst = mad24(y_index, dst_step, x_index); - - #pragma unroll - for(int i = 0; i < TILE_DIM; i += BLOCK_ROWS) - { - if((y_index + i) < src_cols) 
- { - *(dst + dst_offset + index_dst ) = title[lx * LDS_STEP + ly + i]; - index_dst += dst_step * BLOCK_ROWS ; - } - } + dst[dstIdx] = src[srcIdx]; } } -__kernel void transpose_C1_D4(__global int* src, int src_step, int src_offset, - __global int* dst, int dst_step, int dst_offset, - int src_rows, int src_cols) +__kernel void transpose_inplace(__global T* src, __global T* dst, + int src_cols, int src_rows, + int src_step, int dst_step, + int src_offset, int dst_offset) { + int x = get_global_id(0); + int y = get_global_id(1); - int gp_x = get_group_id(0), gp_y = get_group_id(1); - int gs_x = get_num_groups(0), gs_y = get_num_groups(1); - - int groupId_x, groupId_y; - - if(src_rows == src_cols) + if (x < src_cols && y < src_rows && x < y) { - groupId_y = gp_x; - groupId_x = (gp_x + gp_y) % gs_x; - } - else - { - int bid = gp_x + gs_x * gp_y; - groupId_y = bid % gs_y; - groupId_x = ((bid / gs_y) + groupId_y) % gs_x; - } + int srcIdx = mad24(y, src_step, src_offset + x); + int dstIdx = mad24(x, dst_step, dst_offset + y); - int lx = get_local_id(0); - int ly = get_local_id(1); - - int x = groupId_x * TILE_DIM + lx; - int y = groupId_y * TILE_DIM + ly; - - int x_index = groupId_y * TILE_DIM + lx; - int y_index = groupId_x * TILE_DIM + ly; - - __local int title[TILE_DIM * LDS_STEP]; - - if(x < src_cols && y < src_rows) - { - int index_src = mad24(y, src_step, (x << 2)); - - #pragma unroll - for(int i = 0; i < TILE_DIM; i += BLOCK_ROWS) - { - if(y + i < src_rows) - { - title[(ly + i) * LDS_STEP + lx] = *((__global int *)((__global char*)src + src_offset + index_src)); - index_src = mad24(BLOCK_ROWS, src_step, index_src); - } - } - } - - barrier(CLK_LOCAL_MEM_FENCE); - - if(x_index < src_rows && y_index < src_cols) - { - int index_dst = mad24(y_index, dst_step, (x_index << 2)); - - #pragma unroll - for(int i = 0; i < TILE_DIM; i += BLOCK_ROWS) - { - if((y_index + i) < src_cols) - { - *((__global int*)((__global char*)dst + dst_offset + index_dst )) = title[lx * LDS_STEP + ly + i]; - index_dst += dst_step * BLOCK_ROWS ; - } - } - } -} -__kernel void transpose_C1_D5(__global float* src, int src_step, int src_offset, - __global float* dst, int dst_step, int dst_offset, - int src_rows, int src_cols) -{ - - int gp_x = get_group_id(0), gp_y = get_group_id(1); - int gs_x = get_num_groups(0), gs_y = get_num_groups(1); - - int groupId_x, groupId_y; - - if(src_rows == src_cols) - { - groupId_y = gp_x; - groupId_x = (gp_x + gp_y) % gs_x; - } - else - { - int bid = gp_x + gs_x * gp_y; - groupId_y = bid % gs_y; - groupId_x = ((bid / gs_y) + groupId_y) % gs_x; - } - - int lx = get_local_id(0); - int ly = get_local_id(1); - - int x = groupId_x * TILE_DIM + lx; - int y = groupId_y * TILE_DIM + ly; - - int x_index = groupId_y * TILE_DIM + lx; - int y_index = groupId_x * TILE_DIM + ly; - - __local float title[TILE_DIM * LDS_STEP]; - - if(x < src_cols && y < src_rows) - { - int index_src = mad24(y, src_step, (x << 2)); - - #pragma unroll - for(int i = 0; i < TILE_DIM; i += BLOCK_ROWS) - { - if(y + i < src_rows) - { - title[(ly + i) * LDS_STEP + lx] = *((__global float *)((__global char*)src + src_offset + index_src)); - index_src = mad24(BLOCK_ROWS, src_step, index_src); - } - } - } - - barrier(CLK_LOCAL_MEM_FENCE); - - if(x_index < src_rows && y_index < src_cols) - { - int index_dst = mad24(y_index, dst_step, (x_index << 2)); - - #pragma unroll - for(int i = 0; i < TILE_DIM; i += BLOCK_ROWS) - { - if((y_index + i) < src_cols) - { - *((__global float*)((__global char*)dst + dst_offset + index_dst )) = title[lx * 
LDS_STEP + ly + i]; - index_dst += dst_step * BLOCK_ROWS ; - } - } - } -} - -__kernel void transpose_C2_D2(__global ushort* src, int src_step, int src_offset, - __global ushort* dst, int dst_step, int dst_offset, - int src_rows, int src_cols) -{ - - int gp_x = get_group_id(0), gp_y = get_group_id(1); - int gs_x = get_num_groups(0), gs_y = get_num_groups(1); - - int groupId_x, groupId_y; - - if(src_rows == src_cols) - { - groupId_y = gp_x; - groupId_x = (gp_x + gp_y) % gs_x; - } - else - { - int bid = gp_x + gs_x * gp_y; - groupId_y = bid % gs_y; - groupId_x = ((bid / gs_y) + groupId_y) % gs_x; - } - - int lx = get_local_id(0); - int ly = get_local_id(1); - - int x = groupId_x * TILE_DIM + lx; - int y = groupId_y * TILE_DIM + ly; - - int x_index = groupId_y * TILE_DIM + lx; - int y_index = groupId_x * TILE_DIM + ly; - - __local ushort2 title[TILE_DIM * LDS_STEP]; - - if(x < src_cols && y < src_rows) - { - int index_src = mad24(y, src_step, (x << 2)); - - #pragma unroll - for(int i = 0; i < TILE_DIM; i += BLOCK_ROWS) - { - if(y + i < src_rows) - { - title[(ly + i) * LDS_STEP + lx] = *((__global ushort2 *)((__global char*)src + src_offset + index_src)); - index_src = mad24(BLOCK_ROWS, src_step, index_src); - } - } - } - - barrier(CLK_LOCAL_MEM_FENCE); - - if(x_index < src_rows && y_index < src_cols) - { - int index_dst = mad24(y_index, dst_step, (x_index << 2)); - - #pragma unroll - for(int i = 0; i < TILE_DIM; i += BLOCK_ROWS) - { - if((y_index + i) < src_cols) - { - *((__global ushort2*)((__global char*)dst + dst_offset + index_dst )) = title[lx * LDS_STEP + ly + i]; - index_dst += dst_step * BLOCK_ROWS ; - } - } - } -} -__kernel void transpose_C2_D3(__global short* src, int src_step, int src_offset, - __global short* dst, int dst_step, int dst_offset, - int src_rows, int src_cols) -{ - - int gp_x = get_group_id(0), gp_y = get_group_id(1); - int gs_x = get_num_groups(0), gs_y = get_num_groups(1); - - int groupId_x, groupId_y; - - if(src_rows == src_cols) - { - groupId_y = gp_x; - groupId_x = (gp_x + gp_y) % gs_x; - } - else - { - int bid = gp_x + gs_x * gp_y; - groupId_y = bid % gs_y; - groupId_x = ((bid / gs_y) + groupId_y) % gs_x; - } - - int lx = get_local_id(0); - int ly = get_local_id(1); - - int x = groupId_x * TILE_DIM + lx; - int y = groupId_y * TILE_DIM + ly; - - int x_index = groupId_y * TILE_DIM + lx; - int y_index = groupId_x * TILE_DIM + ly; - - __local short2 title[TILE_DIM * LDS_STEP]; - - if(x < src_cols && y < src_rows) - { - int index_src = mad24(y, src_step, (x << 2)); - - #pragma unroll - for(int i = 0; i < TILE_DIM; i += BLOCK_ROWS) - { - if(y + i < src_rows) - { - title[(ly + i) * LDS_STEP + lx] = *((__global short2 *)((__global char*)src + src_offset + index_src)); - index_src = mad24(BLOCK_ROWS, src_step, index_src); - } - } - } - - barrier(CLK_LOCAL_MEM_FENCE); - - if(x_index < src_rows && y_index < src_cols) - { - int index_dst = mad24(y_index, dst_step, (x_index << 2)); - - #pragma unroll - for(int i = 0; i < TILE_DIM; i += BLOCK_ROWS) - { - if((y_index + i) < src_cols) - { - *((__global short2*)((__global char*)dst + dst_offset + index_dst )) = title[lx * LDS_STEP + ly + i]; - index_dst += dst_step * BLOCK_ROWS ; - } - } - } -} -__kernel void transpose_C4_D0(__global uchar* src, int src_step, int src_offset, - __global uchar* dst, int dst_step, int dst_offset, - int src_rows, int src_cols) -{ - - int gp_x = get_group_id(0), gp_y = get_group_id(1); - int gs_x = get_num_groups(0), gs_y = get_num_groups(1); - - int groupId_x, groupId_y; - - if(src_rows == src_cols) 
- { - groupId_y = gp_x; - groupId_x = (gp_x + gp_y) % gs_x; - } - else - { - int bid = gp_x + gs_x * gp_y; - groupId_y = bid % gs_y; - groupId_x = ((bid / gs_y) + groupId_y) % gs_x; - } - - int lx = get_local_id(0); - int ly = get_local_id(1); - - int x = groupId_x * TILE_DIM + lx; - int y = groupId_y * TILE_DIM + ly; - - int x_index = groupId_y * TILE_DIM + lx; - int y_index = groupId_x * TILE_DIM + ly; - - __local uchar4 title[TILE_DIM * LDS_STEP]; - - if(x < src_cols && y < src_rows) - { - int index_src = mad24(y, src_step, (x << 2)); - - #pragma unroll - for(int i = 0; i < TILE_DIM; i += BLOCK_ROWS) - { - if(y + i < src_rows) - { - title[(ly + i) * LDS_STEP + lx] = *((__global uchar4 *)(src + src_offset + index_src)); - index_src = mad24(BLOCK_ROWS, src_step, index_src); - } - } - } - - barrier(CLK_LOCAL_MEM_FENCE); - - if(x_index < src_rows && y_index < src_cols) - { - int index_dst = mad24(y_index, dst_step, (x_index << 2)); - - #pragma unroll - for(int i = 0; i < TILE_DIM; i += BLOCK_ROWS) - { - if((y_index + i) < src_cols) - { - *((__global uchar4*)(dst + dst_offset + index_dst )) = title[lx * LDS_STEP + ly + i]; - index_dst += dst_step * BLOCK_ROWS ; - } - } - } -} - -__kernel void transpose_C4_D1(__global char* src, int src_step, int src_offset, - __global char* dst, int dst_step, int dst_offset, - int src_rows, int src_cols) -{ - - int gp_x = get_group_id(0), gp_y = get_group_id(1); - int gs_x = get_num_groups(0), gs_y = get_num_groups(1); - - int groupId_x, groupId_y; - - if(src_rows == src_cols) - { - groupId_y = gp_x; - groupId_x = (gp_x + gp_y) % gs_x; - } - else - { - int bid = gp_x + gs_x * gp_y; - groupId_y = bid % gs_y; - groupId_x = ((bid / gs_y) + groupId_y) % gs_x; - } - - int lx = get_local_id(0); - int ly = get_local_id(1); - - int x = groupId_x * TILE_DIM + lx; - int y = groupId_y * TILE_DIM + ly; - - int x_index = groupId_y * TILE_DIM + lx; - int y_index = groupId_x * TILE_DIM + ly; - - __local char4 title[TILE_DIM * LDS_STEP]; - - if(x < src_cols && y < src_rows) - { - int index_src = mad24(y, src_step, (x << 2)); - - #pragma unroll - for(int i = 0; i < TILE_DIM; i += BLOCK_ROWS) - { - if(y + i < src_rows) - { - title[(ly + i) * LDS_STEP + lx] = *((__global char4 *)(src + src_offset + index_src)); - index_src = mad24(BLOCK_ROWS, src_step, index_src); - } - } - } - - barrier(CLK_LOCAL_MEM_FENCE); - - if(x_index < src_rows && y_index < src_cols) - { - int index_dst = mad24(y_index, dst_step, (x_index << 2)); - - #pragma unroll - for(int i = 0; i < TILE_DIM; i += BLOCK_ROWS) - { - if((y_index + i) < src_cols) - { - *((__global char4*)(dst + dst_offset + index_dst )) = title[lx * LDS_STEP + ly + i]; - index_dst += dst_step * BLOCK_ROWS ; - } - } + T tmp = dst[dstIdx]; + dst[dstIdx] = src[srcIdx]; + src[srcIdx] = tmp; } } From 8aa3eb817d4a40f418b3a7691960ec40904ecd99 Mon Sep 17 00:00:00 2001 From: Ilya Lavrenov Date: Tue, 24 Sep 2013 14:13:10 +0400 Subject: [PATCH 12/17] refactored arithm.cpp. 
Extended param list for all the tests --- modules/ocl/src/arithm.cpp | 46 +- modules/ocl/test/test_arithm.cpp | 1209 ++++++++++++++++-------------- 2 files changed, 656 insertions(+), 599 deletions(-) diff --git a/modules/ocl/src/arithm.cpp b/modules/ocl/src/arithm.cpp index b18fa44dc..035cea781 100644 --- a/modules/ocl/src/arithm.cpp +++ b/modules/ocl/src/arithm.cpp @@ -97,9 +97,6 @@ namespace cv extern const char *arithm_setidentity; } } -////////////////////////////////////////////////////////////////////////////// -/////////////////////// add subtract multiply divide ///////////////////////// -////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////// /////////////////////// add subtract multiply divide ///////////////////////// @@ -251,7 +248,7 @@ void cv::ocl::divide(double scalar, const oclMat &src, oclMat &dst) } ////////////////////////////////////////////////////////////////////////////// -///////////////////////////////// Absdiff /////////////////////////////////// +///////////////////////////////// Absdiff //////////////////////////////////// ////////////////////////////////////////////////////////////////////////////// void cv::ocl::absdiff(const oclMat &src1, const oclMat &src2, oclMat &dst) @@ -430,9 +427,11 @@ Scalar cv::ocl::sqrSum(const oclMat &src) func = functab[(int)src.clCxt->supportsFeature(Context::CL_DOUBLE)]; return func(src, 2); } + ////////////////////////////////////////////////////////////////////////////// //////////////////////////////// meanStdDev ////////////////////////////////// ////////////////////////////////////////////////////////////////////////////// + void cv::ocl::meanStdDev(const oclMat &src, Scalar &mean, Scalar &stddev) { CV_Assert(src.depth() <= CV_32S); @@ -441,8 +440,10 @@ void cv::ocl::meanStdDev(const oclMat &src, Scalar &mean, Scalar &stddev) Mat m1(sz, CV_MAKETYPE(CV_32S, channels), cv::Scalar::all(0)), m2(sz, CV_MAKETYPE(CV_32S, channels), cv::Scalar::all(0)); oclMat dst1(m1), dst2(m2); + //arithmetic_sum_run(src, dst1,"arithm_op_sum"); //arithmetic_sum_run(src, dst2,"arithm_op_squares_sum"); + m1 = (Mat)dst1; m2 = (Mat)dst2; int i = 0, *p = (int *)m1.data, *q = (int *)m2.data; @@ -456,6 +457,7 @@ void cv::ocl::meanStdDev(const oclMat &src, Scalar &mean, Scalar &stddev) ////////////////////////////////////////////////////////////////////////////// //////////////////////////////////// minMax ///////////////////////////////// ////////////////////////////////////////////////////////////////////////////// + static void arithmetic_minMax_run(const oclMat &src, const oclMat &mask, cl_mem &dst, int vlen , int groupnum, string kernelName) { vector > args; @@ -572,6 +574,7 @@ void cv::ocl::minMax(const oclMat &src, double *minVal, double *maxVal, const oc oclMat buf; minMax_buf(src, minVal, maxVal, mask, buf); } + void cv::ocl::minMax_buf(const oclMat &src, double *minVal, double *maxVal, const oclMat &mask, oclMat &buf) { CV_Assert(src.oclchannels() == 1); @@ -598,6 +601,7 @@ void cv::ocl::minMax_buf(const oclMat &src, double *minVal, double *maxVal, cons ////////////////////////////////////////////////////////////////////////////// /////////////////////////////////// norm ///////////////////////////////////// ////////////////////////////////////////////////////////////////////////////// + double cv::ocl::norm(const oclMat &src1, int normType) { return norm(src1, oclMat(src1.size(), src1.type(), Scalar::all(0)), normType); @@ -657,6 +661,7 @@ 
double cv::ocl::norm(const oclMat &src1, const oclMat &src2, int normType) ////////////////////////////////////////////////////////////////////////////// ////////////////////////////////// flip ////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////// + static void arithmetic_flip_rows_run(const oclMat &src, oclMat &dst, string kernelName) { if(!src.clCxt->supportsFeature(Context::CL_DOUBLE) && src.type() == CV_64F) @@ -703,6 +708,7 @@ static void arithmetic_flip_rows_run(const oclMat &src, oclMat &dst, string kern openCLExecuteKernel(clCxt, &arithm_flip, kernelName, globalThreads, localThreads, args, -1, depth); } + static void arithmetic_flip_cols_run(const oclMat &src, oclMat &dst, string kernelName, bool isVertical) { if(!src.clCxt->supportsFeature(Context::CL_DOUBLE) && src.type() == CV_64F) @@ -755,6 +761,7 @@ static void arithmetic_flip_cols_run(const oclMat &src, oclMat &dst, string kern openCLExecuteKernel(clCxt, kernelString, kernelName, globalThreads, localThreads, args, src.oclchannels(), depth); } + void cv::ocl::flip(const oclMat &src, oclMat &dst, int flipCode) { dst.create(src.size(), src.type()); @@ -868,6 +875,7 @@ void cv::ocl::log(const oclMat &src, oclMat &dst) ////////////////////////////////////////////////////////////////////////////// ////////////////////////////// magnitude phase /////////////////////////////// ////////////////////////////////////////////////////////////////////////////// + static void arithmetic_magnitude_phase_run(const oclMat &src1, const oclMat &src2, oclMat &dst, string kernelName) { if(!src1.clCxt->supportsFeature(Context::CL_DOUBLE) && src1.type() == CV_64F) @@ -951,6 +959,7 @@ static void arithmetic_phase_run(const oclMat &src1, const oclMat &src2, oclMat openCLExecuteKernel(clCxt, kernelString, kernelName, globalThreads, localThreads, args, -1, depth); } + void cv::ocl::phase(const oclMat &x, const oclMat &y, oclMat &Angle , bool angleInDegrees) { CV_Assert(x.type() == y.type() && x.size() == y.size() && (x.depth() == CV_32F || x.depth() == CV_64F)); @@ -965,6 +974,7 @@ void cv::ocl::phase(const oclMat &x, const oclMat &y, oclMat &Angle , bool angle ////////////////////////////////////////////////////////////////////////////// ////////////////////////////////// cartToPolar /////////////////////////////// ////////////////////////////////////////////////////////////////////////////// + static void arithmetic_cartToPolar_run(const oclMat &src1, const oclMat &src2, oclMat &dst_mag, oclMat &dst_cart, string kernelName, bool angleInDegrees) { @@ -1003,6 +1013,7 @@ static void arithmetic_cartToPolar_run(const oclMat &src1, const oclMat &src2, o openCLExecuteKernel(clCxt, &arithm_cartToPolar, kernelName, globalThreads, localThreads, args, -1, depth); } + void cv::ocl::cartToPolar(const oclMat &x, const oclMat &y, oclMat &mag, oclMat &angle, bool angleInDegrees) { CV_Assert(x.type() == y.type() && x.size() == y.size() && (x.depth() == CV_32F || x.depth() == CV_64F)); @@ -1016,6 +1027,7 @@ void cv::ocl::cartToPolar(const oclMat &x, const oclMat &y, oclMat &mag, oclMat ////////////////////////////////////////////////////////////////////////////// ////////////////////////////////// polarToCart /////////////////////////////// ////////////////////////////////////////////////////////////////////////////// + static void arithmetic_ptc_run(const oclMat &src1, const oclMat &src2, oclMat &dst1, oclMat &dst2, bool angleInDegrees, string kernelName) { @@ -1078,6 +1090,7 @@ void 
cv::ocl::polarToCart(const oclMat &magnitude, const oclMat &angle, oclMat &
 //////////////////////////////////////////////////////////////////////////////
 /////////////////////////////////// minMaxLoc ////////////////////////////////
 //////////////////////////////////////////////////////////////////////////////
+
 static void arithmetic_minMaxLoc_run(const oclMat &src, cl_mem &dst, int vlen , int groupnum)
 {
     vector<pair<size_t , const void *> > args;
@@ -1127,8 +1140,7 @@ static void arithmetic_minMaxLoc_mask_run(const oclMat &src, const oclMat &mask,
         args.push_back( make_pair( sizeof(cl_int) , (void *)&moffset ));
         args.push_back( make_pair( sizeof(cl_mem) , (void *)&mask.data ));
         args.push_back( make_pair( sizeof(cl_mem) , (void *)&dst ));
-        // printf("elemnum:%d,cols:%d,invalid_cols:%d,offset:%d,minvalid_cols:%d,moffset:%d,repeat_e:%d\r\n",
-        //        elemnum,cols,invalid_cols,offset,minvalid_cols,moffset,repeat_me);
+
         openCLExecuteKernel(src.clCxt, &arithm_minMaxLoc_mask, "arithm_op_minMaxLoc_mask", gt, lt, args, -1, -1, build_options);
     }
 }
@@ -1144,14 +1156,12 @@ void arithmetic_minMaxLoc(const oclMat &src, double *minVal, double *maxVal,
     Context *clCxt = src.clCxt;
     cl_mem dstBuffer = openCLCreateBuffer(clCxt, CL_MEM_WRITE_ONLY, dbsize);
     *minVal = std::numeric_limits<T>::max() , *maxVal = -std::numeric_limits<T>::max();
+
     if (mask.empty())
-    {
         arithmetic_minMaxLoc_run(src, dstBuffer, vlen, groupnum);
-    }
     else
-    {
         arithmetic_minMaxLoc_mask_run(src, mask, dstBuffer, vlen, groupnum);
-    }
+
     T *p = new T[groupnum * vlen * 4];
     memset(p, 0, dbsize);
     openCLReadBuffer(clCxt, dstBuffer, (void *)p, dbsize);
@@ -1190,18 +1200,22 @@ void arithmetic_minMaxLoc(const oclMat &src, double *minVal, double *maxVal,
         maxLoc->x = maxLoc->y = -1;
     }
     delete[] p;
+
     openCLSafeCall(clReleaseMemObject(dstBuffer));
 }

 typedef void (*minMaxLocFunc)(const oclMat &src, double *minVal, double *maxVal,
                               Point *minLoc, Point *maxLoc, const oclMat &mask);
+
 void cv::ocl::minMaxLoc(const oclMat &src, double *minVal, double *maxVal,
                         Point *minLoc, Point *maxLoc, const oclMat &mask)
 {
     if(!src.clCxt->supportsFeature(Context::CL_DOUBLE) && src.depth() == CV_64F)
     {
         CV_Error(CV_GpuNotSupported, "select device don't support double");
+        return;
     }
+
     static minMaxLocFunc functab[2] =
     {
         arithmetic_minMaxLoc<int>,
@@ -1216,6 +1230,7 @@ void cv::ocl::minMaxLoc(const oclMat &src, double *minVal, double *maxVal,
 //////////////////////////////////////////////////////////////////////////////
 ///////////////////////////// countNonZero ///////////////////////////////////
 //////////////////////////////////////////////////////////////////////////////
+
 static void arithmetic_countNonZero_run(const oclMat &src, cl_mem &dst, int vlen , int groupnum, string kernelName)
 {
     vector<pair<size_t , const void *> > args;
@@ -1262,9 +1277,8 @@ int cv::ocl::countNonZero(const oclMat &src)
     memset(p, 0, dbsize * sizeof(int));
     openCLReadBuffer(clCxt, dstBuffer, (void *)p, dbsize * sizeof(int));
     for(int i = 0; i < dbsize; i++)
-    {
         nonzero += p[i];
-    }
+
     delete[] p;
     openCLSafeCall(clReleaseMemObject(dstBuffer));
     return nonzero;
@@ -1663,11 +1677,10 @@ static void arithmetic_pow_run(const oclMat &src1, double p, oclMat &dst, string
     args.push_back( make_pair( sizeof(cl_int), (void *)&dst.rows ));
     args.push_back( make_pair( sizeof(cl_int), (void *)&cols ));
     args.push_back( make_pair( sizeof(cl_int), (void *)&dst_step1 ));
-    float pf = p;
+
+    float pf = static_cast<float>(p);
     if(!src1.clCxt->supportsFeature(Context::CL_DOUBLE))
-    {
         args.push_back( make_pair( sizeof(cl_float), (void *)&pf ));
-    }
     else
         args.push_back( make_pair( sizeof(cl_double),
(void *)&p )); @@ -1733,7 +1746,8 @@ void cv::ocl::setIdentity(oclMat& src, double scalar) { scalar_i = (int)scalar; args.push_back(make_pair(sizeof(cl_int), (void*)&scalar_i)); - }else + } + else args.push_back(make_pair(sizeof(cl_double), (void*)&scalar)); } else diff --git a/modules/ocl/test/test_arithm.cpp b/modules/ocl/test/test_arithm.cpp index 43afd1342..9b20dbf89 100644 --- a/modules/ocl/test/test_arithm.cpp +++ b/modules/ocl/test/test_arithm.cpp @@ -50,10 +50,6 @@ // //M*/ -//#define PRINT_CPU_TIME 1000 -//#define PRINT_TIME - - #include "test_precomp.hpp" #include @@ -65,392 +61,506 @@ using namespace cvtest; using namespace testing; using namespace std; -PARAM_TEST_CASE(ArithmTestBase, MatType, bool) +//////////////////////////////// LUT ///////////////////////////////////////////////// + +PARAM_TEST_CASE(Lut, int, int, bool, bool) { - int type; - cv::Scalar val; + int lut_depth; + int cn; + bool use_roi, same_cn; - //src mat - cv::Mat mat1; - cv::Mat mat2; - cv::Mat mask; + // src mat + cv::Mat src; + cv::Mat lut; cv::Mat dst; - cv::Mat dst1; //bak, for two outputs - // set up roi - int roicols; - int roirows; - int src1x; - int src1y; - int src2x; - int src2y; - int dstx; - int dsty; - int maskx; - int masky; - - //src mat with roi - cv::Mat mat1_roi; - cv::Mat mat2_roi; - cv::Mat mask_roi; + // src mat with roi + cv::Mat src_roi; + cv::Mat lut_roi; cv::Mat dst_roi; - cv::Mat dst1_roi; //bak - //ocl dst mat for testing + // ocl dst mat for testing cv::ocl::oclMat gdst_whole; - cv::ocl::oclMat gdst1_whole; //bak - //ocl mat with roi - cv::ocl::oclMat gmat1; - cv::ocl::oclMat gmat2; + // ocl mat with roi + cv::ocl::oclMat gsrc; + cv::ocl::oclMat glut; cv::ocl::oclMat gdst; - cv::ocl::oclMat gdst1; //bak - cv::ocl::oclMat gmask; virtual void SetUp() { - type = GET_PARAM(0); + lut_depth = GET_PARAM(0); + cn = GET_PARAM(1); + same_cn = GET_PARAM(2); + use_roi = GET_PARAM(3); + + const int src_type = CV_MAKE_TYPE(CV_8U, cn); + const int lut_type = CV_MAKE_TYPE(lut_depth, same_cn ? cn : 1); + const int dst_type = CV_MAKE_TYPE(lut_depth, cn); cv::RNG &rng = TS::ptr()->get_rng(); - cv::Size size(MWIDTH, MHEIGHT); - - mat1 = randomMat(rng, size, type, 5, 16, false); - //mat2 = randomMat(rng, size, type, 5, 16, false); - mat2 = randomMat(rng, size, type, 5, 16, false); - dst = randomMat(rng, size, type, 5, 16, false); - dst1 = randomMat(rng, size, type, 5, 16, false); - mask = randomMat(rng, size, CV_8UC1, 0, 2, false); - - cv::threshold(mask, mask, 0.5, 255., CV_8UC1); - - val = cv::Scalar(rng.uniform(-10.0, 10.0), rng.uniform(-10.0, 10.0), rng.uniform(-10.0, 10.0), rng.uniform(-10.0, 10.0)); - + src = randomMat(rng, randomSize(MIN_VALUE, MAX_VALUE), src_type, 0, 256, false); + lut = randomMat(rng, use_roi ? randomSize(260, 300) : Size(256, 1), lut_type, 5, 16, false); + dst = randomMat(rng, use_roi ? 
randomSize(MIN_VALUE, MAX_VALUE) : src.size(), dst_type, 5, 16, false); } void random_roi() { -#ifdef RANDOMROI - //randomize ROI - cv::RNG &rng = TS::ptr()->get_rng(); - roicols = rng.uniform(1, mat1.cols); - roirows = rng.uniform(1, mat1.rows); - src1x = rng.uniform(0, mat1.cols - roicols); - src1y = rng.uniform(0, mat1.rows - roirows); - dstx = rng.uniform(0, dst.cols - roicols); - dsty = rng.uniform(0, dst.rows - roirows); - maskx = rng.uniform(0, mask.cols - roicols); - masky = rng.uniform(0, mask.rows - roirows); - src2x = rng.uniform(0, mat2.cols - roicols); - src2y = rng.uniform(0, mat2.rows - roirows); -#else - roicols = mat1.cols; - roirows = mat1.rows; - src1x = 0; - src1y = 0; - dstx = 0; - dsty = 0; - maskx = 0; - masky = 0; - src2x = 0; - src2y = 0; -#endif - mat1_roi = mat1(Rect(src1x, src1y, roicols, roirows)); - mat2_roi = mat2(Rect(src2x, src2y, roicols, roirows)); - mask_roi = mask(Rect(maskx, masky, roicols, roirows)); - dst_roi = dst(Rect(dstx, dsty, roicols, roirows)); - dst1_roi = dst1(Rect(dstx, dsty, roicols, roirows)); + // set up roi + int roicols, roirows; + int srcx, srcy; + int lutx, luty; + int dstx, dsty; + + if (use_roi) + { + // randomize ROI + cv::RNG &rng = TS::ptr()->get_rng(); + + roicols = rng.uniform(1, MIN_VALUE); + roirows = rng.uniform(1, MIN_VALUE); + + srcx = rng.uniform(0, src.cols - roicols); + srcy = rng.uniform(0, src.rows - roirows); + lutx = rng.uniform(0, lut.cols - 256); + luty = rng.uniform(0, lut.rows - 1); + + dstx = rng.uniform(0, dst.cols - roicols); + dsty = rng.uniform(0, dst.rows - roirows); + } + else + { + roicols = src.cols; + roirows = src.rows; + srcx = srcy = 0; + lutx = luty = 0; + dstx = dsty = 0; + } + + src_roi = src(Rect(srcx, srcy, roicols, roirows)); + lut_roi = lut(Rect(lutx, luty, 256, 1)); + dst_roi = dst(Rect(dstx, dsty, roicols, roirows)); gdst_whole = dst; gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows)); - gdst1_whole = dst1; - gdst1 = gdst1_whole(Rect(dstx, dsty, roicols, roirows)); - - gmat1 = mat1_roi; - gmat2 = mat2_roi; - gmask = mask_roi; //end + gsrc = src_roi; + glut = lut_roi; } void Near(double threshold = 0.) 
{ EXPECT_MAT_NEAR(dst, Mat(gdst_whole), threshold); + EXPECT_MAT_NEAR(dst_roi, Mat(gdst), threshold); + } +}; + +TEST_P(Lut, Mat) +{ + for (int j = 0; j < LOOP_TIMES; j++) + { + random_roi(); + + cv::LUT(src_roi, lut_roi, dst_roi); + cv::ocl::LUT(gsrc, glut, gdst); + + Near(); + } +} + +///////////////////////// ArithmTestBase /////////////////////////// + +PARAM_TEST_CASE(ArithmTestBase, int, int, bool) +{ + int depth; + int cn; + bool use_roi; + cv::Scalar val; + + // src mat + cv::Mat src1; + cv::Mat src2; + cv::Mat mask; + cv::Mat dst1; + cv::Mat dst2; // for two outputs + + // set up roi + int roicols, roirows; + int src1x, src1y; + int src2x, src2y; + int dst1x, dst1y; + int dst2x, dst2y; + int maskx, masky; + + // src mat with roi + cv::Mat src1_roi; + cv::Mat src2_roi; + cv::Mat mask_roi; + cv::Mat dst1_roi; + cv::Mat dst2_roi; // for two outputs + + // ocl dst mat for testing + cv::ocl::oclMat gdst1_whole; + cv::ocl::oclMat gdst2_whole; // for two outputs + + // ocl mat with roi + cv::ocl::oclMat gsrc1; + cv::ocl::oclMat gsrc2; + cv::ocl::oclMat gdst1; + cv::ocl::oclMat gdst2; // for two outputs + cv::ocl::oclMat gmask; + + virtual void SetUp() + { + depth = GET_PARAM(0); + cn = GET_PARAM(1); + use_roi = GET_PARAM(2); + const int type = CV_MAKE_TYPE(depth, cn); + + cv::RNG &rng = TS::ptr()->get_rng(); + + src1 = randomMat(rng, randomSize(MIN_VALUE, MAX_VALUE), type, 5, 16, false); + src2 = randomMat(rng, !use_roi ? src1.size() : randomSize(MIN_VALUE, MAX_VALUE), type, -15440, 14450, false); + dst1 = randomMat(rng, !use_roi ? src1.size() : randomSize(MIN_VALUE, MAX_VALUE), type, 5, 16, false); + dst2 = randomMat(rng, !use_roi ? src1.size() : randomSize(MIN_VALUE, MAX_VALUE), type, 5, 16, false); + mask = randomMat(rng, !use_roi ? src1.size() : randomSize(MIN_VALUE, MAX_VALUE), CV_8UC1, 0, 2, false); + + cv::threshold(mask, mask, 0.5, 255., CV_8UC1); + + val = cv::Scalar(rng.uniform(-100.0, 100.0), rng.uniform(-100.0, 100.0), + rng.uniform(-100.0, 100.0), rng.uniform(-100.0, 100.0)); + } + + void random_roi() + { + if (use_roi) + { + // randomize ROI + cv::RNG &rng = TS::ptr()->get_rng(); + + roicols = rng.uniform(1, MIN_VALUE); + roirows = rng.uniform(1, MIN_VALUE); + + src1x = rng.uniform(0, src1.cols - roicols); + src1y = rng.uniform(0, src1.rows - roirows); + src2x = rng.uniform(0, src2.cols - roicols); + src2y = rng.uniform(0, src2.rows - roirows); + + dst1x = rng.uniform(0, dst1.cols - roicols); + dst1y = rng.uniform(0, dst1.rows - roirows); + dst2x = rng.uniform(0, dst2.cols - roicols); + dst2y = rng.uniform(0, dst2.rows - roirows); + + maskx = rng.uniform(0, mask.cols - roicols); + masky = rng.uniform(0, mask.rows - roirows); + } + else + { + roicols = src1.cols; + roirows = src1.rows; + src1x = src1y = 0; + src2x = src2y = 0; + dst1x = dst1y = 0; + dst2x = dst2y = 0; + maskx = masky = 0; + } + + src1_roi = src1(Rect(src1x, src1y, roicols, roirows)); + src2_roi = src2(Rect(src2x, src2y, roicols, roirows)); + mask_roi = mask(Rect(maskx, masky, roicols, roirows)); + dst1_roi = dst1(Rect(dst1x, dst1y, roicols, roirows)); + dst2_roi = dst2(Rect(dst2x, dst2y, roicols, roirows)); + + gdst1_whole = dst1; + gdst1 = gdst1_whole(Rect(dst1x, dst1y, roicols, roirows)); + + gdst2_whole = dst2; + gdst2 = gdst2_whole(Rect(dst2x, dst2y, roicols, roirows)); + + gsrc1 = src1_roi; + gsrc2 = src2_roi; + gmask = mask_roi; + } + + void Near(double threshold = 0.) 
+ { + EXPECT_MAT_NEAR(dst1, Mat(gdst1_whole), threshold); + EXPECT_MAT_NEAR(dst1_roi, Mat(gdst1), threshold); } void Near1(double threshold = 0.) { - EXPECT_MAT_NEAR(dst1, Mat(gdst1_whole), threshold); + EXPECT_MAT_NEAR(dst2, Mat(gdst2_whole), threshold); + EXPECT_MAT_NEAR(dst2_roi, Mat(gdst2), threshold); } - }; -////////////////////////////////lut///////////////////////////////////////////////// -struct Lut : ArithmTestBase {}; -#define VARNAME(A) string(#A); +//////////////////////////////// Exp ///////////////////////////////////////////////// -TEST_P(Lut, Mat) -{ - - cv::Mat mat2(3, 512, CV_8UC1); - cv::RNG &rng = TS::ptr()->get_rng(); - rng.fill(mat2, cv::RNG::UNIFORM, cv::Scalar::all(0), cv::Scalar::all(256)); - - for(int j = 0; j < LOOP_TIMES; j ++) - { - random_roi(); - - src2x = rng.uniform( 0, mat2.cols - 256); - src2y = rng.uniform (0, mat2.rows - 1); - - cv::Mat mat2_roi = mat2(Rect(src2x, src2y, 256, 1)); - - cv::ocl::oclMat gmat2(mat2_roi); - - cv::LUT(mat1_roi, mat2_roi, dst_roi); - cv::ocl::LUT(gmat1, gmat2, gdst); - Near(0); - } -} - - -////////////////////////////////exp///////////////////////////////////////////////// -struct Exp : ArithmTestBase {}; +typedef ArithmTestBase Exp; TEST_P(Exp, Mat) { - for(int j = 0; j < LOOP_TIMES; j++) + for (int j = 0; j < LOOP_TIMES; j++) { random_roi(); - cv::exp(mat1_roi, dst_roi); - cv::ocl::exp(gmat1, gdst); + cv::exp(src1_roi, dst1_roi); + cv::ocl::exp(gsrc1, gdst1); + Near(2); } } +//////////////////////////////// Log ///////////////////////////////////////////////// -////////////////////////////////log///////////////////////////////////////////////// -struct Log : ArithmTestBase {}; +typedef ArithmTestBase Log; TEST_P(Log, Mat) { - for(int j = 0; j < LOOP_TIMES; j++) + for (int j = 0; j < LOOP_TIMES; j++) { random_roi(); - cv::log(mat1_roi, dst_roi); - cv::ocl::log(gmat1, gdst); + cv::log(src1_roi, dst1_roi); + cv::ocl::log(gsrc1, gdst1); Near(1); } } +//////////////////////////////// Add ///////////////////////////////////////////////// -////////////////////////////////add///////////////////////////////////////////////// -struct Add : ArithmTestBase {}; +typedef ArithmTestBase Add; TEST_P(Add, Mat) { - for(int j = 0; j < LOOP_TIMES; j++) + for (int j = 0; j < LOOP_TIMES; j++) { random_roi(); - cv::add(mat1_roi, mat2_roi, dst_roi); - cv::ocl::add(gmat1, gmat2, gdst); + cv::add(src1_roi, src2_roi, dst1_roi); + cv::ocl::add(gsrc1, gsrc2, gdst1); Near(0); } } TEST_P(Add, Mat_Mask) { - for(int j = 0; j < LOOP_TIMES; j++) + for (int j = 0; j < LOOP_TIMES; j++) { random_roi(); - cv::add(mat1_roi, mat2_roi, dst_roi, mask_roi); - cv::ocl::add(gmat1, gmat2, gdst, gmask); + cv::add(src1_roi, src2_roi, dst1_roi, mask_roi); + cv::ocl::add(gsrc1, gsrc2, gdst1, gmask); Near(0); } } TEST_P(Add, Scalar) { - for(int j = 0; j < LOOP_TIMES; j++) + for (int j = 0; j < LOOP_TIMES; j++) { random_roi(); - cv::add(mat1_roi, val, dst_roi); - cv::ocl::add(gmat1, val, gdst); + cv::add(src1_roi, val, dst1_roi); + cv::ocl::add(gsrc1, val, gdst1); Near(1e-5); } } TEST_P(Add, Scalar_Mask) { - for(int j = 0; j < LOOP_TIMES; j++) + for (int j = 0; j < LOOP_TIMES; j++) { random_roi(); - cv::add(mat1_roi, val, dst_roi, mask_roi); - cv::ocl::add(gmat1, val, gdst, gmask); + cv::add(src1_roi, val, dst1_roi, mask_roi); + cv::ocl::add(gsrc1, val, gdst1, gmask); Near(1e-5); } } +//////////////////////////////// Sub ///////////////////////////////////////////////// - -////////////////////////////////sub///////////////////////////////////////////////// -struct Sub : 
ArithmTestBase {}; +typedef ArithmTestBase Sub; TEST_P(Sub, Mat) { - for(int j = 0; j < LOOP_TIMES; j++) + for (int j = 0; j < LOOP_TIMES; j++) { random_roi(); - cv::subtract(mat1_roi, mat2_roi, dst_roi); - cv::ocl::subtract(gmat1, gmat2, gdst); + cv::subtract(src1_roi, src2_roi, dst1_roi); + cv::ocl::subtract(gsrc1, gsrc2, gdst1); + Near(0); } } TEST_P(Sub, Mat_Mask) { - for(int j = 0; j < LOOP_TIMES; j++) + for (int j = 0; j < LOOP_TIMES; j++) { random_roi(); - cv::subtract(mat1_roi, mat2_roi, dst_roi, mask_roi); - cv::ocl::subtract(gmat1, gmat2, gdst, gmask); + cv::subtract(src1_roi, src2_roi, dst1_roi, mask_roi); + cv::ocl::subtract(gsrc1, gsrc2, gdst1, gmask); Near(0); } } TEST_P(Sub, Scalar) { - for(int j = 0; j < LOOP_TIMES; j++) + for (int j = 0; j < LOOP_TIMES; j++) { random_roi(); - cv::subtract(mat1_roi, val, dst_roi); - cv::ocl::subtract(gmat1, val, gdst); + cv::subtract(src1_roi, val, dst1_roi); + cv::ocl::subtract(gsrc1, val, gdst1); + Near(1e-5); } } TEST_P(Sub, Scalar_Mask) { - for(int j = 0; j < LOOP_TIMES; j++) + for (int j = 0; j < LOOP_TIMES; j++) { random_roi(); - cv::subtract(mat1_roi, val, dst_roi, mask_roi); - cv::ocl::subtract(gmat1, val, gdst, gmask); + cv::subtract(src1_roi, val, dst1_roi, mask_roi); + cv::ocl::subtract(gsrc1, val, gdst1, gmask); Near(1e-5); } } +//////////////////////////////// Mul ///////////////////////////////////////////////// - -////////////////////////////////Mul///////////////////////////////////////////////// -struct Mul : ArithmTestBase {}; +typedef ArithmTestBase Mul; TEST_P(Mul, Mat) { - for(int j = 0; j < LOOP_TIMES; j++) + for (int j = 0; j < LOOP_TIMES; j++) { random_roi(); - cv::multiply(mat1_roi, mat2_roi, dst_roi); - cv::ocl::multiply(gmat1, gmat2, gdst); + cv::multiply(src1_roi, src2_roi, dst1_roi); + cv::ocl::multiply(gsrc1, gsrc2, gdst1); Near(0); } } -TEST_P(Mul, Mat_Scalar) +TEST_P(Mul, Scalar) { - for(int j = 0; j < LOOP_TIMES; j++) + for (int j = 0; j < LOOP_TIMES; j++) { random_roi(); - cv::RNG &rng = TS::ptr()->get_rng(); - double s = rng.uniform(-10.0, 10.0); + cv::multiply(val[0], src1_roi, dst1_roi); + cv::ocl::multiply(val[0], gsrc1, gdst1); - cv::multiply(mat1_roi, mat2_roi, dst_roi, s); - cv::ocl::multiply(gmat1, gmat2, gdst, s); - Near(.001); + Near(gdst1.depth() >= CV_32F ? 1e-3 : 1); } } - -struct Div : ArithmTestBase {}; - -TEST_P(Div, Mat) +TEST_P(Mul, Mat_Scalar) { - for(int j = 0; j < LOOP_TIMES; j++) + for (int j = 0; j < LOOP_TIMES; j++) { random_roi(); - cv::divide(mat1_roi, mat2_roi, dst_roi); - cv::ocl::divide(gmat1, gmat2, gdst); + cv::multiply(src1_roi, src2_roi, dst1_roi, val[0]); + cv::ocl::multiply(gsrc1, gsrc2, gdst1, val[0]); + + Near(gdst1.depth() >= CV_32F ? 1e-3 : 1); + } +} + +//////////////////////////////// Div ///////////////////////////////////////////////// + +typedef ArithmTestBase Div; + +TEST_P(Div, Mat) +{ + for (int j = 0; j < LOOP_TIMES; j++) + { + random_roi(); + + cv::divide(src1_roi, src2_roi, dst1_roi); + cv::ocl::divide(gsrc1, gsrc2, gdst1); Near(1); } } -TEST_P(Div, Mat_Scalar) +TEST_P(Div, Scalar) { - for(int j = 0; j < LOOP_TIMES; j++) + for (int j = 0; j < LOOP_TIMES; j++) { random_roi(); - cv::RNG &rng = TS::ptr()->get_rng(); - double s = rng.uniform(-10.0, 10.0); + cv::divide(val[0], src1_roi, dst1_roi); + cv::ocl::divide(val[0], gsrc1, gdst1); - cv::divide(mat1_roi, mat2_roi, dst_roi, s); - cv::ocl::divide(gmat1, gmat2, gdst, s); - Near(.001); + Near(gdst1.depth() >= CV_32F ? 
1e-3 : 1); } } -struct Absdiff : ArithmTestBase {}; - -TEST_P(Absdiff, Mat) +TEST_P(Div, Mat_Scalar) { - for(int j = 0; j < LOOP_TIMES; j++) + for (int j = 0; j < LOOP_TIMES; j++) { random_roi(); - cv::absdiff(mat1_roi, mat2_roi, dst_roi); - cv::ocl::absdiff(gmat1, gmat2, gdst); + cv::divide(src1_roi, src2_roi, dst1_roi, val[0]); + cv::ocl::divide(gsrc1, gsrc2, gdst1, val[0]); + + Near(gdst1.depth() >= CV_32F ? 1e-3 : 1); + } +} + +//////////////////////////////// Absdiff ///////////////////////////////////////////////// + +typedef ArithmTestBase Absdiff; + +TEST_P(Absdiff, Mat) +{ + for (int j = 0; j < LOOP_TIMES; j++) + { + random_roi(); + + cv::absdiff(src1_roi, src2_roi, dst1_roi); + cv::ocl::absdiff(gsrc1, gsrc2, gdst1); Near(0); } } TEST_P(Absdiff, Mat_Scalar) { - for(int j = 0; j < LOOP_TIMES; j++) + for (int j = 0; j < LOOP_TIMES; j++) { random_roi(); - cv::absdiff(mat1_roi, val, dst_roi); - cv::ocl::absdiff(gmat1, val, gdst); + cv::absdiff(src1_roi, val, dst1_roi); + cv::ocl::absdiff(gsrc1, val, gdst1); Near(1e-5); } } +//////////////////////////////// CartToPolar ///////////////////////////////////////////////// - -struct CartToPolar : ArithmTestBase {}; +typedef ArithmTestBase CartToPolar; TEST_P(CartToPolar, angleInDegree) { - for(int j = 0; j < LOOP_TIMES; j++) + for (int j = 0; j < LOOP_TIMES; j++) { random_roi(); - cv::cartToPolar(mat1_roi, mat2_roi, dst_roi, dst1_roi, 1); - cv::ocl::cartToPolar(gmat1, gmat2, gdst, gdst1, 1); + cv::cartToPolar(src1_roi, src2_roi, dst1_roi, dst2_roi, true); + cv::ocl::cartToPolar(gsrc1, gsrc2, gdst1, gdst2, true); Near(.5); Near1(.5); } @@ -458,28 +568,30 @@ TEST_P(CartToPolar, angleInDegree) TEST_P(CartToPolar, angleInRadians) { - for(int j = 0; j < LOOP_TIMES; j++) + for (int j = 0; j < LOOP_TIMES; j++) { random_roi(); - cv::cartToPolar(mat1_roi, mat2_roi, dst_roi, dst1_roi, 0); - cv::ocl::cartToPolar(gmat1, gmat2, gdst, gdst1, 0); + cv::cartToPolar(src1_roi, src2_roi, dst1_roi, dst2_roi); + cv::ocl::cartToPolar(gsrc1, gsrc2, gdst1, gdst2); Near(.5); Near1(.5); } } +//////////////////////////////// PolarToCart ///////////////////////////////////////////////// -struct PolarToCart : ArithmTestBase {}; +typedef ArithmTestBase PolarToCart; TEST_P(PolarToCart, angleInDegree) { - for(int j = 0; j < LOOP_TIMES; j++) + for (int j = 0; j < LOOP_TIMES; j++) { random_roi(); - cv::polarToCart(mat1_roi, mat2_roi, dst_roi, dst1_roi, 1); - cv::ocl::polarToCart(gmat1, gmat2, gdst, gdst1, 1); + cv::polarToCart(src1_roi, src2_roi, dst1_roi, dst2_roi, true); + cv::ocl::polarToCart(gsrc1, gsrc2, gdst1, gdst2, true); + Near(.5); Near1(.5); } @@ -487,144 +599,179 @@ TEST_P(PolarToCart, angleInDegree) TEST_P(PolarToCart, angleInRadians) { - for(int j = 0; j < LOOP_TIMES; j++) + for (int j = 0; j < LOOP_TIMES; j++) { random_roi(); - cv::polarToCart(mat1_roi, mat2_roi, dst_roi, dst1_roi, 0); - cv::ocl::polarToCart(gmat1, gmat2, gdst, gdst1, 0); + cv::polarToCart(src1_roi, src2_roi, dst1_roi, dst2_roi); + cv::ocl::polarToCart(gsrc1, gsrc2, gdst1, gdst2); + Near(.5); Near1(.5); } } +//////////////////////////////// Magnitude ///////////////////////////////////////////////// - - -struct Magnitude : ArithmTestBase {}; +typedef ArithmTestBase Magnitude; TEST_P(Magnitude, Mat) { - for(int j = 0; j < LOOP_TIMES; j++) + for (int j = 0; j < LOOP_TIMES; j++) { random_roi(); - cv::magnitude(mat1_roi, mat2_roi, dst_roi); - cv::ocl::magnitude(gmat1, gmat2, gdst); - Near(1e-5); + cv::magnitude(src1_roi, src2_roi, dst1_roi); + cv::ocl::magnitude(gsrc1, gsrc2, gdst1); + Near(depth 
== CV_64F ? 1e-5 : 1e-2); } } +//////////////////////////////// Transpose ///////////////////////////////////////////////// -struct Transpose : ArithmTestBase {}; +typedef ArithmTestBase Transpose; TEST_P(Transpose, Mat) { - for(int j = 0; j < LOOP_TIMES; j++) + for (int j = 0; j < LOOP_TIMES; j++) { random_roi(); - cv::transpose(mat1_roi, dst_roi); - cv::ocl::transpose(gmat1, gdst); + cv::transpose(src1_roi, dst1_roi); + cv::ocl::transpose(gsrc1, gdst1); + Near(1e-5); } } +TEST_P(Transpose, SquareInplace) +{ + cv::RNG &rng = TS::ptr()->get_rng(); + int value = randomInt(MIN_VALUE, MAX_VALUE); + src1 = randomMat(rng, Size(value, value), CV_MAKE_TYPE(depth, cn), 5, 16, false); -struct Flip : ArithmTestBase {}; + if (use_roi) + { + roirows = roicols = randomInt(1, src1.cols); + + src1x = randomInt(0, src1.cols - roicols); + src1y = randomInt(0, src1.rows - roirows); + } + else + { + roicols = roirows = src1.cols; + src1x = src1y = 0; + } + + Rect r(src1x, src1y, roicols, roirows); + src1_roi = src1(r); + gdst1_whole = src1; + gdst1 = gdst1_whole(r); + + for (int j = 0; j < LOOP_TIMES; j++) + { + cv::transpose(src1_roi, src1_roi); + cv::ocl::transpose(gdst1, gdst1); + + EXPECT_MAT_NEAR(src1, Mat(gdst1_whole), 0.0); + EXPECT_MAT_NEAR(src1_roi, Mat(gdst1), 0.0); + } +} + +//////////////////////////////// Flip ///////////////////////////////////////////////// + +typedef ArithmTestBase Flip; TEST_P(Flip, X) { - for(int j = 0; j < LOOP_TIMES; j++) + for (int j = 0; j < LOOP_TIMES; j++) { random_roi(); - cv::flip(mat1_roi, dst_roi, 0); - cv::ocl::flip(gmat1, gdst, 0); + cv::flip(src1_roi, dst1_roi, 0); + cv::ocl::flip(gsrc1, gdst1, 0); Near(1e-5); } } TEST_P(Flip, Y) { - for(int j = 0; j < LOOP_TIMES; j++) + for (int j = 0; j < LOOP_TIMES; j++) { random_roi(); - cv::flip(mat1_roi, dst_roi, 1); - cv::ocl::flip(gmat1, gdst, 1); + cv::flip(src1_roi, dst1_roi, 1); + cv::ocl::flip(gsrc1, gdst1, 1); Near(1e-5); } } TEST_P(Flip, BOTH) { - for(int j = 0; j < LOOP_TIMES; j++) + for (int j = 0; j < LOOP_TIMES; j++) { random_roi(); - cv::flip(mat1_roi, dst_roi, -1); - cv::ocl::flip(gmat1, gdst, -1); + cv::flip(src1_roi, dst1_roi, -1); + cv::ocl::flip(gsrc1, gdst1, -1); Near(1e-5); } } +//////////////////////////////// MinMax ///////////////////////////////////////////////// -struct MinMax : ArithmTestBase {}; +typedef ArithmTestBase MinMax; TEST_P(MinMax, MAT) { - for(int j = 0; j < LOOP_TIMES; j++) + for (int j = 0; j < LOOP_TIMES; j++) { random_roi(); - double minVal, maxVal; - cv::Point minLoc, maxLoc; - if (mat1.depth() != CV_8S) - { - cv::minMaxLoc(mat1_roi, &minVal, &maxVal, &minLoc, &maxLoc); - } + double minVal, maxVal; + + if (src1.depth() != CV_8S) + cv::minMaxIdx(src1_roi, &minVal, &maxVal, NULL, NULL); else { minVal = std::numeric_limits::max(); maxVal = -std::numeric_limits::max(); - for (int i = 0; i < mat1_roi.rows; ++i) - for (int j = 0; j < mat1_roi.cols; ++j) + for (int i = 0; i < src1_roi.rows; ++i) + for (int j = 0; j < src1_roi.cols; ++j) { - signed char val = mat1_roi.at(i, j); + signed char val = src1_roi.at(i, j); if (val < minVal) minVal = val; - if (val > maxVal) maxVal = val; + else if (val > maxVal) maxVal = val; } } double minVal_, maxVal_; - cv::ocl::minMax(gmat1, &minVal_, &maxVal_); + cv::ocl::minMax(gsrc1, &minVal_, &maxVal_); EXPECT_DOUBLE_EQ(minVal_, minVal); EXPECT_DOUBLE_EQ(maxVal_, maxVal); } } -TEST_P(MinMax, MASK) +TEST_P(MinMax, DISABLED_MASK) { - for(int j = 0; j < LOOP_TIMES; j++) + for (int j = 0; j < LOOP_TIMES; j++) { random_roi(); + double minVal, maxVal; 
cv::Point minLoc, maxLoc; - if (mat1.depth() != CV_8S) - { - cv::minMaxLoc(mat1_roi, &minVal, &maxVal, &minLoc, &maxLoc, mask_roi); - } + if (src1.depth() != CV_8S) + cv::minMaxLoc(src1_roi, &minVal, &maxVal, &minLoc, &maxLoc, mask_roi); else { minVal = std::numeric_limits::max(); maxVal = -std::numeric_limits::max(); - for (int i = 0; i < mat1_roi.rows; ++i) - for (int j = 0; j < mat1_roi.cols; ++j) + for (int i = 0; i < src1_roi.rows; ++i) + for (int j = 0; j < src1_roi.cols; ++j) { - signed char val = mat1_roi.at(i, j); + signed char val = src1_roi.at(i, j); unsigned char m = mask_roi.at(i, j); if (val < minVal && m) minVal = val; if (val > maxVal && m) maxVal = val; @@ -632,36 +779,37 @@ TEST_P(MinMax, MASK) } double minVal_, maxVal_; - cv::ocl::minMax(gmat1, &minVal_, &maxVal_, gmask); + cv::ocl::minMax(gsrc1, &minVal_, &maxVal_, gmask); - EXPECT_DOUBLE_EQ(minVal_, minVal); - EXPECT_DOUBLE_EQ(maxVal_, maxVal); + EXPECT_DOUBLE_EQ(minVal, minVal_); + EXPECT_DOUBLE_EQ(maxVal, maxVal_); } } +//////////////////////////////// MinMaxLoc ///////////////////////////////////////////////// -struct MinMaxLoc : ArithmTestBase {}; +typedef ArithmTestBase MinMaxLoc; TEST_P(MinMaxLoc, MAT) { - for(int j = 0; j < LOOP_TIMES; j++) + for (int j = 0; j < LOOP_TIMES; j++) { random_roi(); + double minVal, maxVal; cv::Point minLoc, maxLoc; - int depth = mat1.depth(); + int depth = src1.depth(); + if (depth != CV_8S) - { - cv::minMaxLoc(mat1_roi, &minVal, &maxVal, &minLoc, &maxLoc); - } + cv::minMaxLoc(src1_roi, &minVal, &maxVal, &minLoc, &maxLoc); else { minVal = std::numeric_limits::max(); maxVal = -std::numeric_limits::max(); - for (int i = 0; i < mat1_roi.rows; ++i) - for (int j = 0; j < mat1_roi.cols; ++j) + for (int i = 0; i < src1_roi.rows; ++i) + for (int j = 0; j < src1_roi.cols; ++j) { - signed char val = mat1_roi.at(i, j); + signed char val = src1_roi.at(i, j); if (val < minVal) { minVal = val; @@ -679,71 +827,71 @@ TEST_P(MinMaxLoc, MAT) double minVal_, maxVal_; cv::Point minLoc_, maxLoc_; - cv::ocl::minMaxLoc(gmat1, &minVal_, &maxVal_, &minLoc_, &maxLoc_, cv::ocl::oclMat()); + cv::ocl::minMaxLoc(gsrc1, &minVal_, &maxVal_, &minLoc_, &maxLoc_, cv::ocl::oclMat()); double error0 = 0., error1 = 0., minlocVal = 0., minlocVal_ = 0., maxlocVal = 0., maxlocVal_ = 0.; - if(depth == 0) + if (depth == 0) { - minlocVal = mat1_roi.at(minLoc); - minlocVal_ = mat1_roi.at(minLoc_); - maxlocVal = mat1_roi.at(maxLoc); - maxlocVal_ = mat1_roi.at(maxLoc_); - error0 = ::abs(mat1_roi.at(minLoc_) - mat1_roi.at(minLoc)); - error1 = ::abs(mat1_roi.at(maxLoc_) - mat1_roi.at(maxLoc)); + minlocVal = src1_roi.at(minLoc); + minlocVal_ = src1_roi.at(minLoc_); + maxlocVal = src1_roi.at(maxLoc); + maxlocVal_ = src1_roi.at(maxLoc_); + error0 = ::abs(src1_roi.at(minLoc_) - src1_roi.at(minLoc)); + error1 = ::abs(src1_roi.at(maxLoc_) - src1_roi.at(maxLoc)); } - if(depth == 1) + if (depth == 1) { - minlocVal = mat1_roi.at(minLoc); - minlocVal_ = mat1_roi.at(minLoc_); - maxlocVal = mat1_roi.at(maxLoc); - maxlocVal_ = mat1_roi.at(maxLoc_); - error0 = ::abs(mat1_roi.at(minLoc_) - mat1_roi.at(minLoc)); - error1 = ::abs(mat1_roi.at(maxLoc_) - mat1_roi.at(maxLoc)); + minlocVal = src1_roi.at(minLoc); + minlocVal_ = src1_roi.at(minLoc_); + maxlocVal = src1_roi.at(maxLoc); + maxlocVal_ = src1_roi.at(maxLoc_); + error0 = ::abs(src1_roi.at(minLoc_) - src1_roi.at(minLoc)); + error1 = ::abs(src1_roi.at(maxLoc_) - src1_roi.at(maxLoc)); } - if(depth == 2) + if (depth == 2) { - minlocVal = mat1_roi.at(minLoc); - minlocVal_ = mat1_roi.at(minLoc_); - 
maxlocVal = mat1_roi.at(maxLoc); - maxlocVal_ = mat1_roi.at(maxLoc_); - error0 = ::abs(mat1_roi.at(minLoc_) - mat1_roi.at(minLoc)); - error1 = ::abs(mat1_roi.at(maxLoc_) - mat1_roi.at(maxLoc)); + minlocVal = src1_roi.at(minLoc); + minlocVal_ = src1_roi.at(minLoc_); + maxlocVal = src1_roi.at(maxLoc); + maxlocVal_ = src1_roi.at(maxLoc_); + error0 = ::abs(src1_roi.at(minLoc_) - src1_roi.at(minLoc)); + error1 = ::abs(src1_roi.at(maxLoc_) - src1_roi.at(maxLoc)); } - if(depth == 3) + if (depth == 3) { - minlocVal = mat1_roi.at(minLoc); - minlocVal_ = mat1_roi.at(minLoc_); - maxlocVal = mat1_roi.at(maxLoc); - maxlocVal_ = mat1_roi.at(maxLoc_); - error0 = ::abs(mat1_roi.at(minLoc_) - mat1_roi.at(minLoc)); - error1 = ::abs(mat1_roi.at(maxLoc_) - mat1_roi.at(maxLoc)); + minlocVal = src1_roi.at(minLoc); + minlocVal_ = src1_roi.at(minLoc_); + maxlocVal = src1_roi.at(maxLoc); + maxlocVal_ = src1_roi.at(maxLoc_); + error0 = ::abs(src1_roi.at(minLoc_) - src1_roi.at(minLoc)); + error1 = ::abs(src1_roi.at(maxLoc_) - src1_roi.at(maxLoc)); } - if(depth == 4) + if (depth == 4) { - minlocVal = mat1_roi.at(minLoc); - minlocVal_ = mat1_roi.at(minLoc_); - maxlocVal = mat1_roi.at(maxLoc); - maxlocVal_ = mat1_roi.at(maxLoc_); - error0 = ::abs(mat1_roi.at(minLoc_) - mat1_roi.at(minLoc)); - error1 = ::abs(mat1_roi.at(maxLoc_) - mat1_roi.at(maxLoc)); + minlocVal = src1_roi.at(minLoc); + minlocVal_ = src1_roi.at(minLoc_); + maxlocVal = src1_roi.at(maxLoc); + maxlocVal_ = src1_roi.at(maxLoc_); + error0 = ::abs(src1_roi.at(minLoc_) - src1_roi.at(minLoc)); + error1 = ::abs(src1_roi.at(maxLoc_) - src1_roi.at(maxLoc)); } - if(depth == 5) + if (depth == 5) { - minlocVal = mat1_roi.at(minLoc); - minlocVal_ = mat1_roi.at(minLoc_); - maxlocVal = mat1_roi.at(maxLoc); - maxlocVal_ = mat1_roi.at(maxLoc_); - error0 = ::abs(mat1_roi.at(minLoc_) - mat1_roi.at(minLoc)); - error1 = ::abs(mat1_roi.at(maxLoc_) - mat1_roi.at(maxLoc)); + minlocVal = src1_roi.at(minLoc); + minlocVal_ = src1_roi.at(minLoc_); + maxlocVal = src1_roi.at(maxLoc); + maxlocVal_ = src1_roi.at(maxLoc_); + error0 = ::abs(src1_roi.at(minLoc_) - src1_roi.at(minLoc)); + error1 = ::abs(src1_roi.at(maxLoc_) - src1_roi.at(maxLoc)); } - if(depth == 6) + if (depth == 6) { - minlocVal = mat1_roi.at(minLoc); - minlocVal_ = mat1_roi.at(minLoc_); - maxlocVal = mat1_roi.at(maxLoc); - maxlocVal_ = mat1_roi.at(maxLoc_); - error0 = ::abs(mat1_roi.at(minLoc_) - mat1_roi.at(minLoc)); - error1 = ::abs(mat1_roi.at(maxLoc_) - mat1_roi.at(maxLoc)); + minlocVal = src1_roi.at(minLoc); + minlocVal_ = src1_roi.at(minLoc_); + maxlocVal = src1_roi.at(maxLoc); + maxlocVal_ = src1_roi.at(maxLoc_); + error0 = ::abs(src1_roi.at(minLoc_) - src1_roi.at(minLoc)); + error1 = ::abs(src1_roi.at(maxLoc_) - src1_roi.at(maxLoc)); } EXPECT_DOUBLE_EQ(minVal_, minVal); @@ -756,27 +904,24 @@ TEST_P(MinMaxLoc, MAT) } } - TEST_P(MinMaxLoc, MASK) { - for(int j = 0; j < LOOP_TIMES; j++) + for (int j = 0; j < LOOP_TIMES; j++) { random_roi(); double minVal, maxVal; cv::Point minLoc, maxLoc; - int depth = mat1.depth(); + int depth = src1.depth(); if (depth != CV_8S) - { - cv::minMaxLoc(mat1_roi, &minVal, &maxVal, &minLoc, &maxLoc, mask_roi); - } + cv::minMaxLoc(src1_roi, &minVal, &maxVal, &minLoc, &maxLoc, mask_roi); else { minVal = std::numeric_limits::max(); maxVal = -std::numeric_limits::max(); - for (int i = 0; i < mat1_roi.rows; ++i) - for (int j = 0; j < mat1_roi.cols; ++j) + for (int i = 0; i < src1_roi.rows; ++i) + for (int j = 0; j < src1_roi.cols; ++j) { - signed char val = mat1_roi.at(i, j); + signed char 
val = src1_roi.at(i, j); unsigned char m = mask_roi.at(i , j); if (val < minVal && m) { @@ -795,72 +940,72 @@ TEST_P(MinMaxLoc, MASK) double minVal_, maxVal_; cv::Point minLoc_, maxLoc_; - cv::ocl::minMaxLoc(gmat1, &minVal_, &maxVal_, &minLoc_, &maxLoc_, gmask); + cv::ocl::minMaxLoc(gsrc1, &minVal_, &maxVal_, &minLoc_, &maxLoc_, gmask); double error0 = 0., error1 = 0., minlocVal = 0., minlocVal_ = 0., maxlocVal = 0., maxlocVal_ = 0.; - if(minLoc_.x == -1 || minLoc_.y == -1 || maxLoc_.x == -1 || maxLoc_.y == -1) continue; - if(depth == 0) + if (minLoc_.x == -1 || minLoc_.y == -1 || maxLoc_.x == -1 || maxLoc_.y == -1) continue; + if (depth == 0) { - minlocVal = mat1_roi.at(minLoc); - minlocVal_ = mat1_roi.at(minLoc_); - maxlocVal = mat1_roi.at(maxLoc); - maxlocVal_ = mat1_roi.at(maxLoc_); - error0 = ::abs(mat1_roi.at(minLoc_) - mat1_roi.at(minLoc)); - error1 = ::abs(mat1_roi.at(maxLoc_) - mat1_roi.at(maxLoc)); + minlocVal = src1_roi.at(minLoc); + minlocVal_ = src1_roi.at(minLoc_); + maxlocVal = src1_roi.at(maxLoc); + maxlocVal_ = src1_roi.at(maxLoc_); + error0 = ::abs(src1_roi.at(minLoc_) - src1_roi.at(minLoc)); + error1 = ::abs(src1_roi.at(maxLoc_) - src1_roi.at(maxLoc)); } - if(depth == 1) + if (depth == 1) { - minlocVal = mat1_roi.at(minLoc); - minlocVal_ = mat1_roi.at(minLoc_); - maxlocVal = mat1_roi.at(maxLoc); - maxlocVal_ = mat1_roi.at(maxLoc_); - error0 = ::abs(mat1_roi.at(minLoc_) - mat1_roi.at(minLoc)); - error1 = ::abs(mat1_roi.at(maxLoc_) - mat1_roi.at(maxLoc)); + minlocVal = src1_roi.at(minLoc); + minlocVal_ = src1_roi.at(minLoc_); + maxlocVal = src1_roi.at(maxLoc); + maxlocVal_ = src1_roi.at(maxLoc_); + error0 = ::abs(src1_roi.at(minLoc_) - src1_roi.at(minLoc)); + error1 = ::abs(src1_roi.at(maxLoc_) - src1_roi.at(maxLoc)); } - if(depth == 2) + if (depth == 2) { - minlocVal = mat1_roi.at(minLoc); - minlocVal_ = mat1_roi.at(minLoc_); - maxlocVal = mat1_roi.at(maxLoc); - maxlocVal_ = mat1_roi.at(maxLoc_); - error0 = ::abs(mat1_roi.at(minLoc_) - mat1_roi.at(minLoc)); - error1 = ::abs(mat1_roi.at(maxLoc_) - mat1_roi.at(maxLoc)); + minlocVal = src1_roi.at(minLoc); + minlocVal_ = src1_roi.at(minLoc_); + maxlocVal = src1_roi.at(maxLoc); + maxlocVal_ = src1_roi.at(maxLoc_); + error0 = ::abs(src1_roi.at(minLoc_) - src1_roi.at(minLoc)); + error1 = ::abs(src1_roi.at(maxLoc_) - src1_roi.at(maxLoc)); } - if(depth == 3) + if (depth == 3) { - minlocVal = mat1_roi.at(minLoc); - minlocVal_ = mat1_roi.at(minLoc_); - maxlocVal = mat1_roi.at(maxLoc); - maxlocVal_ = mat1_roi.at(maxLoc_); - error0 = ::abs(mat1_roi.at(minLoc_) - mat1_roi.at(minLoc)); - error1 = ::abs(mat1_roi.at(maxLoc_) - mat1_roi.at(maxLoc)); + minlocVal = src1_roi.at(minLoc); + minlocVal_ = src1_roi.at(minLoc_); + maxlocVal = src1_roi.at(maxLoc); + maxlocVal_ = src1_roi.at(maxLoc_); + error0 = ::abs(src1_roi.at(minLoc_) - src1_roi.at(minLoc)); + error1 = ::abs(src1_roi.at(maxLoc_) - src1_roi.at(maxLoc)); } - if(depth == 4) + if (depth == 4) { - minlocVal = mat1_roi.at(minLoc); - minlocVal_ = mat1_roi.at(minLoc_); - maxlocVal = mat1_roi.at(maxLoc); - maxlocVal_ = mat1_roi.at(maxLoc_); - error0 = ::abs(mat1_roi.at(minLoc_) - mat1_roi.at(minLoc)); - error1 = ::abs(mat1_roi.at(maxLoc_) - mat1_roi.at(maxLoc)); + minlocVal = src1_roi.at(minLoc); + minlocVal_ = src1_roi.at(minLoc_); + maxlocVal = src1_roi.at(maxLoc); + maxlocVal_ = src1_roi.at(maxLoc_); + error0 = ::abs(src1_roi.at(minLoc_) - src1_roi.at(minLoc)); + error1 = ::abs(src1_roi.at(maxLoc_) - src1_roi.at(maxLoc)); } - if(depth == 5) + if (depth == 5) { - minlocVal = 
mat1_roi.at(minLoc); - minlocVal_ = mat1_roi.at(minLoc_); - maxlocVal = mat1_roi.at(maxLoc); - maxlocVal_ = mat1_roi.at(maxLoc_); - error0 = ::abs(mat1_roi.at(minLoc_) - mat1_roi.at(minLoc)); - error1 = ::abs(mat1_roi.at(maxLoc_) - mat1_roi.at(maxLoc)); + minlocVal = src1_roi.at(minLoc); + minlocVal_ = src1_roi.at(minLoc_); + maxlocVal = src1_roi.at(maxLoc); + maxlocVal_ = src1_roi.at(maxLoc_); + error0 = ::abs(src1_roi.at(minLoc_) - src1_roi.at(minLoc)); + error1 = ::abs(src1_roi.at(maxLoc_) - src1_roi.at(maxLoc)); } - if(depth == 6) + if (depth == 6) { - minlocVal = mat1_roi.at(minLoc); - minlocVal_ = mat1_roi.at(minLoc_); - maxlocVal = mat1_roi.at(maxLoc); - maxlocVal_ = mat1_roi.at(maxLoc_); - error0 = ::abs(mat1_roi.at(minLoc_) - mat1_roi.at(minLoc)); - error1 = ::abs(mat1_roi.at(maxLoc_) - mat1_roi.at(maxLoc)); + minlocVal = src1_roi.at(minLoc); + minlocVal_ = src1_roi.at(minLoc_); + maxlocVal = src1_roi.at(maxLoc); + maxlocVal_ = src1_roi.at(maxLoc_); + error0 = ::abs(src1_roi.at(minLoc_) - src1_roi.at(minLoc)); + error1 = ::abs(src1_roi.at(maxLoc_) - src1_roi.at(maxLoc)); } EXPECT_DOUBLE_EQ(minVal_, minVal); @@ -873,16 +1018,18 @@ TEST_P(MinMaxLoc, MASK) } } +//////////////////////////////// Sum ///////////////////////////////////////////////// -struct Sum : ArithmTestBase {}; +typedef ArithmTestBase Sum; -TEST_P(Sum, MAT) +TEST_P(Sum, DISABLED_MAT) { - for(int j = 0; j < LOOP_TIMES; j++) + for (int j = 0; j < LOOP_TIMES; j++) { random_roi(); - Scalar cpures = cv::sum(mat1_roi); - Scalar gpures = cv::ocl::sum(gmat1); + + Scalar cpures = cv::sum(src1_roi); + Scalar gpures = cv::ocl::sum(gsrc1); //check results EXPECT_NEAR(cpures[0], gpures[0], 0.1); @@ -892,398 +1039,294 @@ TEST_P(Sum, MAT) } } +//////////////////////////////// CountNonZero ///////////////////////////////////////////////// -struct CountNonZero : ArithmTestBase {}; +typedef ArithmTestBase CountNonZero; TEST_P(CountNonZero, MAT) { - for(int j = 0; j < LOOP_TIMES; j++) + for (int j = 0; j < LOOP_TIMES; j++) { random_roi(); - int cpures = cv::countNonZero(mat1_roi); - int gpures = cv::ocl::countNonZero(gmat1); + int cpures = cv::countNonZero(src1_roi); + int gpures = cv::ocl::countNonZero(gsrc1); EXPECT_DOUBLE_EQ((double)cpures, (double)gpures); } } +//////////////////////////////// Phase ///////////////////////////////////////////////// +typedef ArithmTestBase Phase; -////////////////////////////////phase///////////////////////////////////////////////// -struct Phase : ArithmTestBase {}; - -TEST_P(Phase, Mat) +TEST_P(Phase, DISABLED_Mat) { - if(mat1.depth() != CV_32F && mat1.depth() != CV_64F) + for (int angelInDegrees = 0; angelInDegrees < 2; angelInDegrees++) { - cout << "\tUnsupported type\t\n"; - } - for(int angelInDegrees = 0; angelInDegrees < 2; angelInDegrees++) - { - for(int j = 0; j < LOOP_TIMES; j++) + for (int j = 0; j < LOOP_TIMES; j++) { random_roi(); - cv::phase(mat1_roi, mat2_roi, dst_roi, angelInDegrees ? true : false); - cv::ocl::phase(gmat1, gmat2, gdst, angelInDegrees ? true : false); + cv::phase(src1_roi, src2_roi, dst1_roi, angelInDegrees ? true : false); + cv::ocl::phase(gsrc1, gsrc2, gdst1, angelInDegrees ? 
true : false); Near(1e-2); } } } +//////////////////////////////// Bitwise_and ///////////////////////////////////////////////// -////////////////////////////////bitwise_and///////////////////////////////////////////////// -struct Bitwise_and : ArithmTestBase {}; +typedef ArithmTestBase Bitwise_and; TEST_P(Bitwise_and, Mat) { - for(int j = 0; j < LOOP_TIMES; j++) + for (int j = 0; j < LOOP_TIMES; j++) { random_roi(); - cv::bitwise_and(mat1_roi, mat2_roi, dst_roi); - cv::ocl::bitwise_and(gmat1, gmat2, gdst); + cv::bitwise_and(src1_roi, src2_roi, dst1_roi); + cv::ocl::bitwise_and(gsrc1, gsrc2, gdst1); Near(0); } } TEST_P(Bitwise_and, Mat_Mask) { - for(int j = 0; j < LOOP_TIMES; j++) + for (int j = 0; j < LOOP_TIMES; j++) { random_roi(); - cv::bitwise_and(mat1_roi, mat2_roi, dst_roi, mask_roi); - cv::ocl::bitwise_and(gmat1, gmat2, gdst, gmask); + cv::bitwise_and(src1_roi, src2_roi, dst1_roi, mask_roi); + cv::ocl::bitwise_and(gsrc1, gsrc2, gdst1, gmask); Near(0); } } TEST_P(Bitwise_and, Scalar) { - for(int j = 0; j < LOOP_TIMES; j++) + for (int j = 0; j < LOOP_TIMES; j++) { random_roi(); - cv::bitwise_and(mat1_roi, val, dst_roi); - cv::ocl::bitwise_and(gmat1, val, gdst); + cv::bitwise_and(src1_roi, val, dst1_roi); + cv::ocl::bitwise_and(gsrc1, val, gdst1); Near(1e-5); } } TEST_P(Bitwise_and, Scalar_Mask) { - for(int j = 0; j < LOOP_TIMES; j++) + for (int j = 0; j < LOOP_TIMES; j++) { random_roi(); - cv::bitwise_and(mat1_roi, val, dst_roi, mask_roi); - cv::ocl::bitwise_and(gmat1, val, gdst, gmask); + cv::bitwise_and(src1_roi, val, dst1_roi, mask_roi); + cv::ocl::bitwise_and(gsrc1, val, gdst1, gmask); Near(1e-5); } } +//////////////////////////////// Bitwise_or ///////////////////////////////////////////////// - -////////////////////////////////bitwise_or///////////////////////////////////////////////// - -struct Bitwise_or : ArithmTestBase {}; +typedef ArithmTestBase Bitwise_or; TEST_P(Bitwise_or, Mat) { - for(int j = 0; j < LOOP_TIMES; j++) + for (int j = 0; j < LOOP_TIMES; j++) { random_roi(); - cv::bitwise_or(mat1_roi, mat2_roi, dst_roi); - cv::ocl::bitwise_or(gmat1, gmat2, gdst); + cv::bitwise_or(src1_roi, src2_roi, dst1_roi); + cv::ocl::bitwise_or(gsrc1, gsrc2, gdst1); Near(0); } } TEST_P(Bitwise_or, Mat_Mask) { - for(int j = 0; j < LOOP_TIMES; j++) + for (int j = 0; j < LOOP_TIMES; j++) { random_roi(); - cv::bitwise_or(mat1_roi, mat2_roi, dst_roi, mask_roi); - cv::ocl::bitwise_or(gmat1, gmat2, gdst, gmask); + cv::bitwise_or(src1_roi, src2_roi, dst1_roi, mask_roi); + cv::ocl::bitwise_or(gsrc1, gsrc2, gdst1, gmask); Near(0); } } TEST_P(Bitwise_or, Scalar) { - for(int j = 0; j < LOOP_TIMES; j++) + for (int j = 0; j < LOOP_TIMES; j++) { random_roi(); - cv::bitwise_or(mat1_roi, val, dst_roi); - cv::ocl::bitwise_or(gmat1, val, gdst); + cv::bitwise_or(src1_roi, val, dst1_roi); + cv::ocl::bitwise_or(gsrc1, val, gdst1); Near(1e-5); } } TEST_P(Bitwise_or, Scalar_Mask) { - for(int j = 0; j < LOOP_TIMES; j++) + for (int j = 0; j < LOOP_TIMES; j++) { random_roi(); - cv::bitwise_or(mat1_roi, val, dst_roi, mask_roi); - cv::ocl::bitwise_or(gmat1, val, gdst, gmask); + cv::bitwise_or(src1_roi, val, dst1_roi, mask_roi); + cv::ocl::bitwise_or(gsrc1, val, gdst1, gmask); Near(1e-5); } } +//////////////////////////////// Bitwise_xor ///////////////////////////////////////////////// - -////////////////////////////////bitwise_xor///////////////////////////////////////////////// - -struct Bitwise_xor : ArithmTestBase {}; +typedef ArithmTestBase Bitwise_xor; TEST_P(Bitwise_xor, Mat) { - for(int j = 0; j < 
LOOP_TIMES; j++) + for (int j = 0; j < LOOP_TIMES; j++) { random_roi(); - cv::bitwise_xor(mat1_roi, mat2_roi, dst_roi); - cv::ocl::bitwise_xor(gmat1, gmat2, gdst); + cv::bitwise_xor(src1_roi, src2_roi, dst1_roi); + cv::ocl::bitwise_xor(gsrc1, gsrc2, gdst1); Near(0); } } TEST_P(Bitwise_xor, Mat_Mask) { - for(int j = 0; j < LOOP_TIMES; j++) + for (int j = 0; j < LOOP_TIMES; j++) { random_roi(); - cv::bitwise_xor(mat1_roi, mat2_roi, dst_roi, mask_roi); - cv::ocl::bitwise_xor(gmat1, gmat2, gdst, gmask); + cv::bitwise_xor(src1_roi, src2_roi, dst1_roi, mask_roi); + cv::ocl::bitwise_xor(gsrc1, gsrc2, gdst1, gmask); Near(0); } } TEST_P(Bitwise_xor, Scalar) { - for(int j = 0; j < LOOP_TIMES; j++) + for (int j = 0; j < LOOP_TIMES; j++) { random_roi(); - cv::bitwise_xor(mat1_roi, val, dst_roi); - cv::ocl::bitwise_xor(gmat1, val, gdst); + cv::bitwise_xor(src1_roi, val, dst1_roi); + cv::ocl::bitwise_xor(gsrc1, val, gdst1); Near(1e-5); } } TEST_P(Bitwise_xor, Scalar_Mask) { - for(int j = 0; j < LOOP_TIMES; j++) + for (int j = 0; j < LOOP_TIMES; j++) { random_roi(); - cv::bitwise_xor(mat1_roi, val, dst_roi, mask_roi); - cv::ocl::bitwise_xor(gmat1, val, gdst, gmask); + cv::bitwise_xor(src1_roi, val, dst1_roi, mask_roi); + cv::ocl::bitwise_xor(gsrc1, val, gdst1, gmask); Near(1e-5); } } +//////////////////////////////// Bitwise_not ///////////////////////////////////////////////// -////////////////////////////////bitwise_not///////////////////////////////////////////////// - -struct Bitwise_not : ArithmTestBase {}; +typedef ArithmTestBase Bitwise_not; TEST_P(Bitwise_not, Mat) { - for(int j = 0; j < LOOP_TIMES; j++) + for (int j = 0; j < LOOP_TIMES; j++) { random_roi(); - cv::bitwise_not(mat1_roi, dst_roi); - cv::ocl::bitwise_not(gmat1, gdst); + cv::bitwise_not(src1_roi, dst1_roi); + cv::ocl::bitwise_not(gsrc1, gdst1); Near(0); } } +//////////////////////////////// Compare ///////////////////////////////////////////////// -////////////////////////////////compare///////////////////////////////////////////////// -struct Compare : ArithmTestBase {}; +typedef ArithmTestBase Compare; TEST_P(Compare, Mat) { - if(mat1.type() == CV_8SC1) - //if(mat1.type() != CV_8UC1 || mat1.type()!= CV_16UC1 || mat1.type()!= CV_16SC1|| mat1.type()!= CV_32SC1 || mat1.type()!= CV_32FC1|| mat1.type()!= CV_64FC1) - { - cout << "\tUnsupported type\t\n"; - } - - int cmp_codes[] = {CMP_EQ, CMP_GT, CMP_GE, CMP_LT, CMP_LE, CMP_NE}; - //const char *cmp_str[] = {"CMP_EQ", "CMP_GT", "CMP_GE", "CMP_LT", "CMP_LE", "CMP_NE"}; + int cmp_codes[] = { CMP_EQ, CMP_GT, CMP_GE, CMP_LT, CMP_LE, CMP_NE }; int cmp_num = sizeof(cmp_codes) / sizeof(int); for (int i = 0; i < cmp_num; ++i) - { - - for(int j = 0; j < LOOP_TIMES; j++) + for (int j = 0; j < LOOP_TIMES; j++) { random_roi(); - cv::compare(mat1_roi, mat2_roi, dst_roi, cmp_codes[i]); - cv::ocl::compare(gmat1, gmat2, gdst, cmp_codes[i]); + cv::compare(src1_roi, src2_roi, dst1_roi, cmp_codes[i]); + cv::ocl::compare(gsrc1, gsrc2, gdst1, cmp_codes[i]); + Near(0); } - } - } +//////////////////////////////// Pow ///////////////////////////////////////////////// -struct Pow : ArithmTestBase {}; +typedef ArithmTestBase Pow; TEST_P(Pow, Mat) { - if(mat1.depth() != CV_32F && mat1.depth() != CV_64F) - { - cout << "\tUnsupported type\t\n"; - } - - for(int j = 0; j < LOOP_TIMES; j++) + for (int j = 0; j < LOOP_TIMES; j++) { random_roi(); double p = 4.5; - cv::pow(mat1_roi, p, dst_roi); - cv::ocl::pow(gmat1, p, gdst); + cv::pow(src1_roi, p, dst1_roi); + cv::ocl::pow(gsrc1, p, gdst1); Near(1); } } 
+//////////////////////////////// AddWeighted ///////////////////////////////////////////////// -struct AddWeighted : ArithmTestBase {}; +typedef ArithmTestBase AddWeighted; TEST_P(AddWeighted, Mat) { - for(int j = 0; j < LOOP_TIMES; j++) + for (int j = 0; j < LOOP_TIMES; j++) { random_roi(); - double alpha = 2.0, beta = 1.0, gama = 3.0; + const double alpha = 2.0, beta = 1.0, gama = 3.0; - cv::addWeighted(mat1_roi, alpha, mat2_roi, beta, gama, dst_roi); - - // cv::ocl::oclMat clmat1(mat1),clmat2(mat2),cldst; - - cv::ocl::addWeighted(gmat1, alpha, gmat2, beta, gama, gdst); + cv::addWeighted(src1_roi, alpha, src2_roi, beta, gama, dst1_roi); + cv::ocl::addWeighted(gsrc1, alpha, gsrc2, beta, gama, gdst1); Near(1e-5); } } +//////////////////////////////////////// Instantiation ///////////////////////////////////////// - - -//********test**************** - -INSTANTIATE_TEST_CASE_P(Arithm, Lut, Combine( - Values(CV_8UC1, CV_8UC3, CV_8UC4), - Values(false))); // Values(false) is the reserved parameter - -INSTANTIATE_TEST_CASE_P(Arithm, Exp, Combine( - Values(CV_32FC1, CV_32FC1), - Values(false))); // Values(false) is the reserved parameter - -INSTANTIATE_TEST_CASE_P(Arithm, Log, Combine( - Values(CV_32FC1, CV_32FC1), - Values(false))); // Values(false) is the reserved parameter - -INSTANTIATE_TEST_CASE_P(Arithm, Add, Combine( - Values(CV_8UC1, CV_8UC3, CV_8UC4, CV_32SC1, CV_32SC3, CV_32SC4, CV_32FC1, CV_32FC3, CV_32FC4), - Values(false))); - -INSTANTIATE_TEST_CASE_P(Arithm, Sub, Combine( - Values(CV_8UC1, CV_8UC3, CV_8UC4, CV_32SC1, CV_32SC3, CV_32SC4, CV_32FC1, CV_32FC3, CV_32FC4), - Values(false))); - -INSTANTIATE_TEST_CASE_P(Arithm, Mul, Combine( - Values(CV_8UC1, CV_8UC3, CV_8UC4, CV_32SC1, CV_32SC3, CV_32SC4, CV_32FC1, CV_32FC3, CV_32FC4), - Values(false))); // Values(false) is the reserved parameter - -INSTANTIATE_TEST_CASE_P(Arithm, Div, Combine( - Values(CV_8UC1, CV_8UC3, CV_8UC4, CV_32SC1, CV_32SC3, CV_32SC4, CV_32FC1, CV_32FC3, CV_32FC4), - Values(false))); // Values(false) is the reserved parameter - - -INSTANTIATE_TEST_CASE_P(Arithm, Absdiff, Combine( - Values(CV_8UC1, CV_8UC3, CV_8UC4, CV_32SC1, CV_32SC3, CV_32SC4, CV_32FC1, CV_32FC3, CV_32FC4), - Values(false))); // Values(false) is the reserved parameter - -INSTANTIATE_TEST_CASE_P(Arithm, CartToPolar, Combine( - Values(CV_32FC1, CV_32FC3, CV_32FC4), - Values(false))); // Values(false) is the reserved parameter - -INSTANTIATE_TEST_CASE_P(Arithm, PolarToCart, Combine( - Values(CV_32FC1, CV_32FC3, CV_32FC4), - Values(false))); // Values(false) is the reserved parameter - -INSTANTIATE_TEST_CASE_P(Arithm, Magnitude, Combine( - Values(CV_32FC1, CV_32FC3, CV_32FC4), - Values(false))); // Values(false) is the reserved parameter - -INSTANTIATE_TEST_CASE_P(Arithm, Transpose, Combine( - Values(CV_8UC1, CV_8UC3, CV_8UC4, CV_32SC1, CV_32FC1), - Values(false))); // Values(false) is the reserved parameter - -INSTANTIATE_TEST_CASE_P(Arithm, Flip, Combine( - Values(CV_8UC1, CV_8UC3, CV_8UC4, CV_32SC1, CV_32SC3, CV_32SC4, CV_32FC1, CV_32FC3, CV_32FC4), - Values(false))); // Values(false) is the reserved parameter - -INSTANTIATE_TEST_CASE_P(Arithm, MinMax, Combine( - Values(CV_8UC1, CV_32SC1, CV_32FC1), - Values(false))); - -INSTANTIATE_TEST_CASE_P(Arithm, MinMaxLoc, Combine( - Values(CV_8UC1, CV_32SC1, CV_32FC1), - Values(false))); - -INSTANTIATE_TEST_CASE_P(Arithm, Sum, Combine( - Values(CV_8U, CV_32S, CV_32F), - Values(false))); - -INSTANTIATE_TEST_CASE_P(Arithm, CountNonZero, Combine( - Values(CV_8U, CV_32S, CV_32F), - Values(false))); - - 
-INSTANTIATE_TEST_CASE_P(Arithm, Phase, Combine(Values(CV_32FC1, CV_32FC3, CV_32FC4), Values(false))); -// Values(false) is the reserved parameter - - -INSTANTIATE_TEST_CASE_P(Arithm, Bitwise_and, Combine( - Values(CV_8UC1, CV_8UC3, CV_8UC4, CV_32SC1, CV_32SC3, CV_32SC4, CV_32FC1, CV_32FC3, CV_32FC4), Values(false))); -//Values(false) is the reserved parameter - -INSTANTIATE_TEST_CASE_P(Arithm, Bitwise_or, Combine( - Values(CV_8UC1, CV_8UC3, CV_8UC4, CV_32SC1, CV_32FC1, CV_32FC3, CV_32FC4), Values(false))); -//Values(false) is the reserved parameter - -INSTANTIATE_TEST_CASE_P(Arithm, Bitwise_xor, Combine( - Values(CV_8UC1, CV_8UC3, CV_8UC4, CV_32SC1, CV_32FC1, CV_32FC3, CV_32FC4), Values(false))); -//Values(false) is the reserved parameter - -INSTANTIATE_TEST_CASE_P(Arithm, Bitwise_not, Combine( - Values(CV_8UC1, CV_8UC3, CV_8UC4, CV_32SC1, CV_32FC1, CV_32FC3, CV_32FC4), Values(false))); -//Values(false) is the reserved parameter - -INSTANTIATE_TEST_CASE_P(Arithm, Compare, Combine(Values(CV_8UC1, CV_32SC1, CV_32FC1), Values(false))); -// Values(false) is the reserved parameter - -INSTANTIATE_TEST_CASE_P(Arithm, Pow, Combine(Values(CV_32FC1, CV_32FC3, CV_32FC4), Values(false))); -// Values(false) is the reserved parameter - -INSTANTIATE_TEST_CASE_P(Arithm, AddWeighted, Combine( - Values(CV_8UC1, CV_32SC1, CV_32FC1), - Values(false))); // Values(false) is the reserved parameter - - +INSTANTIATE_TEST_CASE_P(Arithm, Lut, Combine(testing::Range(CV_8U, CV_USRTYPE1), testing::Range(1, 5), Bool(), Bool())); // + +INSTANTIATE_TEST_CASE_P(Arithm, Exp, Combine(testing::Values(CV_32F, CV_64F), testing::Range(1, 5), Bool())); // + +INSTANTIATE_TEST_CASE_P(Arithm, Log, Combine(testing::Values(CV_32F, CV_64F), testing::Range(1, 5), Bool())); // + +INSTANTIATE_TEST_CASE_P(Arithm, Add, Combine(testing::Range(CV_8U, CV_USRTYPE1), testing::Range(1, 5), Bool())); // + +INSTANTIATE_TEST_CASE_P(Arithm, Sub, Combine(testing::Range(CV_8U, CV_USRTYPE1), testing::Range(1, 5), Bool())); // + +INSTANTIATE_TEST_CASE_P(Arithm, Mul, Combine(testing::Range(CV_8U, CV_USRTYPE1), testing::Range(1, 5), Bool())); +INSTANTIATE_TEST_CASE_P(Arithm, Div, Combine(testing::Range(CV_8U, CV_USRTYPE1), testing::Range(1, 5), Bool())); +INSTANTIATE_TEST_CASE_P(Arithm, Absdiff, Combine(testing::Range(CV_8U, CV_USRTYPE1), testing::Range(1, 5), Bool())); // + +INSTANTIATE_TEST_CASE_P(Arithm, CartToPolar, Combine(Values(CV_32F, CV_64F), testing::Range(1, 5), Bool())); // + +INSTANTIATE_TEST_CASE_P(Arithm, PolarToCart, Combine(Values(CV_32F, CV_64F), testing::Range(1, 5), Bool())); // + +INSTANTIATE_TEST_CASE_P(Arithm, Magnitude, Combine(Values(CV_32F, CV_64F), testing::Range(1, 5), Bool())); // + +INSTANTIATE_TEST_CASE_P(Arithm, Transpose, Combine(testing::Range(CV_8U, CV_USRTYPE1), testing::Range(1, 5), Bool())); // + +INSTANTIATE_TEST_CASE_P(Arithm, Flip, Combine(testing::Range(CV_8U, CV_USRTYPE1), testing::Range(1, 5), Bool())); // + +INSTANTIATE_TEST_CASE_P(Arithm, MinMax, Combine(testing::Range(CV_8U, CV_USRTYPE1), Values(1), Bool())); +INSTANTIATE_TEST_CASE_P(Arithm, MinMaxLoc, Combine(testing::Range(CV_8U, CV_USRTYPE1), Values(1), Bool())); // + +INSTANTIATE_TEST_CASE_P(Arithm, Sum, Combine(testing::Range(CV_8U, CV_USRTYPE1), testing::Range(1, 5), Bool())); +INSTANTIATE_TEST_CASE_P(Arithm, CountNonZero, Combine(testing::Range(CV_8U, CV_USRTYPE1), Values(1), Bool())); // + +INSTANTIATE_TEST_CASE_P(Arithm, Phase, Combine(Values(CV_32F, CV_64F), testing::Range(1, 5), Bool())); // + +INSTANTIATE_TEST_CASE_P(Arithm, Bitwise_and, 
Combine(testing::Range(CV_8U, CV_USRTYPE1), testing::Range(1, 5), Bool())); // + +INSTANTIATE_TEST_CASE_P(Arithm, Bitwise_or, Combine(testing::Range(CV_8U, CV_USRTYPE1), testing::Range(1, 5), Bool())); // + +INSTANTIATE_TEST_CASE_P(Arithm, Bitwise_xor, Combine(testing::Range(CV_8U, CV_USRTYPE1), testing::Range(1, 5), Bool())); // + +INSTANTIATE_TEST_CASE_P(Arithm, Bitwise_not, Combine(testing::Range(CV_8U, CV_USRTYPE1), testing::Range(1, 5), Bool())); // + +INSTANTIATE_TEST_CASE_P(Arithm, Compare, Combine(testing::Range(CV_8U, CV_USRTYPE1), Values(1), Bool())); // + +INSTANTIATE_TEST_CASE_P(Arithm, Pow, Combine(Values(CV_32F, CV_64F), testing::Range(1, 5), Bool())); // + +INSTANTIATE_TEST_CASE_P(Arithm, AddWeighted, Combine(testing::Range(CV_8U, CV_USRTYPE1), testing::Range(1, 5), Bool())); // + #endif // HAVE_OPENCL From c87d2d414d253385d77652171b13e6ad2bcae3cb Mon Sep 17 00:00:00 2001 From: Ilya Lavrenov Date: Tue, 24 Sep 2013 14:18:42 +0400 Subject: [PATCH 13/17] removed arithm_absdiff.cl --- modules/ocl/src/opencl/arithm_absdiff.cl | 970 ----------------------- 1 file changed, 970 deletions(-) delete mode 100644 modules/ocl/src/opencl/arithm_absdiff.cl diff --git a/modules/ocl/src/opencl/arithm_absdiff.cl b/modules/ocl/src/opencl/arithm_absdiff.cl deleted file mode 100644 index 341a0048f..000000000 --- a/modules/ocl/src/opencl/arithm_absdiff.cl +++ /dev/null @@ -1,970 +0,0 @@ -/*M/////////////////////////////////////////////////////////////////////////////////////// -// -// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. -// -// By downloading, copying, installing or using the software you agree to this license. -// If you do not agree to this license, do not download, install, -// copy or use the software. -// -// -// License Agreement -// For Open Source Computer Vision Library -// -// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved. -// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved. -// Third party copyrights are property of their respective owners. -// -// @Authors -// Jia Haipeng, jiahaipeng95@gmail.com -// -// Redistribution and use in source and binary forms, with or without modification, -// are permitted provided that the following conditions are met: -// -// * Redistribution's of source code must retain the above copyright notice, -// this list of conditions and the following disclaimer. -// -// * Redistribution's in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other oclMaterials provided with the distribution. -// -// * The name of the copyright holders may not be used to endorse or promote products -// derived from this software without specific prior written permission. -// -// This software is provided by the copyright holders and contributors as is and -// any express or implied warranties, including, but not limited to, the implied -// warranties of merchantability and fitness for a particular purpose are disclaimed. 
-// In no event shall the Intel Corporation or contributors be liable for any direct, -// indirect, incidental, special, exemplary, or consequential damages -// (including, but not limited to, procurement of substitute goods or services; -// loss of use, data, or profits; or business interruption) however caused -// and on any theory of liability, whether in contract, strict liability, -// or tort (including negligence or otherwise) arising in any way out of -// the use of this software, even if advised of the possibility of such damage. -// -//M*/ - -#if defined (DOUBLE_SUPPORT) -#ifdef cl_khr_fp64 -#pragma OPENCL EXTENSION cl_khr_fp64:enable -#elif defined (cl_amd_fp64) -#pragma OPENCL EXTENSION cl_amd_fp64:enable -#endif -#endif - -////////////////////////////////////////////////////////////////////////////////////////////////////// -/////////////////////////////////////////////absdiff//////////////////////////////////////////////////// -/////////////////////////////////////////////////////////////////////////////////////////////////////// -/**************************************adddiff *************************************/ -__kernel void arithm_absdiff_D0 (__global uchar *src1, int src1_step, int src1_offset, - __global uchar *src2, int src2_step, int src2_offset, - __global uchar *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 2; - -#ifdef dst_align -#undef dst_align -#endif - #define dst_align (dst_offset & 3) - int src1_index = mad24(y, src1_step, x + src1_offset - dst_align); - int src2_index = mad24(y, src2_step, x + src2_offset - dst_align); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc); - int src1_index_fix = src1_index < 0 ? 0 : src1_index; - int src2_index_fix = src2_index < 0 ? 0 : src2_index; - uchar4 src1_data = vload4(0, src1 + src1_index_fix); - uchar4 src2_data = vload4(0, src2 + src2_index_fix); - if(src1_index < 0) - { - uchar4 tmp; - tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx; - src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw; - } - if(src2_index < 0) - { - uchar4 tmp; - tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx; - src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw; - } - - uchar4 dst_data = *((__global uchar4 *)(dst + dst_index)); - uchar4 tmp_data = abs_diff(src1_data, src2_data); - - dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x; - dst_data.y = ((dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : dst_data.y; - dst_data.z = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : dst_data.z; - dst_data.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? 
tmp_data.w : dst_data.w; - - *((__global uchar4 *)(dst + dst_index)) = dst_data; - } -} -__kernel void arithm_absdiff_D2 (__global ushort *src1, int src1_step, int src1_offset, - __global ushort *src2, int src2_step, int src2_offset, - __global ushort *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) - -{ - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 2; - -#ifdef dst_align -#undef dst_align -#endif -#define dst_align ((dst_offset >> 1) & 3) - int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); - int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1)); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffff8); - - ushort4 src1_data = vload4(0, (__global ushort *)((__global char *)src1 + src1_index)); - ushort4 src2_data = vload4(0, (__global ushort *)((__global char *)src2 + src2_index)); - - ushort4 dst_data = *((__global ushort4 *)((__global char *)dst + dst_index)); - ushort4 tmp_data = abs_diff(src1_data, src2_data); - - dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x; - dst_data.y = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.y : dst_data.y; - dst_data.z = ((dst_index + 4 >= dst_start) && (dst_index + 4 < dst_end)) ? tmp_data.z : dst_data.z; - dst_data.w = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) ? tmp_data.w : dst_data.w; - - *((__global ushort4 *)((__global char *)dst + dst_index)) = dst_data; - } -} -__kernel void arithm_absdiff_D3 (__global short *src1, int src1_step, int src1_offset, - __global short *src2, int src2_step, int src2_offset, - __global short *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 2; - -#ifdef dst_align -#undef dst_align -#endif -#define dst_align ((dst_offset >> 1) & 3) - int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); - int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1)); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffff8); - - short4 src1_data = vload4(0, (__global short *)((__global char *)src1 + src1_index)); - short4 src2_data = vload4(0, (__global short *)((__global char *)src2 + src2_index)); - - short4 dst_data = *((__global short4 *)((__global char *)dst + dst_index)); - ushort4 tmp = abs_diff(src1_data, src2_data); - short4 tmp_data = convert_short4_sat(tmp); - - dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x; - dst_data.y = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.y : dst_data.y; - dst_data.z = ((dst_index + 4 >= dst_start) && (dst_index + 4 < dst_end)) ? tmp_data.z : dst_data.z; - dst_data.w = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) ? 
tmp_data.w : dst_data.w; - - *((__global short4 *)((__global char *)dst + dst_index)) = dst_data; - } -} - -__kernel void arithm_absdiff_D4 (__global int *src1, int src1_step, int src1_offset, - __global int *src2, int src2_step, int src2_offset, - __global int *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 2) + src1_offset); - int src2_index = mad24(y, src2_step, (x << 2) + src2_offset); - int dst_index = mad24(y, dst_step, (x << 2) + dst_offset); - - int data1 = *((__global int *)((__global char *)src1 + src1_index)); - int data2 = *((__global int *)((__global char *)src2 + src2_index)); - uint tmp = abs_diff(data1, data2); - int tmp_data = convert_int_sat(tmp); - - *((__global int *)((__global char *)dst + dst_index)) = tmp_data; - } -} -__kernel void arithm_absdiff_D5 (__global float *src1, int src1_step, int src1_offset, - __global float *src2, int src2_step, int src2_offset, - __global float *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 2) + src1_offset); - int src2_index = mad24(y, src2_step, (x << 2) + src2_offset); - int dst_index = mad24(y, dst_step, (x << 2) + dst_offset); - - float data1 = *((__global float *)((__global char *)src1 + src1_index)); - float data2 = *((__global float *)((__global char *)src2 + src2_index)); - float tmp = fabs(data1 - data2); - - *((__global float *)((__global char *)dst + dst_index)) = tmp; - } -} - -#if defined (DOUBLE_SUPPORT) -__kernel void arithm_absdiff_D6 (__global double *src1, int src1_step, int src1_offset, - __global double *src2, int src2_step, int src2_offset, - __global double *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 3) + src1_offset); - int src2_index = mad24(y, src2_step, (x << 3) + src2_offset); - int dst_index = mad24(y, dst_step, (x << 3) + dst_offset); - - double data1 = *((__global double *)((__global char *)src1 + src1_index)); - double data2 = *((__global double *)((__global char *)src2 + src2_index)); - double tmp = fabs(data1-data2); - - *((__global double *)((__global char *)dst + dst_index)) = tmp; - } -} -#endif - -/**************************************absdiff with scalar**************************************/ -__kernel void arithm_s_absdiff_C1_D0 (__global uchar *src1, int src1_step, int src1_offset, - __global uchar *dst, int dst_step, int dst_offset, - int4 src2, int rows, int cols, int dst_step1) -{ - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 2; - -#ifdef dst_align -#undef dst_align -#endif -#define dst_align (dst_offset & 3) - int src1_index = mad24(y, src1_step, x + src1_offset - dst_align); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc); - int src1_index_fix = src1_index < 0 ? 0 : src1_index; - uchar4 src1_data = vload4(0, src1 + src1_index_fix); - int4 src2_data = (int4)(src2.x, src2.x, src2.x, src2.x); - if(src1_index < 0) - { - uchar4 tmp; - tmp.xyzw = (src1_index == -2) ? 
src1_data.zwxy:src1_data.yzwx; - src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw; - } - - uchar4 data = *((__global uchar4 *)(dst + dst_index)); - uchar4 tmp_data = convert_uchar4_sat(abs_diff(convert_int4_sat(src1_data), src2_data)); - - data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : data.x; - data.y = ((dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : data.y; - data.z = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : data.z; - data.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? tmp_data.w : data.w; - - *((__global uchar4 *)(dst + dst_index)) = data; - } -} -__kernel void arithm_s_absdiff_C1_D2 (__global ushort *src1, int src1_step, int src1_offset, - __global ushort *dst, int dst_step, int dst_offset, - int4 src2, int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 1; - -#ifdef dst_align -#undef dst_align -#endif -#define dst_align ((dst_offset >> 1) & 1) - int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffffc); - - ushort2 src1_data = vload2(0, (__global ushort *)((__global char *)src1 + src1_index)); - int2 src2_data = (int2)(src2.x, src2.x); - - ushort2 data = *((__global ushort2 *)((__global uchar *)dst + dst_index)); - ushort2 tmp_data = convert_ushort2_sat(abs_diff(convert_int2_sat(src1_data), src2_data)); - - data.x = (dst_index + 0 >= dst_start) ? tmp_data.x : data.x; - data.y = (dst_index + 2 < dst_end ) ? tmp_data.y : data.y; - - *((__global ushort2 *)((__global uchar *)dst + dst_index)) = data; - } -} -__kernel void arithm_s_absdiff_C1_D3 (__global short *src1, int src1_step, int src1_offset, - __global short *dst, int dst_step, int dst_offset, - int4 src2, int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 1; - -#ifdef dst_align -#undef dst_align -#endif -#define dst_align ((dst_offset >> 1) & 1) - int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffffc); - - short2 src1_data = vload2(0, (__global short *)((__global char *)src1 + src1_index)); - int2 src2_data = (int2)(src2.x, src2.x); - short2 data = *((__global short2 *)((__global uchar *)dst + dst_index)); - - ushort2 tmp = convert_ushort2_sat(abs_diff(convert_int2_sat(src1_data), src2_data)); - short2 tmp_data = convert_short2_sat(tmp); - - data.x = (dst_index + 0 >= dst_start) ? tmp_data.x : data.x; - data.y = (dst_index + 2 < dst_end ) ? 
tmp_data.y : data.y; - - *((__global short2 *)((__global uchar *)dst + dst_index)) = data; - } -} -__kernel void arithm_s_absdiff_C1_D4 (__global int *src1, int src1_step, int src1_offset, - __global int *dst, int dst_step, int dst_offset, - int4 src2, int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 2) + src1_offset); - int dst_index = mad24(y, dst_step, (x << 2) + dst_offset); - - int src_data1 = *((__global int *)((__global char *)src1 + src1_index)); - int src_data2 = src2.x; - int dst_data = *((__global int *)((__global char *)dst + dst_index)); - - uint tmp_data = abs_diff(src_data1, src_data2); - int data = convert_int_sat(tmp_data); - - *((__global int *)((__global char *)dst + dst_index)) = data; - } -} -__kernel void arithm_s_absdiff_C1_D5 (__global float *src1, int src1_step, int src1_offset, - __global float *dst, int dst_step, int dst_offset, - float4 src2, int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 2) + src1_offset); - int dst_index = mad24(y, dst_step, (x << 2) + dst_offset); - - float src_data1 = *((__global float *)((__global char *)src1 + src1_index)); - float src_data2 = src2.x; - float dst_data = *((__global float *)((__global char *)dst + dst_index)); - - float data = fabs(src_data1 - src_data2); - - *((__global float *)((__global char *)dst + dst_index)) = data; - } -} - -#if defined (DOUBLE_SUPPORT) -__kernel void arithm_s_absdiff_C1_D6 (__global double *src1, int src1_step, int src1_offset, - __global double *dst, int dst_step, int dst_offset, - double4 src2, int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 3) + src1_offset); - int dst_index = mad24(y, dst_step, (x << 3) + dst_offset); - - double src_data1 = *((__global double *)((__global char *)src1 + src1_index)); - double src2_data = src2.x; - double dst_data = *((__global double *)((__global char *)dst + dst_index)); - - double data = fabs(src_data1 - src2_data); - - *((__global double *)((__global char *)dst + dst_index)) = data; - } -} -#endif - -__kernel void arithm_s_absdiff_C2_D0 (__global uchar *src1, int src1_step, int src1_offset, - __global uchar *dst, int dst_step, int dst_offset, - int4 src2, int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 1; - -#ifdef dst_align -#undef dst_align -#endif -#define dst_align ((dst_offset >> 1) & 1) - int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffffc); - - uchar4 src1_data = vload4(0, src1 + src1_index); - int4 src2_data = (int4)(src2.x, src2.y, src2.x, src2.y); - - uchar4 data = *((__global uchar4 *)(dst + dst_index)); - uchar4 tmp_data = convert_uchar4_sat(abs_diff(convert_int4_sat(src1_data), src2_data)); - - data.xy = (dst_index + 0 >= dst_start) ? tmp_data.xy : data.xy; - data.zw = (dst_index + 2 < dst_end ) ? 
tmp_data.zw : data.zw; - - *((__global uchar4 *)(dst + dst_index)) = data; - } -} -__kernel void arithm_s_absdiff_C2_D2 (__global ushort *src1, int src1_step, int src1_offset, - __global ushort *dst, int dst_step, int dst_offset, - int4 src2, int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 2) + src1_offset); - int dst_index = mad24(y, dst_step, (x << 2) + dst_offset); - - ushort2 src_data1 = *((__global ushort2 *)((__global char *)src1 + src1_index)); - int2 src_data2 = (int2)(src2.x, src2.y); - ushort2 dst_data = *((__global ushort2 *)((__global char *)dst + dst_index)); - - ushort2 data = convert_ushort2_sat( abs_diff(convert_int2_sat(src_data1), src_data2)); - - *((__global ushort2 *)((__global char *)dst + dst_index)) = data; - } -} -__kernel void arithm_s_absdiff_C2_D3 (__global short *src1, int src1_step, int src1_offset, - __global short *dst, int dst_step, int dst_offset, - int4 src2, int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 2) + src1_offset); - int dst_index = mad24(y, dst_step, (x << 2) + dst_offset); - - short2 src_data1 = *((__global short2 *)((__global char *)src1 + src1_index)); - int2 src_data2 = (int2)(src2.x, src2.y); - short2 dst_data = *((__global short2 *)((__global char *)dst + dst_index)); - - ushort2 tmp = convert_ushort2_sat(abs_diff(convert_int2_sat(src_data1), src_data2)); - short2 data = convert_short2_sat(tmp); - - *((__global short2 *)((__global char *)dst + dst_index)) = data; - } -} -__kernel void arithm_s_absdiff_C2_D4 (__global int *src1, int src1_step, int src1_offset, - __global int *dst, int dst_step, int dst_offset, - int4 src2, int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 3) + src1_offset); - int dst_index = mad24(y, dst_step, (x << 3) + dst_offset); - - int2 src_data1 = *((__global int2 *)((__global char *)src1 + src1_index)); - int2 src_data2 = (int2)(src2.x, src2.y); - int2 dst_data = *((__global int2 *)((__global char *)dst + dst_index)); - - int2 data = convert_int2_sat(abs_diff(src_data1, src_data2)); - *((__global int2 *)((__global char *)dst + dst_index)) = data; - } -} -__kernel void arithm_s_absdiff_C2_D5 (__global float *src1, int src1_step, int src1_offset, - __global float *dst, int dst_step, int dst_offset, - float4 src2, int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 3) + src1_offset); - int dst_index = mad24(y, dst_step, (x << 3) + dst_offset); - - float2 src_data1 = *((__global float2 *)((__global char *)src1 + src1_index)); - float2 src_data2 = (float2)(src2.x, src2.y); - float2 dst_data = *((__global float2 *)((__global char *)dst + dst_index)); - - float2 data = fabs(src_data1 - src_data2); - *((__global float2 *)((__global char *)dst + dst_index)) = data; - } -} -#if defined (DOUBLE_SUPPORT) -__kernel void arithm_s_absdiff_C2_D6 (__global double *src1, int src1_step, int src1_offset, - __global double *dst, int dst_step, int dst_offset, - double4 src2, int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x 
<< 4) + src1_offset); - int dst_index = mad24(y, dst_step, (x << 4) + dst_offset); - - double2 src_data1 = *((__global double2 *)((__global char *)src1 + src1_index)); - double2 src_data2 = (double2)(src2.x, src2.y); - double2 dst_data = *((__global double2 *)((__global char *)dst + dst_index)); - - double2 data = fabs(src_data1 - src_data2); - - *((__global double2 *)((__global char *)dst + dst_index)) = data; - } -} -#endif -__kernel void arithm_s_absdiff_C3_D0 (__global uchar *src1, int src1_step, int src1_offset, - __global uchar *dst, int dst_step, int dst_offset, - int4 src2, int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 2; - -#ifdef dst_align -#undef dst_align -#endif -#define dst_align (((dst_offset % dst_step) / 3 ) & 3) - int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3)); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + (x * 3) - (dst_align * 3)); - - uchar4 src1_data_0 = vload4(0, src1 + src1_index + 0); - uchar4 src1_data_1 = vload4(0, src1 + src1_index + 4); - uchar4 src1_data_2 = vload4(0, src1 + src1_index + 8); - - int4 src2_data_0 = (int4)(src2.x, src2.y, src2.z, src2.x); - int4 src2_data_1 = (int4)(src2.y, src2.z, src2.x, src2.y); - int4 src2_data_2 = (int4)(src2.z, src2.x, src2.y, src2.z); - - uchar4 data_0 = *((__global uchar4 *)(dst + dst_index + 0)); - uchar4 data_1 = *((__global uchar4 *)(dst + dst_index + 4)); - uchar4 data_2 = *((__global uchar4 *)(dst + dst_index + 8)); - - uchar4 tmp_data_0 = convert_uchar4_sat(abs_diff(convert_int4_sat(src1_data_0), src2_data_0)); - uchar4 tmp_data_1 = convert_uchar4_sat(abs_diff(convert_int4_sat(src1_data_1), src2_data_1)); - uchar4 tmp_data_2 = convert_uchar4_sat(abs_diff(convert_int4_sat(src1_data_2), src2_data_2)); - - data_0.xyz = ((dst_index + 0 >= dst_start)) ? tmp_data_0.xyz : data_0.xyz; - data_0.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) - ? tmp_data_0.w : data_0.w; - - data_1.xy = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) - ? tmp_data_1.xy : data_1.xy; - data_1.zw = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? tmp_data_1.zw : data_1.zw; - - data_2.x = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? tmp_data_2.x : data_2.x; - data_2.yzw = ((dst_index + 9 >= dst_start) && (dst_index + 9 < dst_end)) - ? 
tmp_data_2.yzw : data_2.yzw; - - *((__global uchar4 *)(dst + dst_index + 0)) = data_0; - *((__global uchar4 *)(dst + dst_index + 4)) = data_1; - *((__global uchar4 *)(dst + dst_index + 8)) = data_2; - } -} -__kernel void arithm_s_absdiff_C3_D2 (__global ushort *src1, int src1_step, int src1_offset, - __global ushort *dst, int dst_step, int dst_offset, - int4 src2, int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 1; - -#ifdef dst_align -#undef dst_align -#endif -#define dst_align (((dst_offset % dst_step) / 6 ) & 1) - int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6)); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + (x * 6) - (dst_align * 6)); - - ushort2 src1_data_0 = vload2(0, (__global ushort *)((__global char *)src1 + src1_index + 0)); - ushort2 src1_data_1 = vload2(0, (__global ushort *)((__global char *)src1 + src1_index + 4)); - ushort2 src1_data_2 = vload2(0, (__global ushort *)((__global char *)src1 + src1_index + 8)); - - int2 src2_data_0 = (int2)(src2.x, src2.y); - int2 src2_data_1 = (int2)(src2.z, src2.x); - int2 src2_data_2 = (int2)(src2.y, src2.z); - - ushort2 data_0 = *((__global ushort2 *)((__global char *)dst + dst_index + 0)); - ushort2 data_1 = *((__global ushort2 *)((__global char *)dst + dst_index + 4)); - ushort2 data_2 = *((__global ushort2 *)((__global char *)dst + dst_index + 8)); - - ushort2 tmp_data_0 = convert_ushort2_sat(abs_diff(convert_int2_sat(src1_data_0), src2_data_0)); - ushort2 tmp_data_1 = convert_ushort2_sat(abs_diff(convert_int2_sat(src1_data_1), src2_data_1)); - ushort2 tmp_data_2 = convert_ushort2_sat(abs_diff(convert_int2_sat(src1_data_2), src2_data_2)); - - data_0.xy = ((dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy; - - data_1.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) - ? tmp_data_1.x : data_1.x; - data_1.y = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? tmp_data_1.y : data_1.y; - - data_2.xy = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? 
tmp_data_2.xy : data_2.xy; - - *((__global ushort2 *)((__global char *)dst + dst_index + 0))= data_0; - *((__global ushort2 *)((__global char *)dst + dst_index + 4))= data_1; - *((__global ushort2 *)((__global char *)dst + dst_index + 8))= data_2; - } -} -__kernel void arithm_s_absdiff_C3_D3 (__global short *src1, int src1_step, int src1_offset, - __global short *dst, int dst_step, int dst_offset, - int4 src2, int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 1; - -#ifdef dst_align -#undef dst_align -#endif -#define dst_align (((dst_offset % dst_step) / 6 ) & 1) - int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6)); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + (x * 6) - (dst_align * 6)); - - short2 src1_data_0 = vload2(0, (__global short *)((__global char *)src1 + src1_index + 0)); - short2 src1_data_1 = vload2(0, (__global short *)((__global char *)src1 + src1_index + 4)); - short2 src1_data_2 = vload2(0, (__global short *)((__global char *)src1 + src1_index + 8)); - - int2 src2_data_0 = (int2)(src2.x, src2.y); - int2 src2_data_1 = (int2)(src2.z, src2.x); - int2 src2_data_2 = (int2)(src2.y, src2.z); - - short2 data_0 = *((__global short2 *)((__global char *)dst + dst_index + 0)); - short2 data_1 = *((__global short2 *)((__global char *)dst + dst_index + 4)); - short2 data_2 = *((__global short2 *)((__global char *)dst + dst_index + 8)); - - short2 tmp_data_0 = convert_short2_sat(abs_diff(convert_int2_sat(src1_data_0), src2_data_0)); - short2 tmp_data_1 = convert_short2_sat(abs_diff(convert_int2_sat(src1_data_1), src2_data_1)); - short2 tmp_data_2 = convert_short2_sat(abs_diff(convert_int2_sat(src1_data_2), src2_data_2)); - - data_0.xy = ((dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy; - - data_1.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) - ? tmp_data_1.x : data_1.x; - data_1.y = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? tmp_data_1.y : data_1.y; - - data_2.xy = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? 
tmp_data_2.xy : data_2.xy; - - *((__global short2 *)((__global char *)dst + dst_index + 0))= data_0; - *((__global short2 *)((__global char *)dst + dst_index + 4))= data_1; - *((__global short2 *)((__global char *)dst + dst_index + 8))= data_2; - } -} -__kernel void arithm_s_absdiff_C3_D4 (__global int *src1, int src1_step, int src1_offset, - __global int *dst, int dst_step, int dst_offset, - int4 src2, int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x * 12) + src1_offset); - int dst_index = mad24(y, dst_step, dst_offset + (x * 12)); - - int src1_data_0 = *((__global int *)((__global char *)src1 + src1_index + 0)); - int src1_data_1 = *((__global int *)((__global char *)src1 + src1_index + 4)); - int src1_data_2 = *((__global int *)((__global char *)src1 + src1_index + 8)); - - int src2_data_0 = src2.x; - int src2_data_1 = src2.y; - int src2_data_2 = src2.z; - - int data_0 = *((__global int *)((__global char *)dst + dst_index + 0)); - int data_1 = *((__global int *)((__global char *)dst + dst_index + 4)); - int data_2 = *((__global int *)((__global char *)dst + dst_index + 8)); - - int tmp_data_0 = convert_int_sat(abs_diff(src1_data_0, src2_data_0)); - int tmp_data_1 = convert_int_sat(abs_diff(src1_data_1, src2_data_1)); - int tmp_data_2 = convert_int_sat(abs_diff(src1_data_2, src2_data_2)); - - *((__global int *)((__global char *)dst + dst_index + 0))= tmp_data_0; - *((__global int *)((__global char *)dst + dst_index + 4))= tmp_data_1; - *((__global int *)((__global char *)dst + dst_index + 8))= tmp_data_2; - } -} -__kernel void arithm_s_absdiff_C3_D5 (__global float *src1, int src1_step, int src1_offset, - __global float *dst, int dst_step, int dst_offset, - float4 src2, int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x * 12) + src1_offset); - int dst_index = mad24(y, dst_step, dst_offset + (x * 12)); - - float src1_data_0 = *((__global float *)((__global char *)src1 + src1_index + 0)); - float src1_data_1 = *((__global float *)((__global char *)src1 + src1_index + 4)); - float src1_data_2 = *((__global float *)((__global char *)src1 + src1_index + 8)); - - float src2_data_0 = src2.x; - float src2_data_1 = src2.y; - float src2_data_2 = src2.z; - - float data_0 = *((__global float *)((__global char *)dst + dst_index + 0)); - float data_1 = *((__global float *)((__global char *)dst + dst_index + 4)); - float data_2 = *((__global float *)((__global char *)dst + dst_index + 8)); - - float tmp_data_0 = fabs(src1_data_0 - src2_data_0); - float tmp_data_1 = fabs(src1_data_1 - src2_data_1); - float tmp_data_2 = fabs(src1_data_2 - src2_data_2); - - *((__global float *)((__global char *)dst + dst_index + 0))= tmp_data_0; - *((__global float *)((__global char *)dst + dst_index + 4))= tmp_data_1; - *((__global float *)((__global char *)dst + dst_index + 8))= tmp_data_2; - } -} - -#if defined (DOUBLE_SUPPORT) -__kernel void arithm_s_absdiff_C3_D6 (__global double *src1, int src1_step, int src1_offset, - __global double *dst, int dst_step, int dst_offset, - double4 src2, int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x * 24) + src1_offset); - int dst_index = mad24(y, dst_step, dst_offset + (x * 24)); - - double src1_data_0 = *((__global 
double *)((__global char *)src1 + src1_index + 0 )); - double src1_data_1 = *((__global double *)((__global char *)src1 + src1_index + 8 )); - double src1_data_2 = *((__global double *)((__global char *)src1 + src1_index + 16)); - - double src2_data_0 = src2.x; - double src2_data_1 = src2.y; - double src2_data_2 = src2.z; - - double data_0 = *((__global double *)((__global char *)dst + dst_index + 0 )); - double data_1 = *((__global double *)((__global char *)dst + dst_index + 8 )); - double data_2 = *((__global double *)((__global char *)dst + dst_index + 16)); - - double tmp_data_0 = fabs(src1_data_0 - src2_data_0); - double tmp_data_1 = fabs(src1_data_1 - src2_data_1); - double tmp_data_2 = fabs(src1_data_2 - src2_data_2); - - *((__global double *)((__global char *)dst + dst_index + 0 ))= tmp_data_0; - *((__global double *)((__global char *)dst + dst_index + 8 ))= tmp_data_1; - *((__global double *)((__global char *)dst + dst_index + 16))= tmp_data_2; - } -} -#endif -__kernel void arithm_s_absdiff_C4_D0 (__global uchar *src1, int src1_step, int src1_offset, - __global uchar *dst, int dst_step, int dst_offset, - int4 src2, int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 2) + src1_offset); - int dst_index = mad24(y, dst_step, (x << 2) + dst_offset); - - uchar4 src_data1 = *((__global uchar4 *)(src1 + src1_index)); - - uchar4 data = convert_uchar4_sat(abs_diff(convert_int4_sat(src_data1), src2)); - - *((__global uchar4 *)(dst + dst_index)) = data; - } -} -__kernel void arithm_s_absdiff_C4_D2 (__global ushort *src1, int src1_step, int src1_offset, - __global ushort *dst, int dst_step, int dst_offset, - int4 src2, int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 3) + src1_offset); - int dst_index = mad24(y, dst_step, (x << 3) + dst_offset); - - ushort4 src_data1 = *((__global ushort4 *)((__global char *)src1 + src1_index)); - - ushort4 data = convert_ushort4_sat(abs_diff(convert_int4_sat(src_data1), src2)); - - *((__global ushort4 *)((__global char *)dst + dst_index)) = data; - } -} -__kernel void arithm_s_absdiff_C4_D3 (__global short *src1, int src1_step, int src1_offset, - __global short *dst, int dst_step, int dst_offset, - int4 src2, int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 3) + src1_offset); - int dst_index = mad24(y, dst_step, (x << 3) + dst_offset); - - short4 src_data1 = *((__global short4 *)((__global char *)src1 + src1_index)); - - short4 data = convert_short4_sat(abs_diff(convert_int4_sat(src_data1), src2)); - - *((__global short4 *)((__global char *)dst + dst_index)) = data; - } -} -__kernel void arithm_s_absdiff_C4_D4 (__global int *src1, int src1_step, int src1_offset, - __global int *dst, int dst_step, int dst_offset, - int4 src2, int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 4) + src1_offset); - int dst_index = mad24(y, dst_step, (x << 4) + dst_offset); - - int4 src_data1 = *((__global int4 *)((__global char *)src1 + src1_index)); - - int4 data = convert_int4_sat(abs_diff(src_data1, src2)); - - *((__global int4 *)((__global char *)dst + dst_index)) = data; - } -} 
-__kernel void arithm_s_absdiff_C4_D5 (__global float *src1, int src1_step, int src1_offset, - __global float *dst, int dst_step, int dst_offset, - float4 src2, int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 4) + src1_offset); - int dst_index = mad24(y, dst_step, (x << 4) + dst_offset); - - float4 src_data1 = *((__global float4 *)((__global char *)src1 + src1_index)); - - float4 data = fabs(src_data1 - src2); - - *((__global float4 *)((__global char *)dst + dst_index)) = data; - } -} - -#if defined (DOUBLE_SUPPORT) -__kernel void arithm_s_absdiff_C4_D6 (__global double *src1, int src1_step, int src1_offset, - __global double *dst, int dst_step, int dst_offset, - double4 src2, int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 5) + src1_offset); - int dst_index = mad24(y, dst_step, (x << 5) + dst_offset); - - double4 src_data1 = *((__global double4 *)((__global char *)src1 + src1_index)); - - double4 data = fabs(src_data1 - src2); - - *((__global double4 *)((__global char *)dst + dst_index)) = data; - } -} -#endif From 544c02407e29cbe3da5cd68073d359395f1448b1 Mon Sep 17 00:00:00 2001 From: Ilya Lavrenov Date: Tue, 24 Sep 2013 22:54:13 +0400 Subject: [PATCH 14/17] reverted and generalized original ocl::transpose --- modules/ocl/src/arithm.cpp | 12 ++-- modules/ocl/src/opencl/arithm_minMax.cl | 5 ++ modules/ocl/src/opencl/arithm_transpose.cl | 67 ++++++++++++++++++++-- modules/ocl/test/utility.hpp | 2 +- 4 files changed, 72 insertions(+), 14 deletions(-) diff --git a/modules/ocl/src/arithm.cpp b/modules/ocl/src/arithm.cpp index 035cea781..c1147cb41 100644 --- a/modules/ocl/src/arithm.cpp +++ b/modules/ocl/src/arithm.cpp @@ -64,7 +64,6 @@ namespace cv { //////////////////////////////// OpenCL kernel strings ///////////////////// - extern const char *transpose_kernel; extern const char *arithm_nonzero; extern const char *arithm_sum; extern const char *arithm_sum_3; @@ -1265,9 +1264,8 @@ int cv::ocl::countNonZero(const oclMat &src) CV_Error(CV_GpuNotSupported, "select device don't support double"); } CV_Assert(groupnum != 0); - groupnum = groupnum * 2; +// groupnum = groupnum * 2; int vlen = 8 , dbsize = groupnum * vlen; - //cl_ulong start, end; Context *clCxt = src.clCxt; string kernelName = "arithm_op_nonzero"; int *p = new int[dbsize], nonzero = 0; @@ -1529,7 +1527,7 @@ oclMatExpr::operator oclMat() const #define TILE_DIM (32) #define BLOCK_ROWS (256/TILE_DIM) -static void transpose_run(const oclMat &src, oclMat &dst, string kernelName) +static void transpose_run(const oclMat &src, oclMat &dst, string kernelName, bool inplace = false) { Context *clCxt = src.clCxt; if (!clCxt->supportsFeature(Context::CL_DOUBLE) && src.depth() == CV_64F) @@ -1544,7 +1542,7 @@ static void transpose_run(const oclMat &src, oclMat &dst, string kernelName) channelsString[src.channels()]); size_t localThreads[3] = { TILE_DIM, BLOCK_ROWS, 1 }; - size_t globalThreads[3] = { src.cols, src.rows, 1 }; + size_t globalThreads[3] = { src.cols, inplace ? 
src.rows : divUp(src.rows, TILE_DIM) * BLOCK_ROWS, 1 }; int srcstep1 = src.step / src.elemSize(), dststep1 = dst.step / dst.elemSize(); int srcoffset1 = src.offset / src.elemSize(), dstoffset1 = dst.offset / dst.elemSize(); @@ -1568,8 +1566,8 @@ void cv::ocl::transpose(const oclMat &src, oclMat &dst) CV_Assert(src.depth() <= CV_64F && src.channels() <= 4); if ( src.data == dst.data && src.cols == src.rows && dst.offset == src.offset - && dst.rows == dst.cols && src.cols == dst.cols) - transpose_run( src, dst, "transpose_inplace"); + && dst.size() == src.size()) + transpose_run( src, dst, "transpose_inplace", true); else { dst.create(src.cols, src.rows, src.type()); diff --git a/modules/ocl/src/opencl/arithm_minMax.cl b/modules/ocl/src/opencl/arithm_minMax.cl index 1dcb138eb..23b293306 100644 --- a/modules/ocl/src/opencl/arithm_minMax.cl +++ b/modules/ocl/src/opencl/arithm_minMax.cl @@ -44,9 +44,14 @@ //M*/ /**************************************PUBLICFUNC*************************************/ + #if defined (DOUBLE_SUPPORT) +#ifdef cl_amd_fp64 +#pragma OPENCL EXTENSION cl_amd_fp64:enable +#elif defined (cl_khr_fp64) #pragma OPENCL EXTENSION cl_khr_fp64:enable #endif +#endif #if defined (DEPTH_0) #define VEC_TYPE uchar8 diff --git a/modules/ocl/src/opencl/arithm_transpose.cl b/modules/ocl/src/opencl/arithm_transpose.cl index 57f7f1b9d..5328d1f1b 100644 --- a/modules/ocl/src/opencl/arithm_transpose.cl +++ b/modules/ocl/src/opencl/arithm_transpose.cl @@ -44,23 +44,78 @@ //M*/ #if defined (DOUBLE_SUPPORT) +#ifdef cl_amd_fp64 +#pragma OPENCL EXTENSION cl_amd_fp64:enable +#elif defined (cl_khr_fp64) #pragma OPENCL EXTENSION cl_khr_fp64:enable #endif +#endif + +#define TILE_DIM 32 +#define BLOCK_ROWS 8 +#define LDS_STEP TILE_DIM __kernel void transpose(__global const T* src, __global T* dst, int src_cols, int src_rows, int src_step, int dst_step, int src_offset, int dst_offset) { - int x = get_global_id(0); - int y = get_global_id(1); + int gp_x = get_group_id(0), gp_y = get_group_id(1); + int gs_x = get_num_groups(0), gs_y = get_num_groups(1); + + int groupId_x, groupId_y; + + if(src_rows == src_cols) + { + groupId_y = gp_x; + groupId_x = (gp_x + gp_y) % gs_x; + } + else + { + int bid = gp_x + gs_x * gp_y; + groupId_y = bid % gs_y; + groupId_x = ((bid / gs_y) + groupId_y) % gs_x; + } + + int lx = get_local_id(0); + int ly = get_local_id(1); + + int x = groupId_x * TILE_DIM + lx; + int y = groupId_y * TILE_DIM + ly; + + int x_index = groupId_y * TILE_DIM + lx; + int y_index = groupId_x * TILE_DIM + ly; + + __local T title[TILE_DIM * LDS_STEP]; if (x < src_cols && y < src_rows) { - int srcIdx = mad24(y, src_step, src_offset + x); - int dstIdx = mad24(x, dst_step, dst_offset + y); + int index_src = mad24(y, src_step, x); - dst[dstIdx] = src[srcIdx]; + for(int i = 0; i < TILE_DIM; i += BLOCK_ROWS) + { + if (y + i < src_rows) + { + title[(ly + i) * LDS_STEP + lx] = src[src_offset + index_src]; + index_src = mad24(BLOCK_ROWS, src_step, index_src); + } + } + } + + barrier(CLK_LOCAL_MEM_FENCE); + + if (x_index < src_rows && y_index < src_cols) + { + int index_dst = mad24(y_index, dst_step, x_index); + + for(int i = 0; i < TILE_DIM; i += BLOCK_ROWS) + { + if ((y_index + i) < src_cols) + { + dst[dst_offset + index_dst] = title[lx * LDS_STEP + ly + i]; + index_dst += dst_step * BLOCK_ROWS; + } + } } } @@ -72,7 +127,7 @@ __kernel void transpose_inplace(__global T* src, __global T* dst, int x = get_global_id(0); int y = get_global_id(1); - if (x < src_cols && y < src_rows && x < y) + if (y < src_rows && x < 
y) { int srcIdx = mad24(y, src_step, src_offset + x); int dstIdx = mad24(x, dst_step, dst_offset + y); diff --git a/modules/ocl/test/utility.hpp b/modules/ocl/test/utility.hpp index 48c8bbcd9..7c491916f 100644 --- a/modules/ocl/test/utility.hpp +++ b/modules/ocl/test/utility.hpp @@ -48,7 +48,7 @@ #define MHEIGHT 256 #define MIN_VALUE 171 -#define MAX_VALUE 351 +#define MAX_VALUE 357 //#define RANDOMROI int randomInt(int minVal, int maxVal); From 0faac595a849c1868ed70a75e641b2663151c8eb Mon Sep 17 00:00:00 2001 From: Ilya Lavrenov Date: Wed, 25 Sep 2013 15:02:47 +0400 Subject: [PATCH 15/17] fixed minMaxLoc kernel (removed compilation errors) --- modules/ocl/src/arithm.cpp | 126 ++++++------- modules/ocl/src/opencl/arithm_minMaxLoc.cl | 210 +++++---------------- 2 files changed, 104 insertions(+), 232 deletions(-) diff --git a/modules/ocl/src/arithm.cpp b/modules/ocl/src/arithm.cpp index c1147cb41..8d502ea56 100644 --- a/modules/ocl/src/arithm.cpp +++ b/modules/ocl/src/arithm.cpp @@ -341,7 +341,7 @@ static void arithmetic_sum_buffer_run(const oclMat &src, cl_mem &dst, int vlen , args.push_back( make_pair( sizeof(cl_mem) , (void *)&src.data)); args.push_back( make_pair( sizeof(cl_mem) , (void *)&dst )); size_t gt[3] = {groupnum * 256, 1, 1}, lt[3] = {256, 1, 1}; - if(src.oclchannels() != 3) + if (src.oclchannels() != 3) openCLExecuteKernel(src.clCxt, &arithm_sum, "arithm_op_sum", gt, lt, args, -1, -1, build_options); else openCLExecuteKernel(src.clCxt, &arithm_sum_3, "arithm_op_sum_3", gt, lt, args, -1, -1, build_options); @@ -365,9 +365,9 @@ Scalar arithmetic_sum(const oclMat &src, int type = 0) memset(p, 0, dbsize * sizeof(T)); openCLReadBuffer(clCxt, dstBuffer, (void *)p, dbsize * sizeof(T)); - for(int i = 0; i < dbsize;) + for (int i = 0; i < dbsize;) { - for(int j = 0; j < src.oclchannels(); j++, i++) + for (int j = 0; j < src.oclchannels(); j++, i++) s.val[j] += p[i]; } delete[] p; @@ -378,9 +378,9 @@ Scalar arithmetic_sum(const oclMat &src, int type = 0) typedef Scalar (*sumFunc)(const oclMat &src, int type); Scalar cv::ocl::sum(const oclMat &src) { - if(!src.clCxt->supportsFeature(Context::CL_DOUBLE) && src.depth() == CV_64F) + if (!src.clCxt->supportsFeature(Context::CL_DOUBLE) && src.depth() == CV_64F) { - CV_Error(CV_GpuNotSupported, "select device don't support double"); + CV_Error(CV_GpuNotSupported, "Selected device doesn't support double"); } static sumFunc functab[2] = { @@ -395,9 +395,9 @@ Scalar cv::ocl::sum(const oclMat &src) Scalar cv::ocl::absSum(const oclMat &src) { - if(!src.clCxt->supportsFeature(Context::CL_DOUBLE) && src.depth() == CV_64F) + if (!src.clCxt->supportsFeature(Context::CL_DOUBLE) && src.depth() == CV_64F) { - CV_Error(CV_GpuNotSupported, "select device don't support double"); + CV_Error(CV_GpuNotSupported, "Selected device doesn't support double"); } static sumFunc functab[2] = { @@ -412,9 +412,9 @@ Scalar cv::ocl::absSum(const oclMat &src) Scalar cv::ocl::sqrSum(const oclMat &src) { - if(!src.clCxt->supportsFeature(Context::CL_DOUBLE) && src.depth() == CV_64F) + if (!src.clCxt->supportsFeature(Context::CL_DOUBLE) && src.depth() == CV_64F) { - CV_Error(CV_GpuNotSupported, "select device don't support double"); + CV_Error(CV_GpuNotSupported, "Selected device doesn't support double"); } static sumFunc functab[2] = { @@ -446,7 +446,7 @@ void cv::ocl::meanStdDev(const oclMat &src, Scalar &mean, Scalar &stddev) m1 = (Mat)dst1; m2 = (Mat)dst2; int i = 0, *p = (int *)m1.data, *q = (int *)m2.data; - for(; i < channels; i++) + for (; i < channels; i++) { 
mean.val[i] = (double)p[i] / (src.cols * src.rows); stddev.val[i] = std::sqrt(std::max((double) q[i] / (src.cols * src.rows) - mean.val[i] * mean.val[i] , 0.)); @@ -476,7 +476,7 @@ static void arithmetic_minMax_run(const oclMat &src, const oclMat &mask, cl_mem args.push_back( make_pair( sizeof(cl_int) , (void *)&elemnum)); args.push_back( make_pair( sizeof(cl_int) , (void *)&groupnum)); args.push_back( make_pair( sizeof(cl_mem) , (void *)&src.data)); - if(!mask.empty()) + if (!mask.empty()) { int mall_cols = mask.step / (vlen * mask.elemSize1()); int mpre_cols = (mask.offset % mask.step) / (vlen * mask.elemSize1()); @@ -499,7 +499,7 @@ static void arithmetic_minMax_mask_run(const oclMat &src, const oclMat &mask, cl vector > args; size_t gt[3] = {groupnum * 256, 1, 1}, lt[3] = {256, 1, 1}; char build_options[50]; - if(src.oclchannels() == 1) + if (src.oclchannels() == 1) { int cols = (src.cols - 1) / vlen + 1; int invalid_cols = src.step / (vlen * src.elemSize1()) - cols; @@ -519,8 +519,6 @@ static void arithmetic_minMax_mask_run(const oclMat &src, const oclMat &mask, cl args.push_back( make_pair( sizeof(cl_int) , (void *)&moffset )); args.push_back( make_pair( sizeof(cl_mem) , (void *)&mask.data )); args.push_back( make_pair( sizeof(cl_mem) , (void *)&dst )); - // printf("elemnum:%d,cols:%d,invalid_cols:%d,offset:%d,minvalid_cols:%d,moffset:%d,repeat_e:%d\r\n", - // elemnum,cols,invalid_cols,offset,minvalid_cols,moffset,repeat_me); openCLExecuteKernel(src.clCxt, &arithm_minMax_mask, kernelName, gt, lt, args, -1, -1, build_options); } } @@ -549,18 +547,18 @@ template void arithmetic_minMax(const oclMat &src, double *minVal, Mat matbuf = Mat(buf); T *p = matbuf.ptr(); - if(minVal != NULL) + if (minVal != NULL) { *minVal = std::numeric_limits::max(); - for(int i = 0; i < vlen * (int)groupnum; i++) + for (int i = 0; i < vlen * (int)groupnum; i++) { *minVal = *minVal < p[i] ? *minVal : p[i]; } } - if(maxVal != NULL) + if (maxVal != NULL) { *maxVal = -std::numeric_limits::max(); - for(int i = vlen * (int)groupnum; i < 2 * vlen * (int)groupnum; i++) + for (int i = vlen * (int)groupnum; i < 2 * vlen * (int)groupnum; i++) { *maxVal = *maxVal > p[i] ? 
*maxVal : p[i]; } @@ -577,9 +575,9 @@ void cv::ocl::minMax(const oclMat &src, double *minVal, double *maxVal, const oc void cv::ocl::minMax_buf(const oclMat &src, double *minVal, double *maxVal, const oclMat &mask, oclMat &buf) { CV_Assert(src.oclchannels() == 1); - if(!src.clCxt->supportsFeature(Context::CL_DOUBLE) && src.depth() == CV_64F) + if (!src.clCxt->supportsFeature(Context::CL_DOUBLE) && src.depth() == CV_64F) { - CV_Error(CV_GpuNotSupported, "select device don't support double"); + CV_Error(CV_GpuNotSupported, "Selected device doesn't support double"); } static minMaxFunc functab[8] = { @@ -625,7 +623,7 @@ double cv::ocl::norm(const oclMat &src1, const oclMat &src2, int normType) m = (gm2); p = (int *)m.data; r = -std::numeric_limits::max(); - for(i = 0; i < channels; i++) + for (i = 0; i < channels; i++) { r = std::max(r, (double)p[i]); } @@ -635,7 +633,7 @@ double cv::ocl::norm(const oclMat &src1, const oclMat &src2, int normType) //arithmetic_sum_run(gm1, gm2,"arithm_op_sum"); m = (gm2); p = (int *)m.data; - for(i = 0; i < channels; i++) + for (i = 0; i < channels; i++) { r = r + (double)p[i]; } @@ -645,14 +643,14 @@ double cv::ocl::norm(const oclMat &src1, const oclMat &src2, int normType) //arithmetic_sum_run(gm1, gm2,"arithm_op_squares_sum"); m = (gm2); p = (int *)m.data; - for(i = 0; i < channels; i++) + for (i = 0; i < channels; i++) { r = r + (double)p[i]; } r = std::sqrt(r); break; } - if(isRelative) + if (isRelative) r = r / norm(src2, normType); return r; } @@ -663,9 +661,9 @@ double cv::ocl::norm(const oclMat &src1, const oclMat &src2, int normType) static void arithmetic_flip_rows_run(const oclMat &src, oclMat &dst, string kernelName) { - if(!src.clCxt->supportsFeature(Context::CL_DOUBLE) && src.type() == CV_64F) + if (!src.clCxt->supportsFeature(Context::CL_DOUBLE) && src.type() == CV_64F) { - CV_Error(CV_GpuNotSupported, "Selected device don't support double\r\n"); + CV_Error(CV_GpuNotSupported, "Selected device doesn't support double\r\n"); return; } @@ -710,9 +708,9 @@ static void arithmetic_flip_rows_run(const oclMat &src, oclMat &dst, string kern static void arithmetic_flip_cols_run(const oclMat &src, oclMat &dst, string kernelName, bool isVertical) { - if(!src.clCxt->supportsFeature(Context::CL_DOUBLE) && src.type() == CV_64F) + if (!src.clCxt->supportsFeature(Context::CL_DOUBLE) && src.type() == CV_64F) { - CV_Error(CV_GpuNotSupported, "Selected device don't support double\r\n"); + CV_Error(CV_GpuNotSupported, "Selected device doesn't support double\r\n"); return; } @@ -749,7 +747,7 @@ static void arithmetic_flip_cols_run(const oclMat &src, oclMat &dst, string kern args.push_back( make_pair( sizeof(cl_int), (void *)&dst.rows )); args.push_back( make_pair( sizeof(cl_int), (void *)&dst.cols )); - if(isVertical) + if (isVertical) args.push_back( make_pair( sizeof(cl_int), (void *)&rows )); else args.push_back( make_pair( sizeof(cl_int), (void *)&cols )); @@ -764,11 +762,11 @@ static void arithmetic_flip_cols_run(const oclMat &src, oclMat &dst, string kern void cv::ocl::flip(const oclMat &src, oclMat &dst, int flipCode) { dst.create(src.size(), src.type()); - if(flipCode == 0) + if (flipCode == 0) { arithmetic_flip_rows_run(src, dst, "arithm_flip_rows"); } - else if(flipCode > 0) + else if (flipCode > 0) arithmetic_flip_cols_run(src, dst, "arithm_flip_cols", false); else arithmetic_flip_cols_run(src, dst, "arithm_flip_rc", true); @@ -877,9 +875,9 @@ void cv::ocl::log(const oclMat &src, oclMat &dst) static void arithmetic_magnitude_phase_run(const oclMat &src1, 
const oclMat &src2, oclMat &dst, string kernelName) { - if(!src1.clCxt->supportsFeature(Context::CL_DOUBLE) && src1.type() == CV_64F) + if (!src1.clCxt->supportsFeature(Context::CL_DOUBLE) && src1.type() == CV_64F) { - CV_Error(CV_GpuNotSupported, "Selected device don't support double\r\n"); + CV_Error(CV_GpuNotSupported, "Selected device doesn't support double\r\n"); return; } @@ -921,9 +919,9 @@ void cv::ocl::magnitude(const oclMat &src1, const oclMat &src2, oclMat &dst) static void arithmetic_phase_run(const oclMat &src1, const oclMat &src2, oclMat &dst, string kernelName, const char **kernelString) { - if(!src1.clCxt->supportsFeature(Context::CL_DOUBLE) && src1.type() == CV_64F) + if (!src1.clCxt->supportsFeature(Context::CL_DOUBLE) && src1.type() == CV_64F) { - CV_Error(CV_GpuNotSupported, "Selected device don't support double\r\n"); + CV_Error(CV_GpuNotSupported, "Selected device doesn't support double\r\n"); return; } @@ -964,7 +962,7 @@ void cv::ocl::phase(const oclMat &x, const oclMat &y, oclMat &Angle , bool angle CV_Assert(x.type() == y.type() && x.size() == y.size() && (x.depth() == CV_32F || x.depth() == CV_64F)); Angle.create(x.size(), x.type()); string kernelName = angleInDegrees ? "arithm_phase_indegrees" : "arithm_phase_inradians"; - if(angleInDegrees) + if (angleInDegrees) arithmetic_phase_run(x, y, Angle, kernelName, &arithm_phase); else arithmetic_phase_run(x, y, Angle, kernelName, &arithm_phase); @@ -977,9 +975,9 @@ void cv::ocl::phase(const oclMat &x, const oclMat &y, oclMat &Angle , bool angle static void arithmetic_cartToPolar_run(const oclMat &src1, const oclMat &src2, oclMat &dst_mag, oclMat &dst_cart, string kernelName, bool angleInDegrees) { - if(!src1.clCxt->supportsFeature(Context::CL_DOUBLE) && src1.type() == CV_64F) + if (!src1.clCxt->supportsFeature(Context::CL_DOUBLE) && src1.type() == CV_64F) { - CV_Error(CV_GpuNotSupported, "Selected device don't support double\r\n"); + CV_Error(CV_GpuNotSupported, "Selected device doesn't support double\r\n"); return; } @@ -1030,9 +1028,9 @@ void cv::ocl::cartToPolar(const oclMat &x, const oclMat &y, oclMat &mag, oclMat static void arithmetic_ptc_run(const oclMat &src1, const oclMat &src2, oclMat &dst1, oclMat &dst2, bool angleInDegrees, string kernelName) { - if(!src1.clCxt->supportsFeature(Context::CL_DOUBLE) && src1.type() == CV_64F) + if (!src1.clCxt->supportsFeature(Context::CL_DOUBLE) && src1.type() == CV_64F) { - CV_Error(CV_GpuNotSupported, "Selected device don't support double\r\n"); + CV_Error(CV_GpuNotSupported, "Selected device doesn't support double\r\n"); return; } @@ -1048,7 +1046,7 @@ static void arithmetic_ptc_run(const oclMat &src1, const oclMat &src2, oclMat &d int tmp = angleInDegrees ? 
1 : 0; vector > args; - if(src1.data) + if (src1.data) { args.push_back( make_pair( sizeof(cl_mem), (void *)&src1.data )); args.push_back( make_pair( sizeof(cl_int), (void *)&src1.step )); @@ -1077,7 +1075,7 @@ void cv::ocl::polarToCart(const oclMat &magnitude, const oclMat &angle, oclMat & x.create(angle.size(), angle.type()); y.create(angle.size(), angle.type()); - if( magnitude.data ) + if ( magnitude.data ) { CV_Assert( magnitude.size() == angle.size() && magnitude.type() == angle.type() ); arithmetic_ptc_run(magnitude, angle, x, y, angleInDegrees, "arithm_polarToCart_mag"); @@ -1119,7 +1117,7 @@ static void arithmetic_minMaxLoc_mask_run(const oclMat &src, const oclMat &mask, vector > args; size_t gt[3] = {groupnum * 256, 1, 1}, lt[3] = {256, 1, 1}; char build_options[50]; - if(src.oclchannels() == 1) + if (src.oclchannels() == 1) { int cols = (src.cols - 1) / vlen + 1; int invalid_cols = src.step / (vlen * src.elemSize1()) - cols; @@ -1143,7 +1141,8 @@ static void arithmetic_minMaxLoc_mask_run(const oclMat &src, const oclMat &mask, openCLExecuteKernel(src.clCxt, &arithm_minMaxLoc_mask, "arithm_op_minMaxLoc_mask", gt, lt, args, -1, -1, build_options); } } -template + +template void arithmetic_minMaxLoc(const oclMat &src, double *minVal, double *maxVal, Point *minLoc, Point *maxLoc, const oclMat &mask) { @@ -1164,12 +1163,12 @@ void arithmetic_minMaxLoc(const oclMat &src, double *minVal, double *maxVal, T *p = new T[groupnum * vlen * 4]; memset(p, 0, dbsize); openCLReadBuffer(clCxt, dstBuffer, (void *)p, dbsize); - for(int i = 0; i < vlen * (int)groupnum; i++) + for (int i = 0; i < vlen * (int)groupnum; i++) { *minVal = (*minVal < p[i] || p[i + 2 * vlen * groupnum] == -1) ? *minVal : p[i]; minloc = (*minVal < p[i] || p[i + 2 * vlen * groupnum] == -1) ? minloc : cvRound(p[i + 2 * vlen * groupnum]); } - for(int i = vlen * (int)groupnum; i < 2 * vlen * (int)groupnum; i++) + for (int i = vlen * (int)groupnum; i < 2 * vlen * (int)groupnum; i++) { *maxVal = (*maxVal > p[i] || p[i + 2 * vlen * groupnum] == -1) ? *maxVal : p[i]; maxloc = (*maxVal > p[i] || p[i + 2 * vlen * groupnum] == -1) ? 
maxloc : cvRound(p[i + 2 * vlen * groupnum]); @@ -1178,9 +1177,9 @@ void arithmetic_minMaxLoc(const oclMat &src, double *minVal, double *maxVal, int pre_rows = src.offset / src.step; int pre_cols = (src.offset % src.step) / src.elemSize1(); int wholecols = src.step / src.elemSize1(); - if( minLoc ) + if ( minLoc ) { - if( minloc >= 0 ) + if ( minloc >= 0 ) { minLoc->y = minloc / wholecols - pre_rows; minLoc->x = minloc % wholecols - pre_cols; @@ -1188,9 +1187,9 @@ void arithmetic_minMaxLoc(const oclMat &src, double *minVal, double *maxVal, else minLoc->x = minLoc->y = -1; } - if( maxLoc ) + if ( maxLoc ) { - if( maxloc >= 0 ) + if ( maxloc >= 0 ) { maxLoc->y = maxloc / wholecols - pre_rows; maxLoc->x = maxloc % wholecols - pre_cols; @@ -1209,9 +1208,9 @@ typedef void (*minMaxLocFunc)(const oclMat &src, double *minVal, double *maxVal, void cv::ocl::minMaxLoc(const oclMat &src, double *minVal, double *maxVal, Point *minLoc, Point *maxLoc, const oclMat &mask) { - if(!src.clCxt->supportsFeature(Context::CL_DOUBLE) && src.depth() == CV_64F) + if (!src.clCxt->supportsFeature(Context::CL_DOUBLE) && src.depth() == CV_64F) { - CV_Error(CV_GpuNotSupported, "select device don't support double"); + CV_Error(CV_GpuNotSupported, "Selected device doesn't support double"); return; } @@ -1259,12 +1258,11 @@ static void arithmetic_countNonZero_run(const oclMat &src, cl_mem &dst, int vlen int cv::ocl::countNonZero(const oclMat &src) { size_t groupnum = src.clCxt->computeUnits(); - if(!src.clCxt->supportsFeature(Context::CL_DOUBLE) && src.depth() == CV_64F) + if (!src.clCxt->supportsFeature(Context::CL_DOUBLE) && src.depth() == CV_64F) { - CV_Error(CV_GpuNotSupported, "select device don't support double"); + CV_Error(CV_GpuNotSupported, "selected device doesn't support double"); } CV_Assert(groupnum != 0); -// groupnum = groupnum * 2; int vlen = 8 , dbsize = groupnum * vlen; Context *clCxt = src.clCxt; string kernelName = "arithm_op_nonzero"; @@ -1274,7 +1272,7 @@ int cv::ocl::countNonZero(const oclMat &src) memset(p, 0, dbsize * sizeof(int)); openCLReadBuffer(clCxt, dstBuffer, (void *)p, dbsize * sizeof(int)); - for(int i = 0; i < dbsize; i++) + for (int i = 0; i < dbsize; i++) nonzero += p[i]; delete[] p; @@ -1677,7 +1675,7 @@ static void arithmetic_pow_run(const oclMat &src1, double p, oclMat &dst, string args.push_back( make_pair( sizeof(cl_int), (void *)&dst_step1 )); float pf = static_cast(p); - if(!src1.clCxt->supportsFeature(Context::CL_DOUBLE)) + if (!src1.clCxt->supportsFeature(Context::CL_DOUBLE)) args.push_back( make_pair( sizeof(cl_float), (void *)&pf )); else args.push_back( make_pair( sizeof(cl_double), (void *)&p )); @@ -1687,7 +1685,7 @@ static void arithmetic_pow_run(const oclMat &src1, double p, oclMat &dst, string void cv::ocl::pow(const oclMat &x, double p, oclMat &y) { - if(!x.clCxt->supportsFeature(Context::CL_DOUBLE) && x.type() == CV_64F) + if (!x.clCxt->supportsFeature(Context::CL_DOUBLE) && x.type() == CV_64F) { cout << "Selected device do not support double" << endl; return; @@ -1714,14 +1712,14 @@ void cv::ocl::setIdentity(oclMat& src, double scalar) size_t global_threads[] = {src.cols, src.rows, 1}; string kernelName = "setIdentityKernel"; - if(src.type() == CV_32FC1) + if (src.type() == CV_32FC1) kernelName += "_F1"; - else if(src.type() == CV_32SC1) + else if (src.type() == CV_32SC1) kernelName += "_I1"; else { kernelName += "_D1"; - if(!(clCxt->supportsFeature(Context::CL_DOUBLE))) + if (!(clCxt->supportsFeature(Context::CL_DOUBLE))) { oclMat temp; src.convertTo(temp, 
CV_32FC1); @@ -1738,9 +1736,9 @@ void cv::ocl::setIdentity(oclMat& src, double scalar) int scalar_i = 0; float scalar_f = 0.0f; - if(clCxt->supportsFeature(Context::CL_DOUBLE)) + if (clCxt->supportsFeature(Context::CL_DOUBLE)) { - if(src.type() == CV_32SC1) + if (src.type() == CV_32SC1) { scalar_i = (int)scalar; args.push_back(make_pair(sizeof(cl_int), (void*)&scalar_i)); @@ -1750,7 +1748,7 @@ void cv::ocl::setIdentity(oclMat& src, double scalar) } else { - if(src.type() == CV_32SC1) + if (src.type() == CV_32SC1) { scalar_i = (int)scalar; args.push_back(make_pair(sizeof(cl_int), (void*)&scalar_i)); diff --git a/modules/ocl/src/opencl/arithm_minMaxLoc.cl b/modules/ocl/src/opencl/arithm_minMaxLoc.cl index 94cc14d25..848aac319 100644 --- a/modules/ocl/src/opencl/arithm_minMaxLoc.cl +++ b/modules/ocl/src/opencl/arithm_minMaxLoc.cl @@ -142,29 +142,35 @@ #pragma OPENCL EXTENSION cl_khr_global_int32_extended_atomics:enable /**************************************Array minMax**************************************/ -__kernel void arithm_op_minMaxLoc (int cols,int invalid_cols,int offset,int elemnum,int groupnum, + +__kernel void arithm_op_minMaxLoc(int cols, int invalid_cols, int offset, int elemnum, int groupnum, __global VEC_TYPE *src, __global RES_TYPE *dst) { unsigned int lid = get_local_id(0); unsigned int gid = get_group_id(0); unsigned int id = get_global_id(0); unsigned int idx = offset + id + (id / cols) * invalid_cols; - __local VEC_TYPE localmem_max[128],localmem_min[128]; - VEC_TYPE minval,maxval,temp; - __local VEC_TYPE_LOC localmem_maxloc[128],localmem_minloc[128]; - VEC_TYPE_LOC minloc,maxloc,temploc,negative = -1; + + __local VEC_TYPE localmem_max[128], localmem_min[128]; + VEC_TYPE minval, maxval, temp; + + __local VEC_TYPE_LOC localmem_maxloc[128], localmem_minloc[128]; + VEC_TYPE_LOC minloc, maxloc, temploc, negative = -1; + int idx_c; - if(id < elemnum) + + if (id < elemnum) { temp = src[idx]; idx_c = idx << 2; - temploc = (VEC_TYPE_LOC)(idx_c,idx_c+1,idx_c+2,idx_c+3); - if(id % cols == 0 ) + temploc = (VEC_TYPE_LOC)(idx_c, idx_c + 1, idx_c + 2, idx_c + 3); + + if (id % cols == 0 ) { repeat_s(temp); repeat_s(temploc); } - if(id % cols == cols - 1) + if (id % cols == cols - 1) { repeat_e(temp); repeat_e(temploc); @@ -181,31 +187,33 @@ __kernel void arithm_op_minMaxLoc (int cols,int invalid_cols,int offset,int elem minloc = negative; maxloc = negative; } - float4 aaa; - for(id=id + (groupnum << 8); id < elemnum;id = id + (groupnum << 8)) + + int grainSize = (groupnum << 8); + for (id = id + grainSize; id < elemnum; id = id + grainSize) { idx = offset + id + (id / cols) * invalid_cols; temp = src[idx]; idx_c = idx << 2; - temploc = (VEC_TYPE_LOC)(idx_c,idx_c+1,idx_c+2,idx_c+3); - if(id % cols == 0 ) + temploc = (VEC_TYPE_LOC)(idx_c, idx_c+1, idx_c+2, idx_c+3); + + if (id % cols == 0 ) { repeat_s(temp); repeat_s(temploc); } - if(id % cols == cols - 1) + if (id % cols == cols - 1) { repeat_e(temp); repeat_e(temploc); } - minval = min(minval,temp); - maxval = max(maxval,temp); - minloc = CONDITION_FUNC(minval == temp, temploc , minloc); - maxloc = CONDITION_FUNC(maxval == temp, temploc , maxloc); - aaa= convert_float4(maxval == temp); - maxloc = convert_int4(aaa) ? 
temploc : maxloc; + + minval = min(minval, temp); + maxval = max(maxval, temp); + minloc = CONDITION_FUNC(minval == temp, temploc, minloc); + maxloc = CONDITION_FUNC(maxval == temp, temploc, maxloc); } - if(lid > 127) + + if (lid > 127) { localmem_min[lid - 128] = minval; localmem_max[lid - 128] = maxval; @@ -213,29 +221,30 @@ __kernel void arithm_op_minMaxLoc (int cols,int invalid_cols,int offset,int elem localmem_maxloc[lid - 128] = maxloc; } barrier(CLK_LOCAL_MEM_FENCE); - if(lid < 128) + + if (lid < 128) { localmem_min[lid] = min(minval,localmem_min[lid]); localmem_max[lid] = max(maxval,localmem_max[lid]); - localmem_minloc[lid] = CONDITION_FUNC(localmem_min[lid] == minval, minloc , localmem_minloc[lid]); - localmem_maxloc[lid] = CONDITION_FUNC(localmem_max[lid] == maxval, maxloc , localmem_maxloc[lid]); + localmem_minloc[lid] = CONDITION_FUNC(localmem_min[lid] == minval, minloc, localmem_minloc[lid]); + localmem_maxloc[lid] = CONDITION_FUNC(localmem_max[lid] == maxval, maxloc, localmem_maxloc[lid]); } barrier(CLK_LOCAL_MEM_FENCE); - for(int lsize = 64; lsize > 0; lsize >>= 1) + + for (int lsize = 64; lsize > 0; lsize >>= 1) { - if(lid < lsize) + if (lid < lsize) { int lid2 = lsize + lid; - localmem_min[lid] = min(localmem_min[lid] , localmem_min[lid2]); - localmem_max[lid] = max(localmem_max[lid] , localmem_max[lid2]); - localmem_minloc[lid] = - CONDITION_FUNC(localmem_min[lid] == localmem_min[lid2], localmem_minloc[lid2] , localmem_minloc[lid]); - localmem_maxloc[lid] = - CONDITION_FUNC(localmem_max[lid] == localmem_max[lid2], localmem_maxloc[lid2] , localmem_maxloc[lid]); + localmem_min[lid] = min(localmem_min[lid], localmem_min[lid2]); + localmem_max[lid] = max(localmem_max[lid], localmem_max[lid2]); + localmem_minloc[lid] = CONDITION_FUNC(localmem_min[lid] == localmem_min[lid2], localmem_minloc[lid2], localmem_minloc[lid]); + localmem_maxloc[lid] = CONDITION_FUNC(localmem_max[lid] == localmem_max[lid2], localmem_maxloc[lid2], localmem_maxloc[lid]); } barrier(CLK_LOCAL_MEM_FENCE); } - if( lid == 0) + + if ( lid == 0) { dst[gid] = CONVERT_RES_TYPE(localmem_min[0]); dst[gid + groupnum] = CONVERT_RES_TYPE(localmem_max[0]); @@ -243,138 +252,3 @@ __kernel void arithm_op_minMaxLoc (int cols,int invalid_cols,int offset,int elem dst[gid + 3 * groupnum] = CONVERT_RES_TYPE(localmem_maxloc[0]); } } - -#if defined (REPEAT_S0) -#define repeat_ms(a) a = a; -#endif -#if defined (REPEAT_S1) -#define repeat_ms(a) a.s0 = 0; -#endif -#if defined (REPEAT_S2) -#define repeat_ms(a) a.s0 = 0;a.s1 = 0; -#endif -#if defined (REPEAT_S3) -#define repeat_ms(a) a.s0 = 0;a.s1 = 0;a.s2 = 0; -#endif - -#if defined (REPEAT_E0) -#define repeat_me(a) a = a; -#endif -#if defined (REPEAT_E1) -#define repeat_me(a) a.s3 = 0; -#endif -#if defined (REPEAT_E2) -#define repeat_me(a) a.s3 = 0;a.s2 = 0; -#endif -#if defined (REPEAT_E3) -#define repeat_me(a) a.s3 = 0;a.s2 = 0;a.s1 = 0; -#endif - - -/**************************************Array minMaxLoc mask**************************************/ -/* -__kernel void arithm_op_minMaxLoc_mask (int cols,int invalid_cols,int offset,int elemnum,int groupnum,__global VEC_TYPE *src, - int minvalid_cols,int moffset,__global uchar4 *mask,__global RES_TYPE *dst) -{ - unsigned int lid = get_local_id(0); - unsigned int gid = get_group_id(0); - unsigned int id = get_global_id(0); - unsigned int idx = offset + id + (id / cols) * invalid_cols; - unsigned int midx = moffset + id + (id / cols) * minvalid_cols; - __local VEC_TYPE localmem_max[128],localmem_min[128]; - VEC_TYPE 
minval,maxval,temp,max_val = MAX_VAL,min_val = MIN_VAL,zero = 0,m_temp; - __local VEC_TYPE_LOC localmem_maxloc[128],localmem_minloc[128]; - VEC_TYPE_LOC minloc,maxloc,temploc,negative = -1; - if(id < elemnum) - { - temp = src[idx]; - m_temp = CONVERT_TYPE(mask[midx]); - int idx_c = idx << 2; - temploc = (VEC_TYPE_LOC)(idx_c,idx_c+1,idx_c+2,idx_c+3); - if(id % cols == 0 ) - { - repeat_ms(m_temp); - repeat_s(temploc); - } - if(id % cols == cols - 1) - { - repeat_me(m_temp); - repeat_e(temploc); - } - minval = m_temp > zero ? temp : max_val; - maxval = m_temp > zero ? temp : min_val; - minloc = CONDITION_FUNC(m_temp > zero, temploc , negative); - maxloc = minloc; - } - else - { - minval = MAX_VAL; - maxval = MIN_VAL; - minloc = negative; - maxloc = negative; - } - for(id=id + (groupnum << 8); id < elemnum;id = id + (groupnum << 8)) - { - idx = offset + id + (id / cols) * invalid_cols; - midx = moffset + id + (id / cols) * minvalid_cols; - temp = src[idx]; - m_temp = CONVERT_TYPE(mask[midx]); - int idx_c = idx << 2; - temploc = (VEC_TYPE_LOC)(idx_c,idx_c+1,idx_c+2,idx_c+3); - if(id % cols == 0 ) - { - repeat_ms(m_temp); - repeat_s(temploc); - } - if(id % cols == cols - 1) - { - repeat_me(m_temp); - repeat_e(temploc); - } - minval = min(minval,m_temp > zero ? temp : max_val); - maxval = max(maxval,m_temp > zero ? temp : min_val); - - temploc = CONDITION_FUNC(m_temp > zero, temploc , negative); - minloc = CONDITION_FUNC(minval == temp, temploc , minloc); - maxloc = CONDITION_FUNC(maxval == temp, temploc , maxloc); - } - if(lid > 127) - { - localmem_min[lid - 128] = minval; - localmem_max[lid - 128] = maxval; - localmem_minloc[lid - 128] = minloc; - localmem_maxloc[lid - 128] = maxloc; - } - barrier(CLK_LOCAL_MEM_FENCE); - if(lid < 128) - { - localmem_min[lid] = min(minval,localmem_min[lid]); - localmem_max[lid] = max(maxval,localmem_max[lid]); - localmem_minloc[lid] = CONDITION_FUNC(localmem_min[lid] == minval, minloc , localmem_minloc[lid]); - localmem_maxloc[lid] = CONDITION_FUNC(localmem_max[lid] == maxval, maxloc , localmem_maxloc[lid]); - } - barrier(CLK_LOCAL_MEM_FENCE); - for(int lsize = 64; lsize > 0; lsize >>= 1) - { - if(lid < lsize) - { - int lid2 = lsize + lid; - localmem_min[lid] = min(localmem_min[lid] , localmem_min[lid2]); - localmem_max[lid] = max(localmem_max[lid] , localmem_max[lid2]); - localmem_minloc[lid] = - CONDITION_FUNC(localmem_min[lid] == localmem_min[lid2], localmem_minloc[lid2] , localmem_minloc[lid]); - localmem_maxloc[lid] = - CONDITION_FUNC(localmem_max[lid] == localmem_max[lid2], localmem_maxloc[lid2] , localmem_maxloc[lid]); - } - barrier(CLK_LOCAL_MEM_FENCE); - } - if( lid == 0) - { - dst[gid] = CONVERT_RES_TYPE(localmem_min[0]); - dst[gid + groupnum] = CONVERT_RES_TYPE(localmem_max[0]); - dst[gid + 2 * groupnum] = CONVERT_RES_TYPE(localmem_minloc[0]); - dst[gid + 3 * groupnum] = CONVERT_RES_TYPE(localmem_maxloc[0]); - } -} - -*/ From c8821bd90914ab3682d47061fa5acada01b1ddb9 Mon Sep 17 00:00:00 2001 From: Ilya Lavrenov Date: Wed, 25 Sep 2013 15:28:12 +0400 Subject: [PATCH 16/17] replaced manually new/delete by AutoBuffer --- modules/ocl/src/arithm.cpp | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/modules/ocl/src/arithm.cpp b/modules/ocl/src/arithm.cpp index 8d502ea56..0dd695bfa 100644 --- a/modules/ocl/src/arithm.cpp +++ b/modules/ocl/src/arithm.cpp @@ -354,13 +354,11 @@ Scalar arithmetic_sum(const oclMat &src, int type = 0) CV_Assert(groupnum != 0); int vlen = src.oclchannels() == 3 ? 
12 : 8, dbsize = groupnum * vlen; Context *clCxt = src.clCxt; - T *p = new T[dbsize]; + + AutoBuffer _buf(dbsize); + T *p = (T*)_buf; cl_mem dstBuffer = openCLCreateBuffer(clCxt, CL_MEM_WRITE_ONLY, dbsize * sizeof(T)); - Scalar s; - s.val[0] = 0.0; - s.val[1] = 0.0; - s.val[2] = 0.0; - s.val[3] = 0.0; + Scalar s = Scalar::all(0.0); arithmetic_sum_buffer_run(src, dstBuffer, vlen, groupnum, type); memset(p, 0, dbsize * sizeof(T)); @@ -370,7 +368,7 @@ Scalar arithmetic_sum(const oclMat &src, int type = 0) for (int j = 0; j < src.oclchannels(); j++, i++) s.val[j] += p[i]; } - delete[] p; + openCLFree(dstBuffer); return s; } @@ -1160,8 +1158,10 @@ void arithmetic_minMaxLoc(const oclMat &src, double *minVal, double *maxVal, else arithmetic_minMaxLoc_mask_run(src, mask, dstBuffer, vlen, groupnum); - T *p = new T[groupnum * vlen * 4]; + AutoBuffer _buf(groupnum * vlen * 4); + T *p = (T*)_buf; memset(p, 0, dbsize); + openCLReadBuffer(clCxt, dstBuffer, (void *)p, dbsize); for (int i = 0; i < vlen * (int)groupnum; i++) { @@ -1197,7 +1197,6 @@ void arithmetic_minMaxLoc(const oclMat &src, double *minVal, double *maxVal, else maxLoc->x = maxLoc->y = -1; } - delete[] p; openCLSafeCall(clReleaseMemObject(dstBuffer)); } @@ -1266,7 +1265,9 @@ int cv::ocl::countNonZero(const oclMat &src) int vlen = 8 , dbsize = groupnum * vlen; Context *clCxt = src.clCxt; string kernelName = "arithm_op_nonzero"; - int *p = new int[dbsize], nonzero = 0; + + AutoBuffer _buf(dbsize); + int *p = (int*)_buf, nonzero = 0; cl_mem dstBuffer = openCLCreateBuffer(clCxt, CL_MEM_WRITE_ONLY, dbsize * sizeof(int)); arithmetic_countNonZero_run(src, dstBuffer, vlen, groupnum, kernelName); @@ -1275,7 +1276,6 @@ int cv::ocl::countNonZero(const oclMat &src) for (int i = 0; i < dbsize; i++) nonzero += p[i]; - delete[] p; openCLSafeCall(clReleaseMemObject(dstBuffer)); return nonzero; } From f5af3ab851195b337aef03451080d9182f867c7a Mon Sep 17 00:00:00 2001 From: Ilya Lavrenov Date: Wed, 25 Sep 2013 16:18:04 +0400 Subject: [PATCH 17/17] changes in OpenCL matrix operations docs --- modules/ocl/doc/operations_on_matrices.rst | 333 ++++++++++----------- modules/ocl/include/opencv2/ocl/ocl.hpp | 115 ++++--- modules/ocl/src/matrix_operations.cpp | 13 +- 3 files changed, 222 insertions(+), 239 deletions(-) diff --git a/modules/ocl/doc/operations_on_matrices.rst b/modules/ocl/doc/operations_on_matrices.rst index e47e72092..39888e28c 100644 --- a/modules/ocl/doc/operations_on_matrices.rst +++ b/modules/ocl/doc/operations_on_matrices.rst @@ -7,29 +7,29 @@ ocl::oclMat::convertTo ---------------------- Returns void -.. ocv:function:: void ocl::oclMat::convertTo( oclMat &m, int rtype, double alpha = 1, double beta = 0 ) const +.. ocv:function:: void ocl::oclMat::convertTo(oclMat &m, int rtype, double alpha = 1, double beta = 0) const - :param m: The destination matrix. If it does not have a proper size or type before the operation, it will be reallocated + :param m: the destination matrix. If it does not have a proper size or type before the operation, it will be reallocated. - :param rtype: The desired destination matrix type, or rather, the depth(since the number of channels will be the same with the source one). If rtype is negative, the destination matrix will have the same type as the source. + :param rtype: the desired destination matrix type, or rather, the depth (since the number of channels will be the same with the source one). If rtype is negative, the destination matrix will have the same type as the source. 
- :param alpha: must be default now + :param alpha: optional scale factor. - :param beta: must be default now + :param beta: optional delta added to the scaled values. -The method converts source pixel values to the target datatype. saturate cast is applied in the end to avoid possible overflows. Supports CV_8UC1, CV_8UC4, CV_32SC1, CV_32SC4, CV_32FC1, CV_32FC4. +The method converts source pixel values to the target datatype. Saturate cast is applied in the end to avoid possible overflows. Supports all data types. ocl::oclMat::copyTo ------------------- Returns void -.. ocv:function:: void ocl::oclMat::copyTo( oclMat &m, const oclMat &mask ) const +.. ocv:function:: void ocl::oclMat::copyTo(oclMat &m, const oclMat &mask = oclMat()) const - :param m: The destination matrix. If it does not have a proper size or type before the operation, it will be reallocated + :param m: The destination matrix. If it does not have a proper size or type before the operation, it will be reallocated. - :param mask(optional): The operation mask. Its non-zero elements indicate, which matrix elements need to be copied + :param mask: The operation mask. Its non-zero elements indicate, which matrix elements need to be copied. -Copies the matrix to another one. Supports CV_8UC1, CV_8UC4, CV_32SC1, CV_32SC4, CV_32FC1, CV_32FC4 +Copies the matrix to another one. Supports all data types. ocl::oclMat::setTo ------------------ @@ -37,171 +37,163 @@ Returns oclMat .. ocv:function:: oclMat& ocl::oclMat::setTo(const Scalar &s, const oclMat &mask = oclMat()) - :param s: Assigned scalar, which is converted to the actual array type + :param s: Assigned scalar, which is converted to the actual array type. - :param mask: The operation mask of the same size as ``*this`` + :param mask: The operation mask of the same size as ``*this`` and type ``CV_8UC1``. -Sets all or some of the array elements to the specified value. This is the advanced variant of Mat::operator=(const Scalar s) operator. Supports CV_8UC1, CV_8UC4, CV_32SC1, CV_32SC4, CV_32FC1, CV_32FC4. +Sets all or some of the array elements to the specified value. This is the advanced variant of Mat::operator=(const Scalar s) operator. Supports all data types. ocl::absdiff ------------------ Returns void -.. ocv:function:: void ocl::absdiff( const oclMat& a, const oclMat& b, oclMat& c ) +.. ocv:function:: void ocl::absdiff(const oclMat& src1, const oclMat& src2, oclMat& dst) -.. ocv:function:: void ocl::absdiff( const oclMat& a, const Scalar& s, oclMat& c ) +.. ocv:function:: void ocl::absdiff(const oclMat& src1, const Scalar& s, oclMat& dst) + :param src1: the first input array. - :param a: The first input array + :param src2: the second input array, must be the same size and same type as ``src1``. - :param b: The second input array, must be the same size and same type as a + :param s: scalar, the second input parameter. - :param s: Scalar, the second input parameter + :param dst: the destination array, it will have the same size and same type as ``src1``. - :param c: The destination array, it will have the same size and same type as a - -Computes per-element absolute difference between two arrays or between array and a scalar. Supports all data types except CV_8S. +Computes per-element absolute difference between two arrays or between array and a scalar. Supports all data types. ocl::add ------------------ Returns void -.. ocv:function:: void ocl::add( const oclMat & a, const oclMat & b, oclMat & c ) +.. 
ocv:function:: void ocl::add(const oclMat & src1, const oclMat & src2, oclMat & dst, const oclMat & mask = oclMat()) -.. ocv:function:: void ocl::add( const oclMat & a, const oclMat & b, oclMat & c, const oclMat & mask ) +.. ocv:function:: void ocl::add(const oclMat & src1, const Scalar & s, oclMat & dst, const oclMat & mask = oclMat()) -.. ocv:function:: void ocl::add( const oclMat & a, const Scalar & sc, oclMat & c, const oclMat & mask=oclMat() ) + :param src1: the first input array. - :param a: The first input array + :param src2: the second input array, must be the same size and same type as ``src1``. - :param b: The second input array, must be the same size and same type as src1 + :param s: scalar, the second input parameter - :param sc: Scalar, the second input parameter + :param dst: the destination array, it will have the same size and same type as ``src1``. - :param c: The destination array, it will have the same size and same type as src1 + :param mask: the optional operation mask, 8-bit single channel array; specifies elements of the destination array to be changed. - :param mask: he optional operation mask, 8-bit single channel array; specifies elements of the destination array to be changed - -Computes per-element additon between two arrays or between array and a scalar. Supports all data types except CV_8S. +Computes per-element additon between two arrays or between array and a scalar. Supports all data types. ocl::subtract ------------------ Returns void -.. ocv:function:: void ocl::subtract( const oclMat& a, const oclMat& b, oclMat& c ) +.. ocv:function:: void ocl::subtract(const oclMat& src1, const oclMat& src2, oclMat& dst, const oclMat& mask = oclMat()) -.. ocv:function:: void ocl::subtract( const oclMat& a, const oclMat& b, oclMat& c, const oclMat& mask ) +.. ocv:function:: void ocl::subtract(const oclMat& src1, const Scalar& s, oclMat& dst, const oclMat& mask = oclMat()) -.. ocv:function:: void ocl::subtract( const oclMat& a, const Scalar& sc, oclMat& c, const oclMat& mask=oclMat() ) + :param src1: the first input array. -.. ocv:function:: void ocl::subtract( const Scalar& sc, const oclMat& a, oclMat& c, const oclMat& mask=oclMat() ) + :param src2: the second input array, must be the same size and same type as ``src1``. + :param s: scalar, the second input parameter. - :param a: The first input array + :param dst: the destination array, it will have the same size and same type as ``src1``. - :param b: The second input array, must be the same size and same type as src1 + :param mask: the optional operation mask, 8-bit single channel array; specifies elements of the destination array to be changed. - :param sc: Scalar, the second input parameter - - :param c: The destination array, it will have the same size and same type as src1 - - :param mask: he optional operation mask, 8-bit single channel array; specifies elements of the destination array to be changed - -Computes per-element subtract between two arrays or between array and a scalar. Supports all data types except CV_8S. +Computes per-element subtract between two arrays or between array and a scalar. Supports all data types. ocl::multiply ------------------ Returns void -.. ocv:function:: void ocl::multiply( const oclMat& a, const oclMat& b, oclMat& c, double scale=1 ) +.. ocv:function:: void ocl::multiply(const oclMat& src1, const oclMat& src2, oclMat& dst, double scale = 1) - :param a: The first input array + :param src1: the first input array. 
- :param b: The second input array, must be the same size and same type as src1 + :param src2: the second input array, must be the same size and same type as ``src1``. - :param c: The destination array, it will have the same size and same type as src1 + :param dst: the destination array, it will have the same size and same type as ``src1``. - :param scale: must be 1 now + :param scale: optional scale factor. -Computes per-element multiply between two arrays or between array and a scalar. Supports all data types except CV_8S. +Computes per-element multiply between two arrays or between array and a scalar. Supports all data types. ocl::divide ------------------ Returns void -.. ocv:function:: void ocl::divide( const oclMat& a, const oclMat& b, oclMat& c, double scale=1 ) +.. ocv:function:: void ocl::divide(const oclMat& src1, const oclMat& src2, oclMat& dst, double scale = 1) -.. ocv:function:: void ocl::divide( double scale, const oclMat& b, oclMat& c ) +.. ocv:function:: void ocl::divide(double scale, const oclMat& src1, oclMat& dst) - :param a: The first input array + :param src1: the first input array. - :param b: The second input array, must be the same size and same type as src1 + :param src2: the second input array, must be the same size and same type as ``src1``. - :param c: The destination array, it will have the same size and same type as src1 + :param dst: the destination array, it will have the same size and same type as ``src1``. - :param scale: must be 1 now + :param scale: scalar factor. -Computes per-element divide between two arrays or between array and a scalar. Supports all data types except CV_8S. +Computes per-element divide between two arrays or between array and a scalar. Supports all data types. ocl::bitwise_and ------------------ Returns void -.. ocv:function:: void ocl::bitwise_and( const oclMat& src1, const oclMat& src2, oclMat& dst, const oclMat& mask=oclMat() ) +.. ocv:function:: void ocl::bitwise_and(const oclMat& src1, const oclMat& src2, oclMat& dst, const oclMat& mask = oclMat()) -.. ocv:function:: void ocl::bitwise_and( const oclMat& src1, const Scalar& s, oclMat& dst, const oclMat& mask=oclMat() ) +.. ocv:function:: void ocl::bitwise_and(const oclMat& src1, const Scalar& s, oclMat& dst, const oclMat& mask = oclMat()) - :param src1: The first input array + :param src1: the first input array. - :param src2: The second input array, must be the same size and same type as src1 + :param src2: the second input array, must be the same size and same type as ``src1``. - :param s: Scalar, the second input parameter + :param s: scalar, the second input parameter. - :param dst: The destination array, it will have the same size and same type as src1 + :param dst: the destination array, it will have the same size and same type as ``src1``. - :param mask: The optional operation mask, 8-bit single channel array; specifies elements of the destination array to be changed + :param mask: the optional operation mask, 8-bit single channel array; specifies elements of the destination array to be changed. -Computes per-element bitwise_and between two arrays or between array and a scalar. Supports all data types except CV_8S. +Computes per-element bitwise_and between two arrays or between array and a scalar. Supports all data types. ocl::bitwise_or ------------------ Returns void -.. ocv:function:: void ocl::bitwise_or( const oclMat& src1, const oclMat& src2, oclMat& dst, const oclMat& mask=oclMat() ) +.. 
ocv:function:: void ocl::bitwise_or(const oclMat& src1, const oclMat& src2, oclMat& dst, const oclMat& mask = oclMat()) -.. ocv:function:: void ocl::bitwise_or( const oclMat& src1, const Scalar& s, oclMat& dst, const oclMat& mask=oclMat() ) +.. ocv:function:: void ocl::bitwise_or(const oclMat& src1, const Scalar& s, oclMat& dst, const oclMat& mask = oclMat()) - :param src1: The first input array + :param src1: the first input array. - :param src2: The second input array, must be the same size and same type as src1 + :param src2: the second input array, must be the same size and same type as ``src1``. - :param s: Scalar, the second input parameter + :param s: scalar, the second input parameter. - :param dst: The destination array, it will have the same size and same type as src1 + :param dst: the destination array, it will have the same size and same type as ``src1``. - :param mask: The optional operation mask, 8-bit single channel array; specifies elements of the destination array to be changed + :param mask: the optional operation mask, 8-bit single channel array; specifies elements of the destination array to be changed. -Computes per-element bitwise_or between two arrays or between array and a scalar. Supports all data types except CV_8S. +Computes per-element bitwise_or between two arrays or between array and a scalar. Supports all data types. ocl::bitwise_xor ------------------ Returns void -.. ocv:function:: void ocl::bitwise_xor( const oclMat& src1, const oclMat& src2, oclMat& dst, const oclMat& mask=oclMat() ) +.. ocv:function:: void ocl::bitwise_xor(const oclMat& src1, const oclMat& src2, oclMat& dst, const oclMat& mask = oclMat()) -.. ocv:function:: void ocl::bitwise_xor( const oclMat& src1, const Scalar& s, oclMat& dst, const oclMat& mask=oclMat() ) +.. ocv:function:: void ocl::bitwise_xor(const oclMat& src1, const Scalar& s, oclMat& dst, const oclMat& mask = oclMat()) - :param src1: The first input array + :param src1: the first input array. - :param src2: The second input array, must be the same size and same type as src1 + :param src2: the second input array, must be the same size and same type as ``src1``. - :param sc: Scalar, the second input parameter + :param sc: scalar, the second input parameter. - :param dst: The destination array, it will have the same size and same type as src1 + :param dst: the destination array, it will have the same size and same type as ``src1``. - :param mask: The optional operation mask, 8-bit single channel array; specifies elements of the destination array to be changed + :param mask: the optional operation mask, 8-bit single channel array; specifies elements of the destination array to be changed. -Computes per-element bitwise_xor between two arrays or between array and a scalar. Supports all data types except CV_8S. +Computes per-element bitwise_xor between two arrays or between array and a scalar. Supports all data types. ocl::bitwise_not ------------------ @@ -209,11 +201,11 @@ Returns void .. ocv:function:: void ocl::bitwise_not(const oclMat &src, oclMat &dst) - :param src: The input array + :param src: the input array. - :param dst: The destination array, it will have the same size and same type as src1 + :param dst: the destination array, it will have the same size and same type as ``src``. -The functions bitwise not compute per-element bit-wise inversion of the source array:. Supports all data types except CV_8S. +The functions bitwise not compute per-element bit-wise inversion of the source array. Supports all data types. 
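For illustration only, a minimal host-side sketch of how the element-wise routines documented above are typically called (assuming the ocl module has already been initialised with a suitable OpenCL device; the wrapper name ``addAndInvert`` is purely illustrative and not part of the library)::

    #include "opencv2/core/core.hpp"
    #include "opencv2/ocl/ocl.hpp"

    using namespace cv;

    // Computes ~(a + b) element-wise on the OpenCL device and downloads the result.
    void addAndInvert(const Mat &a, const Mat &b, Mat &result)
    {
        // Constructing an oclMat from a Mat uploads the data to the device.
        ocl::oclMat d_a(a), d_b(b), d_sum, d_out;

        ocl::add(d_a, d_b, d_sum);        // element-wise sum, same size/type as inputs
        ocl::bitwise_not(d_sum, d_out);   // element-wise bit-wise inversion

        // Casting an oclMat to Mat downloads the device data back to host memory.
        result = (Mat)d_out;
    }
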
ocl::cartToPolar ------------------ @@ -221,17 +213,17 @@ Returns void .. ocv:function:: void ocl::cartToPolar(const oclMat &x, const oclMat &y, oclMat &magnitude, oclMat &angle, bool angleInDegrees = false) - :param x: The array of x-coordinates; must be single-precision or double-precision floating-point array + :param x: the array of x-coordinates; must be single-precision or double-precision floating-point array. - :param y: The array of y-coordinates; it must have the same size and same type as x + :param y: the array of y-coordinates; it must have the same size and same type as ``x``. - :param magnitude: The destination array of magnitudes of the same size and same type as x + :param magnitude: the destination array of magnitudes of the same size and same type as ``x``. - :param angle: The destination array of angles of the same size and same type as x. The angles are measured in radians (0 to 2pi ) or in degrees (0 to 360 degrees). + :param angle: the destination array of angles of the same size and same type as ``x``. The angles are measured in radians (0 to 2pi) or in degrees (0 to 360 degrees). - :param angleInDegrees: The flag indicating whether the angles are measured in radians, which is default mode, or in degrees + :param angleInDegrees: the flag indicating whether the angles are measured in radians, which is default mode, or in degrees. -Calculates the magnitude and angle of 2d vectors. Supports only CV_32F and CV_64F data types. +Calculates the magnitude and angle of 2D vectors. Supports only ``CV_32F`` and ``CV_64F`` data types. ocl::polarToCart ------------------ @@ -239,57 +231,57 @@ Returns void .. ocv:function:: void ocl::polarToCart(const oclMat &magnitude, const oclMat &angle, oclMat &x, oclMat &y, bool angleInDegrees = false) - :param magnitude: The source floating-point array of magnitudes of 2D vectors. It can be an empty matrix (=Mat()) - in this case the function assumes that all the magnitudes are =1. If it's not empty, it must have the same size and same type as angle + :param magnitude: the source floating-point array of magnitudes of 2D vectors. It can be an empty matrix (=Mat()) - in this case the function assumes that all the magnitudes are = 1. If it's not empty, it must have the same size and same type as ``angle``. - :param angle: The source floating-point array of angles of the 2D vectors + :param angle: the source floating-point array of angles of the 2D vectors. - :param x: The destination array of x-coordinates of 2D vectors; will have the same size and the same type as angle + :param x: the destination array of x-coordinates of 2D vectors; will have the same size and the same type as ``angle``. - :param y: The destination array of y-coordinates of 2D vectors; will have the same size and the same type as angle + :param y: the destination array of y-coordinates of 2D vectors; will have the same size and the same type as ``angle``. - :param angleInDegrees: The flag indicating whether the angles are measured in radians, which is default mode, or in degrees + :param angleInDegrees: the flag indicating whether the angles are measured in radians, which is default mode, or in degrees. -The function polarToCart computes the cartesian coordinates of each 2D vector represented by the corresponding elements of magnitude and angle. Supports only CV_32F and CV_64F data types. +The function polarToCart computes the cartesian coordinates of each 2D vector represented by the corresponding elements of magnitude and angle. 
Supports only ``CV_32F`` and ``CV_64F`` data types. ocl::compare ------------------ Returns void -.. ocv:function:: void ocl::compare(const oclMat &a, const oclMat &b, oclMat &c, int cmpop) +.. ocv:function:: void ocl::compare(const oclMat &src1, const oclMat &src2, oclMat &dst, int cmpop) - :param a: The first source array + :param src1: the first source array. - :param b: The second source array; must have the same size and same type as a + :param src2: the second source array; must have the same size and same type as ``src1``. - :param c: The destination array; will have the same size as a + :param dst: the destination array; will have the same size as ``src1`` and type ``CV_8UC1``. - :param cmpop: The flag specifying the relation between the elements to be checked + :param cmpop: the flag specifying the relation between the elements to be checked. -Performs per-element comparison of two arrays or an array and scalar value. Supports all the 1 channel data types except CV_8S. +Performs per-element comparison of two arrays or an array and scalar value. Supports all data types. ocl::exp ------------------ Returns void -.. ocv:function:: void ocl::exp(const oclMat &a, oclMat &b) +.. ocv:function:: void ocl::exp(const oclMat &src, oclMat &dst) - :param a: The first source array + :param src: the first source array. - :param b: The dst array; must have the same size and same type as a + :param dst: the dst array; must have the same size and same type as ``src``. -The function exp calculates the exponent of every element of the input array. Supports only CV_32FC1 data type. +The function exp calculates the exponent of every element of the input array. Supports only ``CV_32FC1`` and ``CV_64F`` data types. ocl::log ------------------ Returns void -.. ocv:function:: void ocl::log(const oclMat &a, oclMat &b) +.. ocv:function:: void ocl::log(const oclMat &src, oclMat &dst) - :param a: The first source array + :param src: the first source array. - :param b: The dst array; must have the same size and same type as a + :param dst: the dst array; must have the same size and same type as ``src``. -The function log calculates the log of every element of the input array. Supports only CV_32FC1 data type. +The function log calculates the log of every element of the input array. Supports only ``CV_32FC1`` and ``CV_64F`` data types. ocl::LUT ------------------ @@ -297,13 +289,13 @@ Returns void .. ocv:function:: void ocl::LUT(const oclMat &src, const oclMat &lut, oclMat &dst) - :param src: Source array of 8-bit elements + :param src: source array of 8-bit elements. - :param lut: Look-up table of 256 elements. In the case of multi-channel source array, the table should either have a single channel (in this case the same table is used for all channels) or the same number of channels as in the source array + :param lut: look-up table of 256 elements. In the case of multi-channel source array, the table should either have a single channel (in this case the same table is used for all channels) or the same number of channels as in the source array. - :param dst: Destination array; will have the same size and the same number of channels as src, and the same depth as lut + :param dst: destination array; will have the same size and the same number of channels as ``src``, and the same depth as ``lut``. -Performs a look-up table transform of an array. Supports only CV_8UC1 and CV_8UC4 data type. +Performs a look-up table transform of an array. ocl::magnitude ------------------ @@ -311,25 +303,25 @@ Returns void .. 
ocv:function:: void ocl::magnitude(const oclMat &x, const oclMat &y, oclMat &magnitude) - :param x: The floating-point array of x-coordinates of the vectors + :param x: the floating-point array of x-coordinates of the vectors. - :param y: he floating-point array of y-coordinates of the vectors; must have the same size as x + :param y: the floating-point array of y-coordinates of the vectors; must have the same size as ``x``. - :param magnitude: The destination array; will have the same size and same type as x + :param magnitude: the destination array; will have the same size and same type as ``x``. -The function magnitude calculates magnitude of 2D vectors formed from the corresponding elements of x and y arrays. Supports only CV_32F and CV_64F data type. +The function magnitude calculates magnitude of 2D vectors formed from the corresponding elements of ``x`` and ``y`` arrays. Supports only ``CV_32F`` and ``CV_64F`` data types. ocl::flip ------------------ Returns void -.. ocv:function:: void ocl::flip( const oclMat& a, oclMat& b, int flipCode ) +.. ocv:function:: void ocl::flip(const oclMat& src, oclMat& dst, int flipCode) - :param a: Source image. + :param src: source image. - :param b: Destination image + :param dst: destination image. - :param flipCode: Specifies how to flip the array: 0 means flipping around the x-axis, positive (e.g., 1) means flipping around y-axis, and negative (e.g., -1) means flipping around both axes. + :param flipCode: specifies how to flip the array: 0 means flipping around the x-axis, positive (e.g., 1) means flipping around y-axis, and negative (e.g., -1) means flipping around both axes. The function flip flips the array in one of three different ways (row and column indices are 0-based). Supports all data types. @@ -339,13 +331,13 @@ Returns void .. ocv:function:: void ocl::meanStdDev(const oclMat &mtx, Scalar &mean, Scalar &stddev) - :param mtx: Source image. + :param mtx: source image. - :param mean: The output parameter: computed mean value + :param mean: the output parameter: computed mean value. - :param stddev: The output parameter: computed standard deviation + :param stddev: the output parameter: computed standard deviation. -The functions meanStdDev compute the mean and the standard deviation M of array elements, independently for each channel, and return it via the output parameters. Supports all data types except CV_32F,CV_64F +The functions meanStdDev compute the mean and the standard deviation M of array elements, independently for each channel, and return it via the output parameters. Supports all data types except ``CV_32F``, ``CV_64F``. ocl::merge ------------------ @@ -353,9 +345,9 @@ Returns void .. ocv:function:: void ocl::merge(const vector &src, oclMat &dst) - :param src: The source array or vector of the single-channel matrices to be merged. All the matrices in src must have the same size and the same type + :param src: The source array or vector of the single-channel matrices to be merged. All the matrices in src must have the same size and the same type. - :param dst: The destination array; will have the same size and the same depth as src, the number of channels will match the number of source matrices + :param dst: The destination array; will have the same size and the same depth as src, the number of channels will match the number of source matrices. Composes a multi-channel array from several single-channel arrays. Supports all data types. @@ -379,13 +371,13 @@ Returns the calculated norm .. 
ocv:function:: double ocl::norm(const oclMat &src1, const oclMat &src2, int normType = NORM_L2) - :param src1: The first source array + :param src1: the first source array. - :param src2: The second source array of the same size and the same type as src1 + :param src2: the second source array of the same size and the same type as ``src1``. - :param normType: Type of the norm + :param normType: type of the norm. -Calculates absolute array norm, absolute difference norm, or relative difference norm. Supports only CV_8UC1 data type. +Calculates absolute array norm, absolute difference norm, or relative difference norm. Supports only ``CV_8UC1`` data type. ocl::phase ------------------ @@ -393,15 +385,15 @@ Returns void .. ocv:function:: void ocl::phase(const oclMat &x, const oclMat &y, oclMat &angle, bool angleInDegrees = false) - :param x: The source floating-point array of x-coordinates of 2D vectors + :param x: the source floating-point array of x-coordinates of 2D vectors - :param y: The source array of y-coordinates of 2D vectors; must have the same size and the same type as x + :param y: the source array of y-coordinates of 2D vectors; must have the same size and the same type as ``x``. - :param angle: The destination array of vector angles; it will have the same size and same type as x + :param angle: the destination array of vector angles; it will have the same size and same type as ``x``. - :param angleInDegrees: When it is true, the function will compute angle in degrees, otherwise they will be measured in radians + :param angleInDegrees: when it is true, the function will compute angle in degrees, otherwise they will be measured in radians. -The function phase computes the rotation angle of each 2D vector that is formed from the corresponding elements of x and y. Supports only CV_32FC1 and CV_64FC1 data type. +The function phase computes the rotation angle of each 2D vector that is formed from the corresponding elements of ``x`` and ``y``. Supports only ``CV_32FC1`` and ``CV_64FC1`` data type. ocl::pow ------------------ @@ -409,13 +401,13 @@ Returns void .. ocv:function:: void ocl::pow(const oclMat &x, double p, oclMat &y) - :param x: The source array + :param x: the source array. - :param power: The exponent of power;The source floating-point array of angles of the 2D vectors + :param p: the exponent of power; the source floating-point array of angles of the 2D vectors. - :param y: The destination array, should be the same type as the source + :param y: the destination array, should be the same type as the source. -The function pow raises every element of the input array to p. Supports only CV_32FC1 and CV_64FC1 data type. +The function pow raises every element of the input array to ``p``. Supports only ``CV_32FC1`` and ``CV_64FC1`` data types. ocl::transpose ------------------ @@ -423,26 +415,26 @@ Returns void .. ocv:function:: void ocl::transpose(const oclMat &src, oclMat &dst) - :param src: The source array + :param src: the source array. - :param dst: The destination array of the same type as src + :param dst: the destination array of the same type as ``src``. -Transposes a matrix. Supports 8UC1, 8UC4, 8SC4, 16UC2, 16SC2, 32SC1 and 32FC1 data types. +Transposes a matrix (in case when ``src`` == ``dst`` and matrix is square the operation are performed inplace) ocl::dft ------------ Performs a forward or inverse discrete Fourier transform (1D or 2D) of the floating point matrix. -.. 
ocv:function:: void ocl::dft( const oclMat& src, oclMat& dst, Size dft_size=Size(0, 0), int flags=0 ) +.. ocv:function:: void ocl::dft(const oclMat& src, oclMat& dst, Size dft_size = Size(), int flags = 0) - :param src: Source matrix (real or complex). + :param src: source matrix (real or complex). - :param dst: Destination matrix (real or complex). + :param dst: destination matrix (real or complex). - :param dft_size: Size of original input, which is used for transformation from complex to real. + :param dft_size: size of original input, which is used for transformation from complex to real. - :param flags: Optional flags: + :param flags: optional flags: * **DFT_ROWS** transforms each individual row of the source matrix. @@ -452,9 +444,9 @@ Performs a forward or inverse discrete Fourier transform (1D or 2D) of the float * **DFT_REAL_OUTPUT** specifies the output as real. The source matrix is the result of real-complex transform, so the destination matrix must be real. -Use to handle real matrices ( ``CV32FC1`` ) and complex matrices in the interleaved format ( ``CV32FC2`` ). +Use to handle real matrices (``CV_32FC1``) and complex matrices in the interleaved format (``CV_32FC2``). -The dft_size must be powers of 2, 3 and 5. Real to complex dft output is not the same with cpu version. real to complex and complex to real does not support DFT_ROWS +The ``dft_size`` must be powers of ``2``, ``3`` and ``5``. Real to complex dft output is not the same with cpu version. Real to complex and complex to real does not support ``DFT_ROWS``. .. seealso:: :ocv:func:`dft` @@ -464,22 +456,22 @@ Performs generalized matrix multiplication. .. ocv:function:: void ocl::gemm(const oclMat& src1, const oclMat& src2, double alpha, const oclMat& src3, double beta, oclMat& dst, int flags = 0) - :param src1: First multiplied input matrix that should be ``CV_32FC1`` type. + :param src1: first multiplied input matrix that should be ``CV_32FC1`` type. - :param src2: Second multiplied input matrix of the same type as ``src1`` . + :param src2: second multiplied input matrix of the same type as ``src1``. - :param alpha: Weight of the matrix product. + :param alpha: weight of the matrix product. - :param src3: Third optional delta matrix added to the matrix product. It should have the same type as ``src1`` and ``src2`` . + :param src3: third optional delta matrix added to the matrix product. It should have the same type as ``src1`` and ``src2``. - :param beta: Weight of ``src3`` . + :param beta: weight of ``src3``. - :param dst: Destination matrix. It has the proper size and the same type as input matrices. + :param dst: destination matrix. It has the proper size and the same type as input matrices. - :param flags: Operation flags: + :param flags: operation flags: - * **GEMM_1_T** transpose ``src1`` - * **GEMM_2_T** transpose ``src2`` + * **GEMM_1_T** transpose ``src1``. + * **GEMM_2_T** transpose ``src2``. .. seealso:: :ocv:func:`gemm` @@ -489,28 +481,29 @@ Returns void .. ocv:function:: void ocl::sortByKey(oclMat& keys, oclMat& values, int method, bool isGreaterThan = false) - :param keys: The keys to be used as sorting indices. + :param keys: the keys to be used as sorting indices. - :param values: The array of values. + :param values: the array of values. - :param isGreaterThan: Determine sorting order. + :param isGreaterThan: determine sorting order. 
     :param method: supported sorting methods:
-        * **SORT_BITONIC** bitonic sort, only support power-of-2 buffer size
-        * **SORT_SELECTION** selection sort, currently cannot sort duplicate keys
-        * **SORT_MERGE** merge sort
-        * **SORT_RADIX** radix sort, only support signed int/float keys(``CV_32S``/``CV_32F``)
+
+        * **SORT_BITONIC** bitonic sort, only supports power-of-2 buffer sizes.
+        * **SORT_SELECTION** selection sort, currently cannot sort duplicate keys.
+        * **SORT_MERGE** merge sort.
+        * **SORT_RADIX** radix sort, only supports signed int/float keys (``CV_32S``/``CV_32F``).
 Returns the sorted result of all the elements in values based on equivalent keys.
-The element unit in the values to be sorted is determined from the data type,
-i.e., a ``CV_32FC2`` input ``{a1a2, b1b2}`` will be considered as two elements, regardless its matrix dimension.
+The element unit in the values to be sorted is determined from the data type, i.e., a ``CV_32FC2`` input ``{a1a2, b1b2}`` will be considered as two elements, regardless of its matrix dimensions.
 Both keys and values will be sorted inplace.
-Keys needs to be a **single** channel `oclMat`.
+Keys need to be a **single** channel ``oclMat``.
 Example::
+
     input -
     keys = {2, 3, 1} (CV_8UC1)
     values = {10,5, 4,3, 6,2} (CV_8UC2)
diff --git a/modules/ocl/include/opencv2/ocl/ocl.hpp b/modules/ocl/include/opencv2/ocl/ocl.hpp
index d46ad503e..d3dbded34 100644
--- a/modules/ocl/include/opencv2/ocl/ocl.hpp
+++ b/modules/ocl/include/opencv2/ocl/ocl.hpp
@@ -268,13 +268,12 @@ namespace cv
         //! returns deep copy of the oclMatrix, i.e. the data is copied
         oclMat clone() const;
-        //! copies the oclMatrix content to "m".
+
+        //! copies those oclMatrix elements to "m" that are marked with non-zero mask elements.
         // It calls m.create(this->size(), this->type()).
         // It supports any data type
-        void copyTo( oclMat &m ) const;
-        //! copies those oclMatrix elements to "m" that are marked with non-zero mask elements.
-        //It supports 8UC1 8UC4 32SC1 32SC4 32FC1 32FC4
-        void copyTo( oclMat &m, const oclMat &mask ) const;
+        void copyTo( oclMat &m, const oclMat &mask = oclMat()) const;
+
         //! converts oclMatrix to another datatype with optional scalng. See cvConvertScale.
         //It supports 8UC1 8UC4 32SC1 32SC4 32FC1 32FC4
         void convertTo( oclMat &m, int rtype, double alpha = 1, double beta = 0 ) const;
@@ -410,57 +409,51 @@ namespace cv
         ////////////////////////////// Arithmetics ///////////////////////////////////
-        //#if defined DOUBLE_SUPPORT
-        //typedef double F;
-        //#else
-        //typedef float F;
-        //#endif
+        //! adds one matrix to another with scale (dst = src1 * alpha + src2 * beta + gama)
+        CV_EXPORTS void addWeighted(const oclMat &src1, double alpha, const oclMat &src2, double beta, double gama, oclMat &dst);
-        // CV_EXPORTS void addWeighted(const oclMat& a,F alpha, const oclMat& b,F beta,F gama, oclMat& c);
-        CV_EXPORTS void addWeighted(const oclMat &a, double alpha, const oclMat &b, double beta, double gama, oclMat &c);
+        //! adds one matrix to another (dst = src1 + src2)
+        // supports all data types
+        CV_EXPORTS void add(const oclMat &src1, const oclMat &src2, oclMat &dst, const oclMat &mask = oclMat());
+        //! adds scalar to a matrix (dst = src1 + s)
+        // supports all data types
+        CV_EXPORTS void add(const oclMat &src1, const Scalar &s, oclMat &dst, const oclMat &mask = oclMat());
-        //! adds one matrix to another (c = a + b)
-        // supports all types except CV_8SC1,CV_8SC2,CV8SC3 and CV_8SC4
-        CV_EXPORTS void add(const oclMat &a, const oclMat &b, oclMat &c, const oclMat &mask = oclMat());
-        //! adds scalar to a matrix (c = a + s)
-        // supports all types except CV_8SC1,CV_8SC2,CV8SC3 and CV_8SC4
-        CV_EXPORTS void add(const oclMat &a, const Scalar &sc, oclMat &c, const oclMat &mask = oclMat());
+        //! subtracts one matrix from another (dst = src1 - src2)
+        // supports all data types
+        CV_EXPORTS void subtract(const oclMat &src1, const oclMat &src2, oclMat &dst, const oclMat &mask = oclMat());
+        //! subtracts scalar from a matrix (dst = src1 - s)
+        // supports all data types
+        CV_EXPORTS void subtract(const oclMat &src1, const Scalar &s, oclMat &dst, const oclMat &mask = oclMat());
-        //! subtracts one matrix from another (c = a - b)
-        // supports all types except CV_8SC1,CV_8SC2,CV8SC3 and CV_8SC4
-        CV_EXPORTS void subtract(const oclMat &a, const oclMat &b, oclMat &c, const oclMat &mask = oclMat());
-        //! subtracts scalar from a matrix (c = a - s)
-        // supports all types except CV_8SC1,CV_8SC2,CV8SC3 and CV_8SC4
-        CV_EXPORTS void subtract(const oclMat &a, const Scalar &sc, oclMat &c, const oclMat &mask = oclMat());
-
-        //! computes element-wise product of the two arrays (c = a * b)
-        // supports all types except CV_8SC1,CV_8SC2,CV8SC3 and CV_8SC4
-        CV_EXPORTS void multiply(const oclMat &a, const oclMat &b, oclMat &c, double scale = 1);
+        //! computes element-wise product of the two arrays (dst = src1 * scale * src2)
+        // supports all data types
+        CV_EXPORTS void multiply(const oclMat &src1, const oclMat &src2, oclMat &dst, double scale = 1);
         //! multiplies matrix to a number (dst = scalar * src)
-        // supports CV_32FC1 only
+        // supports all data types
         CV_EXPORTS void multiply(double scalar, const oclMat &src, oclMat &dst);
-        //! computes element-wise quotient of the two arrays (c = a / b)
-        // supports all types except CV_8SC1,CV_8SC2,CV8SC3 and CV_8SC4
-        CV_EXPORTS void divide(const oclMat &a, const oclMat &b, oclMat &c, double scale = 1);
-        //! computes element-wise quotient of the two arrays (c = a / b)
-        // supports all types except CV_8SC1,CV_8SC2,CV8SC3 and CV_8SC4
-        CV_EXPORTS void divide(double scale, const oclMat &b, oclMat &c);
+        //! computes element-wise quotient of the two arrays (dst = src1 * scale / src2)
+        // supports all data types
+        CV_EXPORTS void divide(const oclMat &src1, const oclMat &src2, oclMat &dst, double scale = 1);
+        //! computes element-wise quotient of the two arrays (dst = scale / src)
+        // supports all data types
+        CV_EXPORTS void divide(double scale, const oclMat &src1, oclMat &dst);
-        //! compares elements of two arrays (c = a <cmpop> b)
-        // supports except CV_8SC1,CV_8SC2,CV8SC3,CV_8SC4 types
-        CV_EXPORTS void compare(const oclMat &a, const oclMat &b, oclMat &c, int cmpop);
+        //! compares elements of two arrays (dst = src1 <cmpop> src2)
+        // supports all data types
+        CV_EXPORTS void compare(const oclMat &src1, const oclMat &src2, oclMat &dst, int cmpop);
         //! transposes the matrix
-        // supports CV_8UC1, 8UC4, 8SC4, 16UC2, 16SC2, 32SC1 and 32FC1.(the same as cuda)
+        // supports all data types
        CV_EXPORTS void transpose(const oclMat &src, oclMat &dst);
-        //! computes element-wise absolute difference of two arrays (c = abs(a - b))
-        // supports all types except CV_8SC1,CV_8SC2,CV8SC3 and CV_8SC4
-        CV_EXPORTS void absdiff(const oclMat &a, const oclMat &b, oclMat &c);
-        //! computes element-wise absolute difference of array and scalar (c = abs(a - s))
-        // supports all types except CV_8SC1,CV_8SC2,CV8SC3 and CV_8SC4
-        CV_EXPORTS void absdiff(const oclMat &a, const Scalar &s, oclMat &c);
+        //! computes element-wise absolute difference of two arrays (dst = abs(src1 - src2))
+        // supports all data types
+        CV_EXPORTS void absdiff(const oclMat &src1, const oclMat &src2, oclMat &dst);
+        //! computes element-wise absolute difference of array and scalar (dst = abs(src1 - s))
+        // supports all data types
+        CV_EXPORTS void absdiff(const oclMat &src1, const Scalar &s, oclMat &dst);
         //! computes mean value and standard deviation of all or selected array elements
         // supports except CV_32F,CV_64F
@@ -478,7 +471,7 @@ namespace cv
         //! reverses the order of the rows, columns or both in a matrix
         // supports all types
-        CV_EXPORTS void flip(const oclMat &a, oclMat &b, int flipCode);
+        CV_EXPORTS void flip(const oclMat &src, oclMat &dst, int flipCode);
         //! computes sum of array elements
         // disabled until fix crash
@@ -489,13 +482,11 @@ namespace cv
         //! finds global minimum and maximum array elements and returns their values
         // support all C1 types
-        CV_EXPORTS void minMax(const oclMat &src, double *minVal, double *maxVal = 0, const oclMat &mask = oclMat());
         CV_EXPORTS void minMax_buf(const oclMat &src, double *minVal, double *maxVal, const oclMat &mask, oclMat& buf);
         //! finds global minimum and maximum array elements and returns their values with locations
         // support all C1 types
-        CV_EXPORTS void minMaxLoc(const oclMat &src, double *minVal, double *maxVal = 0, Point *minLoc = 0, Point *maxLoc = 0, const oclMat &mask = oclMat());
@@ -524,27 +515,27 @@ namespace cv
         // This is not truly a bilateral filter. Instead of using user provided fixed parameters,
         // the function calculates a constant at each window based on local standard deviation,
         // and use this constant to do filtering.
-        // supports 8UC1 8UC3
+        // supports 8UC1, 8UC3
         CV_EXPORTS void adaptiveBilateralFilter(const oclMat& src, oclMat& dst, Size ksize, double sigmaSpace, Point anchor = Point(-1, -1), int borderType=BORDER_DEFAULT);
-        //! computes exponent of each matrix element (b = e**a)
-        // supports only CV_32FC1 type
-        CV_EXPORTS void exp(const oclMat &a, oclMat &b);
+        //! computes exponent of each matrix element (dst = e**src)
+        // supports only CV_32FC1, CV_64FC1 types
+        CV_EXPORTS void exp(const oclMat &src, oclMat &dst);
-        //! computes natural logarithm of absolute value of each matrix element: b = log(abs(a))
-        // supports only CV_32FC1 type
-        CV_EXPORTS void log(const oclMat &a, oclMat &b);
+        //! computes natural logarithm of absolute value of each matrix element: dst = log(abs(src))
+        // supports only CV_32FC1, CV_64FC1 types
+        CV_EXPORTS void log(const oclMat &src, oclMat &dst);
         //! computes magnitude of each (x(i), y(i)) vector
-        // supports only CV_32F CV_64F type
+        // supports only CV_32F, CV_64F types
         CV_EXPORTS void magnitude(const oclMat &x, const oclMat &y, oclMat &magnitude);
         //! computes angle (angle(i)) of each (x(i), y(i)) vector
-        // supports only CV_32F CV_64F type
+        // supports only CV_32F, CV_64F types
         CV_EXPORTS void phase(const oclMat &x, const oclMat &y, oclMat &angle, bool angleInDegrees = false);
         //! the function raises every element of tne input array to p
-        //! support only CV_32F CV_64F type
+        // supports only CV_32F, CV_64F types
         CV_EXPORTS void pow(const oclMat &x, double p, oclMat &y);
         //! converts Cartesian coordinates to polar
@@ -558,14 +549,17 @@ namespace cv
         //! perfroms per-elements bit-wise inversion
         // supports all types
         CV_EXPORTS void bitwise_not(const oclMat &src, oclMat &dst);
+
         //! calculates per-element bit-wise disjunction of two arrays
         // supports all types
         CV_EXPORTS void bitwise_or(const oclMat &src1, const oclMat &src2, oclMat &dst, const oclMat &mask = oclMat());
         CV_EXPORTS void bitwise_or(const oclMat &src1, const Scalar &s, oclMat &dst, const oclMat &mask = oclMat());
+
         //! calculates per-element bit-wise conjunction of two arrays
         // supports all types
         CV_EXPORTS void bitwise_and(const oclMat &src1, const oclMat &src2, oclMat &dst, const oclMat &mask = oclMat());
         CV_EXPORTS void bitwise_and(const oclMat &src1, const Scalar &s, oclMat &dst, const oclMat &mask = oclMat());
+
         //! calculates per-element bit-wise "exclusive or" operation
         // supports all types
         CV_EXPORTS void bitwise_xor(const oclMat &src1, const oclMat &src2, oclMat &dst, const oclMat &mask = oclMat());
@@ -585,12 +579,13 @@ namespace cv
         CV_EXPORTS oclMatExpr operator / (const oclMat &src1, const oclMat &src2);
         //! computes convolution of two images
-        //! support only CV_32FC1 type
+        // support only CV_32FC1 type
         CV_EXPORTS void convolve(const oclMat &image, const oclMat &temp1, oclMat &result);
         CV_EXPORTS void cvtColor(const oclMat &src, oclMat &dst, int code , int dcn = 0);
         CV_EXPORTS void setIdentity(oclMat& src, double val);
+
         //////////////////////////////// Filter Engine ////////////////////////////////
         /*!
@@ -982,7 +977,7 @@ namespace cv
         // real to complex dft requires at least v1.8 clAmdFft
         // real to complex dft output is not the same with cpu version
         // real to complex and complex to real does not support DFT_ROWS
-        CV_EXPORTS void dft(const oclMat &src, oclMat &dst, Size dft_size = Size(0, 0), int flags = 0);
+        CV_EXPORTS void dft(const oclMat &src, oclMat &dst, Size dft_size = Size(), int flags = 0);
         //! implements generalized matrix product algorithm GEMM from BLAS
         // The functionality requires clAmdBlas library
diff --git a/modules/ocl/src/matrix_operations.cpp b/modules/ocl/src/matrix_operations.cpp
index ff52b8a55..78d1cd4af 100644
--- a/modules/ocl/src/matrix_operations.cpp
+++ b/modules/ocl/src/matrix_operations.cpp
@@ -347,19 +347,14 @@ static void copy_to_with_mask(const oclMat &src, oclMat &dst, const oclMat &mask
                         localThreads, args, -1, -1, compile_option);
 }
-void cv::ocl::oclMat::copyTo( oclMat &m ) const
-{
-    CV_DbgAssert(!this->empty());
-    m.create(size(), type());
-    openCLCopyBuffer2D(clCxt, m.data, m.step, m.offset,
-                       data, step, cols * elemSize(), rows, offset);
-}
-
 void cv::ocl::oclMat::copyTo( oclMat &mat, const oclMat &mask) const
 {
     if (mask.empty())
     {
-        copyTo(mat);
+        CV_DbgAssert(!this->empty());
+        mat.create(size(), type());
+        openCLCopyBuffer2D(clCxt, mat.data, mat.step, mat.offset,
+                           data, step, cols * elemSize(), rows, offset);
     }
     else
    {
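
A minimal usage sketch of the consolidated ``oclMat::copyTo`` overload from the hunk above; the mask argument now defaults to an empty ``oclMat``, so the unmasked call keeps working. The sketch assumes an OpenCL-capable device is available and uses the usual ``oclMat(const Mat&)`` constructor and ``download()`` helpers for host/device transfer, which are not shown in this patch::

    #include "opencv2/ocl/ocl.hpp"

    using namespace cv;

    int main()
    {
        Mat host = Mat::ones(4, 4, CV_8UC1) * 7;
        Mat maskHost = Mat::zeros(4, 4, CV_8UC1);
        maskHost(Rect(0, 0, 2, 2)).setTo(1);   // mark the top-left 2x2 block

        ocl::oclMat src(host), mask(maskHost), dst;

        src.copyTo(dst);        // full copy: the mask defaults to an empty oclMat
        src.copyTo(dst, mask);  // copies only the elements where mask != 0

        Mat result;
        dst.download(result);   // bring the data back to the host
        return 0;
    }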
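
Similarly, a hypothetical call to the ``ocl::gemm`` interface documented earlier in this patch could look as follows. This is only a sketch: it assumes a build with clAmdBlas support (as noted in the header comment above), reuses the same host/device interop, and takes ``GEMM_2_T`` from the core gemm flags::

    #include "opencv2/ocl/ocl.hpp"

    using namespace cv;

    int main()
    {
        // dst = 2.0 * a * b^T + 1.0 * c, all matrices CV_32FC1
        Mat a = Mat::eye(3, 3, CV_32FC1);
        Mat b = (Mat_<float>(3, 3) << 1, 2, 3,
                                      4, 5, 6,
                                      7, 8, 9);
        Mat c = Mat::ones(3, 3, CV_32FC1);

        ocl::oclMat src1(a), src2(b), src3(c), dst;
        ocl::gemm(src1, src2, 2.0, src3, 1.0, dst, GEMM_2_T);

        Mat result;
        dst.download(result);   // result = 2 * a * b^T + c
        return 0;
    }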