From 5f81988699035ea16d1cdc7eb6367aaec520d0ee Mon Sep 17 00:00:00 2001
From: Ilya Lavrenov <ilya.lavrenov@itseez.com>
Date: Thu, 3 Oct 2013 20:04:04 +0400
Subject: [PATCH] refactored arithm binary operations in order to make them
 more scalable

---
 modules/ocl/src/arithm.cpp                    |  16 +--
 modules/ocl/src/matrix_operations.cpp         |  32 ++---
 modules/ocl/src/opencl/arithm_add.cl          | 117 +++++++-----------
 modules/ocl/src/opencl/arithm_add_mask.cl     |  20 ++-
 modules/ocl/src/opencl/arithm_add_scalar.cl   |  63 ++++------
 .../ocl/src/opencl/arithm_add_scalar_mask.cl  |  20 ++-
 modules/ocl/test/test_arithm.cpp              |   2 +-
 7 files changed, 135 insertions(+), 135 deletions(-)

diff --git a/modules/ocl/src/arithm.cpp b/modules/ocl/src/arithm.cpp
index f34e0f730..09d250ae3 100644
--- a/modules/ocl/src/arithm.cpp
+++ b/modules/ocl/src/arithm.cpp
@@ -89,11 +89,11 @@ static void arithmetic_run_generic(const oclMat &src1, const oclMat &src2, const
     size_t localThreads[3]  = { 16, 16, 1 };
     size_t globalThreads[3] = { dst.cols, dst.rows, 1 };
 
-    std::string kernelName = op_type == ABS_DIFF ? "arithm_absdiff" : "arithm_binary_op";
+    std::string kernelName = "arithm_binary_op";
 
     const char * const typeMap[] = { "uchar", "char", "ushort", "short", "int", "float", "double" };
     const char * const WTypeMap[] = { "short", "short", "int", "int", "int", "float", "double" };
-    const char operationsMap[] = { '+', '-', '*', '/', '-' };
+    const char * const funcMap[] = { "FUNC_ADD", "FUNC_SUB", "FUNC_MUL", "FUNC_DIV", "FUNC_ABS_DIFF" };
     const char * const channelMap[] = { "", "", "2", "4", "4" };
     bool haveScalar = use_scalar || src2.empty();
 
@@ -105,12 +105,12 @@ static void arithmetic_run_generic(const oclMat &src1, const oclMat &src2, const
     else if (op_type == MUL)
         WDepth = hasDouble && (depth == CV_32S || depth == CV_64F) ? CV_64F : CV_32F;
 
-    std::string buildOptions = format("-D T=%s%s -D WT=%s%s -D convertToT=convert_%s%s%s -D Operation=%c"
-                                      " -D convertToWT=convert_%s%s",
+    std::string buildOptions = format("-D T=%s%s -D WT=%s%s -D convertToT=convert_%s%s%s -D %s "
+                                      "-D convertToWT=convert_%s%s",
                                       typeMap[depth], channelMap[oclChannels],
                                       WTypeMap[WDepth], channelMap[oclChannels],
                                       typeMap[depth], channelMap[oclChannels], (depth >= CV_32F ? "" : (depth == CV_32S ? "_rte" : "_sat_rte")),
-                                      operationsMap[op_type], WTypeMap[WDepth], channelMap[oclChannels]);
+                                      funcMap[op_type], WTypeMap[WDepth], channelMap[oclChannels]);
 
     vector<pair<size_t , const void *> > args;
     args.push_back( make_pair( sizeof(cl_mem), (void *)&src1.data ));
@@ -124,6 +124,9 @@ static void arithmetic_run_generic(const oclMat &src1, const oclMat &src2, const
         args.push_back( make_pair( sizeof(cl_int), (void *)&src2offset1 ));
 
         kernelName += "_mat";
+
+        if (haveScalar)
+            buildOptions += " -D HAVE_SCALAR";
     }
 
     if (haveScalar)
@@ -146,9 +149,6 @@ static void arithmetic_run_generic(const oclMat &src1, const oclMat &src2, const
         kernelName += "_mask";
     }
 
-    if (op_type == DIV)
-        kernelName += "_div";
-
     args.push_back( make_pair( sizeof(cl_mem), (void *)&dst.data ));
     args.push_back( make_pair( sizeof(cl_int), (void *)&dststep1 ));
     args.push_back( make_pair( sizeof(cl_int), (void *)&dstoffset1 ));
diff --git a/modules/ocl/src/matrix_operations.cpp b/modules/ocl/src/matrix_operations.cpp
index 80b2f7d81..3b0e41731 100644
--- a/modules/ocl/src/matrix_operations.cpp
+++ b/modules/ocl/src/matrix_operations.cpp
@@ -366,23 +366,23 @@ static void set_to_withoutmask_run(const oclMat &dst, const Scalar &scalar, stri
 #ifdef CL_VERSION_1_2
     // this enables backwards portability to
     // run on OpenCL 1.1 platform if library binaries are compiled with OpenCL 1.2 support
-    if (Context::getContext()->supportsFeature(FEATURE_CL_VER_1_2) &&
-        dst.offset == 0 && dst.cols == dst.wholecols)
-    {
-        const int sizeofMap[][7] =
-            {
-                { sizeof(cl_uchar) , sizeof(cl_char) , sizeof(cl_ushort) , sizeof(cl_short) , sizeof(cl_int) , sizeof(cl_float) , sizeof(cl_double)  },
-                { sizeof(cl_uchar2), sizeof(cl_char2), sizeof(cl_ushort2), sizeof(cl_short2), sizeof(cl_int2), sizeof(cl_float2), sizeof(cl_double2) },
-                { 0                , 0               , 0                 , 0                , 0              , 0                ,  0                 },
-                { sizeof(cl_uchar4), sizeof(cl_char4), sizeof(cl_ushort4), sizeof(cl_short4), sizeof(cl_int4), sizeof(cl_float4), sizeof(cl_double4) },
-            };
-        int sizeofGeneric = sizeofMap[dst.oclchannels() - 1][dst.depth()];
+//    if (Context::getContext()->supportsFeature(Context::CL_VER_1_2) &&
+//        dst.offset == 0 && dst.cols == dst.wholecols)
+//    {
+//        const int sizeofMap[][7] =
+//            {
+//                { sizeof(cl_uchar) , sizeof(cl_char) , sizeof(cl_ushort) , sizeof(cl_short) , sizeof(cl_int) , sizeof(cl_float) , sizeof(cl_double)  },
+//                { sizeof(cl_uchar2), sizeof(cl_char2), sizeof(cl_ushort2), sizeof(cl_short2), sizeof(cl_int2), sizeof(cl_float2), sizeof(cl_double2) },
+//                { 0                , 0               , 0                 , 0                , 0              , 0                ,  0                 },
+//                { sizeof(cl_uchar4), sizeof(cl_char4), sizeof(cl_ushort4), sizeof(cl_short4), sizeof(cl_int4), sizeof(cl_float4), sizeof(cl_double4) },
+//            };
+//        int sizeofGeneric = sizeofMap[dst.oclchannels() - 1][dst.depth()];
 
-        clEnqueueFillBuffer(getClCommandQueue(dst.clCxt),
-                            (cl_mem)dst.data, (void*)mat.data, sizeofGeneric,
-                            0, dst.step * dst.rows, 0, NULL, NULL);
-    }
-    else
+//        clEnqueueFillBuffer((cl_command_queue)dst.clCxt->oclCommandQueue(),
+//                            (cl_mem)dst.data, (void*)mat.data, sizeofGeneric,
+//                            0, dst.step * dst.rows, 0, NULL, NULL);
+//    }
+//    else
 #endif
     {
         oclMat m(mat);
diff --git a/modules/ocl/src/opencl/arithm_add.cl b/modules/ocl/src/opencl/arithm_add.cl
index 38834e766..40caba5a9 100644
--- a/modules/ocl/src/opencl/arithm_add.cl
+++ b/modules/ocl/src/opencl/arithm_add.cl
@@ -52,14 +52,50 @@
 #endif
 #endif
 
+#if defined (FUNC_ADD)
+#define EXPRESSION dst[dst_index] = convertToT(convertToWT(src1[src1_index]) + convertToWT(src2[src2_index]));
+#endif
+
+#if defined (FUNC_SUB)
+#define EXPRESSION dst[dst_index] = convertToT(convertToWT(src1[src1_index]) - convertToWT(src2[src2_index]));
+#endif
+
+#if defined (FUNC_MUL)
+#if defined (HAVE_SCALAR)
+#define EXPRESSION dst[dst_index] = convertToT(convertToWT(src1[src1_index]) * scalar[0] * convertToWT(src2[src2_index]));
+#else
+#define EXPRESSION dst[dst_index] = convertToT(convertToWT(src1[src1_index]) * convertToWT(src2[src2_index]));
+#endif
+#endif
+
+#if defined (FUNC_DIV)
+#if defined (HAVE_SCALAR)
+#define EXPRESSION T zero = (T)(0); \
+    dst[dst_index] = src2[src2_index] == zero ? zero : \
+    convertToT(convertToWT(src1[src1_index]) * scalar[0] / convertToWT(src2[src2_index]));
+#else
+#define EXPRESSION T zero = (T)(0); \
+    dst[dst_index] = src2[src2_index] == zero ? zero : \
+    convertToT(convertToWT(src1[src1_index]) / convertToWT(src2[src2_index]));
+#endif
+#endif
+
+#if defined (FUNC_ABS_DIFF)
+#define EXPRESSION WT value = convertToWT(src1[src1_index]) - convertToWT(src2[src2_index]); \
+    value = value > (WT)(0) ? value : -value; \
+    dst[dst_index] = convertToT(value);
+#endif
+
 //////////////////////////////////////////////////////////////////////////////////////////////////////
 ///////////////////////////////////////////// ADD ////////////////////////////////////////////////////
 //////////////////////////////////////////////////////////////////////////////////////////////////////
 
+#ifndef HAVE_SCALAR
+
 __kernel void arithm_binary_op_mat(__global T *src1, int src1_step, int src1_offset,
-                         __global T *src2, int src2_step, int src2_offset,
-                         __global T *dst, int dst_step, int dst_offset,
-                         int cols, int rows)
+                                   __global T *src2, int src2_step, int src2_offset,
+                                   __global T *dst, int dst_step, int dst_offset,
+                                   int cols, int rows)
 {
     int x = get_global_id(0);
     int y = get_global_id(1);
@@ -70,55 +106,18 @@ __kernel void arithm_binary_op_mat(__global T *src1, int src1_step, int src1_off
         int src2_index = mad24(y, src2_step, x + src2_offset);
         int dst_index  = mad24(y, dst_step, x + dst_offset);
 
-        dst[dst_index] = convertToT(convertToWT(src1[src1_index]) Operation convertToWT(src2[src2_index]));
+        EXPRESSION
     }
 }
 
-__kernel void arithm_binary_op_mat_div(__global T *src1, int src1_step, int src1_offset,
-                         __global T *src2, int src2_step, int src2_offset,
-                         __global T *dst, int dst_step, int dst_offset,
-                         int cols, int rows)
-{
-    int x = get_global_id(0);
-    int y = get_global_id(1);
+#else
 
-    if (x < cols && y < rows)
-    {
-        int src1_index = mad24(y, src1_step, x + src1_offset);
-        int src2_index = mad24(y, src2_step, x + src2_offset);
-        int dst_index  = mad24(y, dst_step, x + dst_offset);
-
-        T zero = (T)(0);
-        dst[dst_index] = src2[src2_index] == zero ? zero : convertToT(convertToWT(src1[src1_index]) / convertToWT(src2[src2_index]));
-    }
-}
-
-__kernel void arithm_absdiff_mat(__global T *src1, int src1_step, int src1_offset,
-                         __global T *src2, int src2_step, int src2_offset,
-                         __global T *dst, int dst_step, int dst_offset,
-                         int cols, int rows)
-{
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        int src1_index = mad24(y, src1_step, x + src1_offset);
-        int src2_index = mad24(y, src2_step, x + src2_offset);
-        int dst_index  = mad24(y, dst_step, x + dst_offset);
-
-        WT value = convertToWT(src1[src1_index]) - convertToWT(src2[src2_index]);
-        value = value > (WT)(0) ? value : -value;
-        dst[dst_index] = convertToT(value);
-    }
-}
-
-// add mat with scale for multiply
+// add mat with scale
 __kernel void arithm_binary_op_mat_scalar(__global T *src1, int src1_step, int src1_offset,
-                                __global T *src2, int src2_step, int src2_offset,
-                               __global WT *scalar,
-                               __global T *dst, int dst_step,  int dst_offset,
-                               int cols, int rows)
+                                          __global T *src2, int src2_step, int src2_offset,
+                                          __global WT *scalar,
+                                          __global T *dst, int dst_step,  int dst_offset,
+                                          int cols, int rows)
 {
     int x = get_global_id(0);
     int y = get_global_id(1);
@@ -129,28 +128,8 @@ __kernel void arithm_binary_op_mat_scalar(__global T *src1, int src1_step, int s
         int src2_index = mad24(y, src2_step, x + src2_offset);
         int dst_index = mad24(y, dst_step, x + dst_offset);
 
-        dst[dst_index] = convertToT(convertToWT(src1[src1_index]) * scalar[0] * convertToWT(src2[src2_index]));
+        EXPRESSION
     }
 }
 
-// add mat with scale for divide
-__kernel void arithm_binary_op_mat_scalar_div(__global T *src1, int src1_step, int src1_offset,
-                                __global T *src2, int src2_step, int src2_offset,
-                               __global WT *scalar,
-                               __global T *dst, int dst_step,  int dst_offset,
-                               int cols, int rows)
-{
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        int src1_index = mad24(y, src1_step, x + src1_offset);
-        int src2_index = mad24(y, src2_step, x + src2_offset);
-        int dst_index = mad24(y, dst_step, x + dst_offset);
-
-        T zero = (T)(0);
-        dst[dst_index] = src2[src2_index] == zero ? zero :
-            convertToT(convertToWT(src1[src1_index]) * scalar[0] / convertToWT(src2[src2_index]));
-    }
-}
+#endif
diff --git a/modules/ocl/src/opencl/arithm_add_mask.cl b/modules/ocl/src/opencl/arithm_add_mask.cl
index 52dbfc455..c3958bf1f 100644
--- a/modules/ocl/src/opencl/arithm_add_mask.cl
+++ b/modules/ocl/src/opencl/arithm_add_mask.cl
@@ -51,6 +51,24 @@
 #endif
 #endif
 
+#if defined (FUNC_ADD)
+#define EXPRESSION dst[dst_index] = convertToT(convertToWT(src1[src1_index]) + convertToWT(src2[src2_index]));
+#endif
+
+#if defined (FUNC_SUB)
+#define EXPRESSION dst[dst_index] = convertToT(convertToWT(src1[src1_index]) - convertToWT(src2[src2_index]));
+#endif
+
+#if defined (FUNC_MUL)
+#define EXPRESSION dst[dst_index] = convertToT(convertToWT(src1[src1_index]) * convertToWT(src2[src2_index]));
+#endif
+
+#if defined (FUNC_DIV)
+#define EXPRESSION T zero = (T)(0); \
+    dst[dst_index] = src2[src2_index] == zero ? zero : \
+    convertToT(convertToWT(src1[src1_index]) / convertToWT(src2[src2_index]));
+#endif
+
 //////////////////////////////////////////////////////////////////////////////////
 ///////////////////////////////// add with mask //////////////////////////////////
 //////////////////////////////////////////////////////////////////////////////////
@@ -73,7 +91,7 @@ __kernel void arithm_binary_op_mat_mask(__global T * src1, int src1_step, int sr
             int src2_index = mad24(y, src2_step, x + src2_offset);
             int dst_index  = mad24(y, dst_step, dst_offset + x);
 
-            dst[dst_index] = convertToT(convertToWT(src1[src1_index]) Operation convertToWT(src2[src2_index]));
+            EXPRESSION
         }
     }
 }
diff --git a/modules/ocl/src/opencl/arithm_add_scalar.cl b/modules/ocl/src/opencl/arithm_add_scalar.cl
index 4e0c7fc5f..4a0167fd5 100644
--- a/modules/ocl/src/opencl/arithm_add_scalar.cl
+++ b/modules/ocl/src/opencl/arithm_add_scalar.cl
@@ -51,6 +51,29 @@
 #endif
 #endif
 
+#if defined (FUNC_ADD)
+#define EXPRESSION dst[dst_index] = convertToT(convertToWT(src1[src1_index]) + scalar[0]);
+#endif
+
+#if defined (FUNC_SUB)
+#define EXPRESSION dst[dst_index] = convertToT(convertToWT(src1[src1_index]) - scalar[0]);
+#endif
+
+#if defined (FUNC_MUL)
+#define EXPRESSION dst[dst_index] = convertToT(convertToWT(src1[src1_index]) * scalar[0]);
+#endif
+
+#if defined (FUNC_DIV)
+#define EXPRESSION T zero = (T)(0); \
+    dst[dst_index] = src1[src1_index] == zero ? zero : convertToT(scalar[0] / convertToWT(src1[src1_index]));
+#endif
+
+#if defined (FUNC_ABS_DIFF)
+#define EXPRESSION WT value = convertToWT(src1[src1_index]) - scalar[0]; \
+    value = value > (WT)(0) ? value : -value; \
+    dst[dst_index] = convertToT(value);
+#endif
+
 ///////////////////////////////////////////////////////////////////////////////////
 ///////////////////////////////// Add with scalar /////////////////////////////////
 ///////////////////////////////////////////////////////////////////////////////////
@@ -68,44 +91,6 @@ __kernel void arithm_binary_op_scalar (__global T *src1, int src1_step, int src1
         int src1_index = mad24(y, src1_step, x + src1_offset);
         int dst_index = mad24(y, dst_step, x + dst_offset);
 
-        dst[dst_index] = convertToT(convertToWT(src1[src1_index]) Operation scalar[0]);
-    }
-}
-
-__kernel void arithm_absdiff_scalar(__global T *src1, int src1_step, int src1_offset,
-                         __global WT *src2,
-                         __global T *dst, int dst_step, int dst_offset,
-                         int cols, int rows)
-{
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        int src1_index = mad24(y, src1_step, x + src1_offset);
-        int dst_index  = mad24(y, dst_step, x + dst_offset);
-
-        WT value = convertToWT(src1[src1_index]) - src2[0];
-        value = value > (WT)(0) ? value : -value;
-        dst[dst_index] = convertToT(value);
-    }
-}
-
-// scalar divide to matrix
-__kernel void arithm_binary_op_scalar_div(__global T *src1, int src1_step, int src1_offset,
-                               __global WT *scalar,
-                               __global T *dst,  int dst_step,  int dst_offset,
-                               int cols, int rows)
-{
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        int src1_index = mad24(y, src1_step, x + src1_offset);
-        int dst_index = mad24(y, dst_step, x + dst_offset);
-
-        T zero = (T)(0);
-        dst[dst_index] = src1[src1_index] == zero ? zero : convertToT(scalar[0] / convertToWT(src1[src1_index]));
+        EXPRESSION
     }
 }
diff --git a/modules/ocl/src/opencl/arithm_add_scalar_mask.cl b/modules/ocl/src/opencl/arithm_add_scalar_mask.cl
index 5c3408034..d472b3cbf 100644
--- a/modules/ocl/src/opencl/arithm_add_scalar_mask.cl
+++ b/modules/ocl/src/opencl/arithm_add_scalar_mask.cl
@@ -51,6 +51,24 @@
 #endif
 #endif
 
+#if defined (FUNC_ADD)
+#define EXPRESSION dst[dst_index] = convertToT(convertToWT(src1[src1_index]) + scalar[0]);
+#endif
+
+#if defined (FUNC_SUB)
+#define EXPRESSION dst[dst_index] = convertToT(convertToWT(src1[src1_index]) - scalar[0]);
+#endif
+
+#if defined (FUNC_MUL)
+#define EXPRESSION dst[dst_index] = convertToT(convertToWT(src1[src1_index]) * scalar[0]);
+#endif
+
+#if defined (FUNC_DIV)
+#define EXPRESSION T zero = (T)(0); \
+    dst[dst_index] = src2[src2_index] == zero ? zero : \
+    convertToT(convertToWT(src1[src1_index]) / scalar[0]);
+#endif
+
 ///////////////////////////////////////////////////////////////////////////////////
 //////////////////////////// Add with scalar with mask ////////////////////////////
 ///////////////////////////////////////////////////////////////////////////////////
@@ -72,7 +90,7 @@ __kernel void arithm_binary_op_scalar_mask(__global T *src1, int src1_step, int
             int src1_index = mad24(y, src1_step, x + src1_offset);
             int dst_index = mad24(y, dst_step, dst_offset + x);
 
-            dst[dst_index] = convertToT(convertToWT(src1[src1_index]) Operation scalar[0]);
+            EXPRESSION
         }
     }
 }
diff --git a/modules/ocl/test/test_arithm.cpp b/modules/ocl/test/test_arithm.cpp
index db01d9503..f2f13ec41 100644
--- a/modules/ocl/test/test_arithm.cpp
+++ b/modules/ocl/test/test_arithm.cpp
@@ -535,7 +535,7 @@ TEST_P(Absdiff, Mat)
     }
 }
 
-TEST_P(Absdiff, Mat_Scalar)
+TEST_P(Absdiff, Scalar)
 {
     for (int j = 0; j < LOOP_TIMES; j++)
     {