From 1f7f9c96821267559da29a04a2ef23fd79faf072 Mon Sep 17 00:00:00 2001
From: Ilya Lavrenov <ilya.lavrenov@itseez.com>
Date: Mon, 28 Oct 2013 18:04:34 +0400
Subject: [PATCH] rewrote and generalized ocl::threshold

---
 modules/ocl/src/imgproc.cpp                 |  96 +++++++--------
 modules/ocl/src/opencl/imgproc_threshold.cl | 122 ++++----------------
 modules/ocl/test/test_imgproc.cpp           |   4 +-
 3 files changed, 69 insertions(+), 153 deletions(-)
diff --git a/modules/ocl/src/imgproc.cpp b/modules/ocl/src/imgproc.cpp
index a2c685496..8ae9c643d 100644
--- a/modules/ocl/src/imgproc.cpp
+++ b/modules/ocl/src/imgproc.cpp
@@ -98,80 +98,66 @@ namespace cv
         /////////////////////////////////////////////////////////////////////////////////////
         // threshold
 
-        typedef void (*gpuThresh_t)(const oclMat &src, oclMat &dst, double thresh, double maxVal, int type);
-
-        static void threshold_8u(const oclMat &src, oclMat &dst, double thresh, double maxVal, int type)
+        static std::vector<uchar> scalarToVector(const cv::Scalar & sc, int depth, int ocn, int cn)
         {
-            uchar thresh_uchar = cvFloor(thresh);
-            uchar max_val = cvRound(maxVal);
+            CV_Assert(ocn == cn || (ocn == 4 && cn == 3));
 
-            size_t cols = (dst.cols + (dst.offset % 16) + 15) / 16;
-            size_t bSizeX = 16, bSizeY = 16;
-            size_t gSizeX = cols % bSizeX == 0 ? cols : (cols + bSizeX - 1) / bSizeX * bSizeX;
-            size_t gSizeY = dst.rows;
-            size_t globalThreads[3] = {gSizeX, gSizeY, 1};
-            size_t localThreads[3] = {bSizeX, bSizeY, 1};
+            static const int sizeMap[] = { sizeof(uchar), sizeof(char), sizeof(ushort),
+                                       sizeof(short), sizeof(int), sizeof(float), sizeof(double) };
 
-            vector< pair<size_t, const void *> > args;
-            args.push_back( make_pair(sizeof(cl_mem), &src.data));
-            args.push_back( make_pair(sizeof(cl_mem), &dst.data));
-            args.push_back( make_pair(sizeof(cl_int), (void *)&src.offset));
-            args.push_back( make_pair(sizeof(cl_int), (void *)&src.step));
-            args.push_back( make_pair(sizeof(cl_int), (void *)&dst.offset));
-            args.push_back( make_pair(sizeof(cl_int), (void *)&dst.rows));
-            args.push_back( make_pair(sizeof(cl_int), (void *)&dst.cols));
-            args.push_back( make_pair(sizeof(cl_int), (void *)&dst.step));
-            args.push_back( make_pair(sizeof(cl_uchar), (void *)&thresh_uchar));
-            args.push_back( make_pair(sizeof(cl_uchar), (void *)&max_val));
-            args.push_back( make_pair(sizeof(cl_int), (void *)&type));
-            openCLExecuteKernel(src.clCxt, &imgproc_threshold, "threshold", globalThreads, localThreads, args, src.oclchannels(), src.depth());
+            int elemSize1 = sizeMap[depth];
+            int bufSize = elemSize1 * ocn;
+            std::vector<uchar> _buf(bufSize);
+            uchar * buf = &_buf[0];
+            scalarToRawData(sc, buf, CV_MAKE_TYPE(depth, cn));
+            memset(buf + elemSize1 * cn, 0, (ocn - cn) * elemSize1);
+
+            return _buf;
         }
 
-        static void threshold_32f(const oclMat &src, oclMat &dst, double thresh, double maxVal, int type)
+        static void threshold_runner(const oclMat &src, oclMat &dst, double thresh, double maxVal, int thresholdType)
         {
-            float thresh_f = thresh;
-            float max_val = maxVal;
-            int dst_offset = (dst.offset >> 2);
-            int dst_step = (dst.step >> 2);
-            int src_offset = (src.offset >> 2);
-            int src_step = (src.step >> 2);
+            bool ival = src.depth() < CV_32F;
+            std::vector<uchar> thresholdValue = scalarToVector(cv::Scalar::all(ival ? cvFloor(thresh) : thresh), dst.depth(),
+                                                               dst.oclchannels(), dst.channels());
+            std::vector<uchar> maxValue = scalarToVector(cv::Scalar::all(maxVal), dst.depth(), dst.oclchannels(), dst.channels());
 
-            size_t cols = (dst.cols + (dst_offset & 3) + 3) / 4;
-            size_t bSizeX = 16, bSizeY = 16;
-            size_t gSizeX = cols % bSizeX == 0 ? cols : (cols + bSizeX - 1) / bSizeX * bSizeX;
-            size_t gSizeY = dst.rows;
-            size_t globalThreads[3] = {gSizeX, gSizeY, 1};
-            size_t localThreads[3] = {bSizeX, bSizeY, 1};
+            size_t localThreads[3] = { 16, 16, 1 };
+            size_t globalThreads[3] = { dst.cols, dst.rows, 1 };
+
+            const char * const thresholdMap[] = { "THRESH_BINARY", "THRESH_BINARY_INV", "THRESH_TRUNC",
+                                                  "THRESH_TOZERO", "THRESH_TOZERO_INV" };
+            const char * const channelMap[] = { "", "", "2", "4", "4" };
+            const char * const typeMap[] = { "uchar", "char", "ushort", "short", "int", "float", "double" };
+            std::string buildOptions = format("-D T=%s%s -D %s", typeMap[src.depth()], channelMap[src.channels()],
+                                              thresholdMap[thresholdType]);
+
+            int src_step = src.step / src.elemSize(), src_offset = src.offset / src.elemSize();
+            int dst_step = dst.step / dst.elemSize(), dst_offset = dst.offset / dst.elemSize();
 
             vector< pair<size_t, const void *> > args;
-            args.push_back( make_pair(sizeof(cl_mem), &src.data));
-            args.push_back( make_pair(sizeof(cl_mem), &dst.data));
+            args.push_back( make_pair(sizeof(cl_mem), (void *)&src.data));
             args.push_back( make_pair(sizeof(cl_int), (void *)&src_offset));
             args.push_back( make_pair(sizeof(cl_int), (void *)&src_step));
+            args.push_back( make_pair(sizeof(cl_mem), (void *)&dst.data));
             args.push_back( make_pair(sizeof(cl_int), (void *)&dst_offset));
+            args.push_back( make_pair(sizeof(cl_int), (void *)&dst_step));
             args.push_back( make_pair(sizeof(cl_int), (void *)&dst.rows));
             args.push_back( make_pair(sizeof(cl_int), (void *)&dst.cols));
-            args.push_back( make_pair(sizeof(cl_int), (void *)&dst_step));
-            args.push_back( make_pair(sizeof(cl_float), (void *)&thresh_f));
-            args.push_back( make_pair(sizeof(cl_float), (void *)&max_val));
-            args.push_back( make_pair(sizeof(cl_int), (void *)&type));
-
-            openCLExecuteKernel(src.clCxt, &imgproc_threshold, "threshold", globalThreads, localThreads, args, src.oclchannels(), src.depth());
+            args.push_back( make_pair(thresholdValue.size(), (void *)&thresholdValue[0]));
+            args.push_back( make_pair(maxValue.size(), (void *)&maxValue[0]));
 
+            openCLExecuteKernel(src.clCxt, &imgproc_threshold, "threshold", globalThreads, localThreads, args,
+                                -1, -1, buildOptions.c_str());
         }
 
-        // threshold: support 8UC1 and 32FC1 data type and five threshold type
-        double threshold(const oclMat &src, oclMat &dst, double thresh, double maxVal, int type)
+        double threshold(const oclMat &src, oclMat &dst, double thresh, double maxVal, int thresholdType)
         {
-            //TODO: These limitations shall be removed later.
-            CV_Assert(src.type() == CV_8UC1 || src.type() == CV_32FC1);
-            CV_Assert(type == THRESH_BINARY || type == THRESH_BINARY_INV || type == THRESH_TRUNC
-                      || type == THRESH_TOZERO || type == THRESH_TOZERO_INV );
+            CV_Assert(thresholdType == THRESH_BINARY || thresholdType == THRESH_BINARY_INV || thresholdType == THRESH_TRUNC
+                      || thresholdType == THRESH_TOZERO || thresholdType == THRESH_TOZERO_INV);
 
-            static const gpuThresh_t gpuThresh_callers[2] = {threshold_8u, threshold_32f};
-
-            dst.create( src.size(), src.type() );
-            gpuThresh_callers[(src.type() == CV_32FC1)](src, dst, thresh, maxVal, type);
+            dst.create(src.size(), src.type());
+            threshold_runner(src, dst, thresh, maxVal, thresholdType);
 
             return thresh;
         }
diff --git a/modules/ocl/src/opencl/imgproc_threshold.cl b/modules/ocl/src/opencl/imgproc_threshold.cl
index 8d7c77e1f..81f2a7400 100644
--- a/modules/ocl/src/opencl/imgproc_threshold.cl
+++ b/modules/ocl/src/opencl/imgproc_threshold.cl
@@ -44,109 +44,37 @@
 //M*/
 
 #if defined (DOUBLE_SUPPORT)
+#ifdef cl_amd_fp64
+#pragma OPENCL EXTENSION cl_amd_fp64:enable
+#elif defined (cl_khr_fp64)
 #pragma OPENCL EXTENSION cl_khr_fp64:enable
 #endif
+#endif
 
-// threshold type:
-// enum { THRESH_BINARY=0, THRESH_BINARY_INV=1, THRESH_TRUNC=2, THRESH_TOZERO=3,
-//       THRESH_TOZERO_INV=4, THRESH_MASK=7, THRESH_OTSU=8 };
-
-__kernel void threshold_C1_D0(__global const uchar * restrict src, __global uchar *dst,
-                              int src_offset, int src_step,
-                              int dst_offset, int dst_rows, int dst_cols, int dst_step,
-                              uchar thresh, uchar max_val, int thresh_type
-                              )
+__kernel void threshold(__global const T * restrict src, int src_offset, int src_step,
+                        __global T * dst, int dst_offset, int dst_step,
+                        int rows, int cols, T thresh, T max_val)
 {
     int gx = get_global_id(0);
-    const int gy = get_global_id(1);
+    int gy = get_global_id(1);
 
-    int offset = (dst_offset & 15);
-    src_offset -= offset;
-
-    int dstart = (gx << 4) - offset;
-    if(dstart < dst_cols && gy < dst_rows)
+    if (gx < cols && gy < rows)
     {
-        uchar16 sdata = vload16(gx, src+src_offset+gy*src_step);
-        uchar16 ddata;
-        uchar16 zero = 0;
-        switch (thresh_type)
-        {
-            case 0:
-                ddata = ((sdata > thresh) ) ? (uchar16)(max_val) : (uchar16)(0);
-                break;
-            case 1:
-                ddata = ((sdata > thresh)) ? zero  : (uchar16)(max_val);
-                break;
-            case 2:
-                ddata = ((sdata > thresh)) ? (uchar16)(thresh) : sdata;
-                break;
-            case 3:
-                ddata = ((sdata > thresh)) ? sdata : zero;
-                break;
-            case 4:
-                ddata = ((sdata > thresh)) ? zero : sdata;
-                break;
-            default:
-                ddata = sdata;
-        }
-        int16 dpos = (int16)(dstart, dstart+1, dstart+2, dstart+3, dstart+4, dstart+5, dstart+6, dstart+7, dstart+8,
-                             dstart+9, dstart+10, dstart+11, dstart+12, dstart+13, dstart+14, dstart+15);
-        uchar16 dVal = *(__global uchar16*)(dst+dst_offset+gy*dst_step+dstart);
-        int16 con = dpos >= 0 && dpos < dst_cols;
-        ddata = convert_uchar16(con != 0) ? ddata : dVal;
-        if(dstart < dst_cols)
-        {
-            *(__global uchar16*)(dst+dst_offset+gy*dst_step+dstart) = ddata;
-        }
-    }
-}
-
-
-__kernel void threshold_C1_D5(__global const float * restrict src, __global float *dst,
-                              int src_offset, int src_step,
-                              int dst_offset, int dst_rows, int dst_cols, int dst_step,
-                              float thresh, float max_val, int thresh_type
-                              )
-{
-    const int gx = get_global_id(0);
-    const int gy = get_global_id(1);
-
-    int offset = (dst_offset & 3);
-    src_offset -= offset;
-
-    int dstart = (gx << 2) - offset;
-    if(dstart < dst_cols && gy < dst_rows)
-    {
-        float4 sdata = vload4(gx, src+src_offset+gy*src_step);
-        float4 ddata;
-        float4 zero = 0;
-        switch (thresh_type)
-        {
-            case 0:
-                ddata = sdata > thresh ? (float4)(max_val) : (float4)(0.f);
-                break;
-            case 1:
-                ddata = sdata > thresh ? zero : (float4)max_val;
-                break;
-            case 2:
-                ddata = sdata > thresh ? (float4)thresh : sdata;
-                break;
-            case 3:
-                ddata = sdata > thresh ? sdata : (float4)(0.f);
-                break;
-            case 4:
-                ddata = sdata > thresh ? (float4)(0.f) : sdata;
-                break;
-            default:
-                ddata = sdata;
-        }
-        int4 dpos = (int4)(dstart, dstart+1, dstart+2, dstart+3);
-        float4 dVal = *(__global float4*)(dst+dst_offset+gy*dst_step+dstart);
-        int4 con = dpos >= 0 && dpos < dst_cols;
-        ddata = convert_float4(con) != (float4)(0) ? ddata : dVal;
-        if(dstart < dst_cols)
-        {
-            *(__global float4*)(dst+dst_offset+gy*dst_step+dstart) = ddata;
-        }
+        int src_index = mad24(gy, src_step, src_offset + gx);
+        int dst_index = mad24(gy, dst_step, dst_offset + gx);
+
+        T sdata = src[src_index], zero = (T)(0);
+
+#ifdef THRESH_BINARY
+        dst[dst_index] = sdata > thresh ? max_val : zero;
+#elif defined THRESH_BINARY_INV
+        dst[dst_index] = sdata > thresh ? zero : max_val;
+#elif defined THRESH_TRUNC
+        dst[dst_index] = sdata > thresh ? thresh : sdata;
+#elif defined THRESH_TOZERO
+        dst[dst_index] = sdata > thresh ? sdata : zero;
+#elif defined THRESH_TOZERO_INV
+        dst[dst_index] = sdata > thresh ? zero : sdata;
+#endif
     }
 }
diff --git a/modules/ocl/test/test_imgproc.cpp b/modules/ocl/test/test_imgproc.cpp
index eb983fb17..c37f0377a 100644
--- a/modules/ocl/test/test_imgproc.cpp
+++ b/modules/ocl/test/test_imgproc.cpp
@@ -502,7 +502,9 @@ INSTANTIATE_TEST_CASE_P(Imgproc, Integral, Combine(
                             Bool()));
 
 INSTANTIATE_TEST_CASE_P(Imgproc, Threshold, Combine(
-                            Values(CV_8UC1, CV_32FC1),
+                            Values(CV_8UC1, CV_8UC2, CV_8UC3, CV_8UC4,
+                                   CV_16SC1, CV_16SC2, CV_16SC3, CV_16SC4,
+                                   CV_32FC1, CV_32FC2, CV_32FC3, CV_32FC4),
                             Values(0),
                             Values(ThreshOp(THRESH_BINARY),
                                    ThreshOp(THRESH_BINARY_INV), ThreshOp(THRESH_TRUNC),