From 47b092e5277119d96f4dd64b076e0c91ceaf78a9 Mon Sep 17 00:00:00 2001
From: vbystricky <user@user-pc.(none)>
Date: Thu, 15 May 2014 13:08:17 +0400
Subject: [PATCH 1/9] Optimize OpenCL LUT function

---
 modules/core/src/convert.cpp   |  27 +++++--
 modules/core/src/opencl/lut.cl | 129 +++++++++++++++++++++++++++++++--
 2 files changed, 143 insertions(+), 13 deletions(-)

diff --git a/modules/core/src/convert.cpp b/modules/core/src/convert.cpp
index d88e42279..43fdc00b2 100644
--- a/modules/core/src/convert.cpp
+++ b/modules/core/src/convert.cpp
@@ -1544,22 +1544,33 @@ static LUTFunc lutTab[] =
 static bool ocl_LUT(InputArray _src, InputArray _lut, OutputArray _dst)
 {
     int lcn = _lut.channels(), dcn = _src.channels(), ddepth = _lut.depth();
+    int sdepth = _src.depth();
 
     UMat src = _src.getUMat(), lut = _lut.getUMat();
-    _dst.create(src.size(), CV_MAKETYPE(ddepth, dcn));
+    int dtype = CV_MAKETYPE(ddepth, dcn);
+    _dst.create(src.size(), dtype);
     UMat dst = _dst.getUMat();
 
-    ocl::Kernel k("LUT", ocl::core::lut_oclsrc,
-                  format("-D dcn=%d -D lcn=%d -D srcT=%s -D dstT=%s", dcn, lcn,
-                         ocl::typeToStr(src.depth()), ocl::memopTypeToStr(ddepth)));
-    if (k.empty())
+    size_t globalSize[2] = { dst.cols, dst.rows / 2};
+
+    cv::String build_opt = format("-D dcn=%d -D lcn=%d -D srcT=%s -D dstT=%s", dcn, lcn,
+                         ocl::typeToStr(sdepth), ocl::memopTypeToStr(ddepth)
+                         );
+
+    ocl::Kernel kernel;
+    if ((4 == lcn) && (CV_8U == sdepth))
+        kernel.create("LUTC4", ocl::core::lut_oclsrc, build_opt);
+    else if ((3 == lcn) && (CV_8U == sdepth))
+        kernel.create("LUTC3", ocl::core::lut_oclsrc, build_opt);
+    else
+        kernel.create("LUT", ocl::core::lut_oclsrc, build_opt);
+    if (kernel.empty())
         return false;
 
-    k.args(ocl::KernelArg::ReadOnlyNoSize(src), ocl::KernelArg::ReadOnlyNoSize(lut),
+    kernel.args(ocl::KernelArg::ReadOnlyNoSize(src), ocl::KernelArg::ReadOnlyNoSize(lut),
            ocl::KernelArg::WriteOnly(dst));
 
-    size_t globalSize[2] = { dst.cols, dst.rows };
-    return k.run(2, globalSize, NULL, false);
+    return kernel.run(2, globalSize, NULL, true);
 }
 
 #endif
diff --git a/modules/core/src/opencl/lut.cl b/modules/core/src/opencl/lut.cl
index da92c2f34..9b0606145 100644
--- a/modules/core/src/opencl/lut.cl
+++ b/modules/core/src/opencl/lut.cl
@@ -34,30 +34,149 @@
 //
 //
 
-__kernel void LUT(__global const uchar * srcptr, int src_step, int src_offset,
+__kernel void LUTC4(__global const uchar * srcptr, int src_step, int src_offset,
                   __global const uchar * lutptr, int lut_step, int lut_offset,
                   __global uchar * dstptr, int dst_step, int dst_offset, int rows, int cols)
 {
     int x = get_global_id(0);
-    int y = get_global_id(1);
+    int y = 2 * get_global_id(1);
+
+    __global const dstT * lut = (__global const dstT *)(lutptr + lut_offset);
+
+    __local dstT lut_l[256 * lcn];
+    int init = mad24(get_local_id(1), get_local_size(0), get_local_id(0));
+    int step = get_local_size(0) * get_local_size(1);
+
+    for (int i = init; i < 256 * lcn; i += step)
+    {
+        lut_l[i + 0] = lut[i + 0];
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
 
     if (x < cols && y < rows)
     {
         int src_index = mad24(y, src_step, mad24(x, (int)sizeof(srcT) * dcn, src_offset));
         int dst_index = mad24(y, dst_step, mad24(x, (int)sizeof(dstT) * dcn, dst_offset));
 
+        __global const uchar4 * src = (__global const uchar4 *)(srcptr + src_index);
+        int4 idx = convert_int4(src[0]) * lcn + (int4)(0, 1, 2, 3);
+        __global dstT * dst = (__global dstT *)(dstptr + dst_index);
+
+        dst[0] = lut_l[idx.x];
+        dst[1] = lut_l[idx.y];
+        dst[2] = lut_l[idx.z];
+        dst[3] = lut_l[idx.w];
+
+        if (y < rows - 1)
+        {
+            src = (__global const uchar4 *)(srcptr + src_index + src_step);
+            idx = convert_int4(src[0]) * lcn + (int4)(0, 1, 2, 3);
+            dst = (__global dstT *)(dstptr + dst_index + dst_step);
+
+            dst[0] = lut_l[idx.x];
+            dst[1] = lut_l[idx.y];
+            dst[2] = lut_l[idx.z];
+            dst[3] = lut_l[idx.w];
+        }
+    }
+}
+
+__kernel void LUTC3(__global const uchar * srcptr, int src_step, int src_offset,
+                  __global const uchar * lutptr, int lut_step, int lut_offset,
+                  __global uchar * dstptr, int dst_step, int dst_offset, int rows, int cols)
+{
+    int x = get_global_id(0);
+    int y = 2 * get_global_id(1);
+
+    __global const dstT * lut = (__global const dstT *)(lutptr + lut_offset);
+
+    __local dstT lut_l[256 * lcn];
+    int init = mad24(get_local_id(1), get_local_size(0), get_local_id(0));
+    int step = get_local_size(0) * get_local_size(1);
+
+    for (int i = init; i < 256 * lcn; i += step)
+    {
+        lut_l[i + 0] = lut[i + 0];
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    if (x < cols && y < rows)
+    {
+        int src_index = mad24(y, src_step, mad24(x, (int)sizeof(srcT) * dcn, src_offset));
+        int dst_index = mad24(y, dst_step, mad24(x, (int)sizeof(dstT) * dcn, dst_offset));
+
+        uchar3 src_pixel = vload3(0, (__global const uchar *)(srcptr + src_index));
+        int3 idx = convert_int3(src_pixel) * lcn + (int3)(0, 1, 2);
+        __global dstT * dst = (__global dstT *)(dstptr + dst_index);
+
+        dst[0] = lut_l[idx.x];
+        dst[1] = lut_l[idx.y];
+        dst[2] = lut_l[idx.z];
+        if (y < rows - 1)
+        {
+            uchar3 src_pixel = vload3(0, (__global const uchar *)(srcptr + src_index + src_step));
+            idx = convert_int3(src_pixel) * lcn + (int3)(0, 1, 2);
+            dst = (__global dstT *)(dstptr + dst_index + dst_step);
+
+            dst[0] = lut_l[idx.x];
+            dst[1] = lut_l[idx.y];
+            dst[2] = lut_l[idx.z];
+        }
+    }
+}
+
+__kernel void LUT(__global const uchar * srcptr, int src_step, int src_offset,
+                  __global const uchar * lutptr, int lut_step, int lut_offset,
+                  __global uchar * dstptr, int dst_step, int dst_offset, int rows, int cols)
+{
+    __global const dstT * lut = (__global const dstT *)(lutptr + lut_offset);
+
+    __local dstT lut_l[256 * lcn];
+    int init = mad24(get_local_id(1), get_local_size(0), get_local_id(0));
+    int step = get_local_size(0) * get_local_size(1);
+
+    for (int i = init; i < 256 * lcn; i += step)
+    {
+        lut_l[i + 0] = lut[i + 0];
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    int x = get_global_id(0);
+    int y = 2 * get_global_id(1);
+
+    if (x < cols && y < rows)
+    {
+        int src_index = mad24(y, src_step, mad24(x, (int)sizeof(srcT) * dcn, src_offset));
+
         __global const srcT * src = (__global const srcT *)(srcptr + src_index);
         __global const dstT * lut = (__global const dstT *)(lutptr + lut_offset);
+
+        int dst_index = mad24(y, dst_step, mad24(x, (int)sizeof(dstT) * dcn, dst_offset));
         __global dstT * dst = (__global dstT *)(dstptr + dst_index);
 
 #if lcn == 1
         #pragma unroll
         for (int cn = 0; cn < dcn; ++cn)
-            dst[cn] = lut[src[cn]];
-#else
+            dst[cn] = lut_l[src[cn]];
+#else //lcn == scn == dcn
         #pragma unroll
         for (int cn = 0; cn < dcn; ++cn)
-            dst[cn] = lut[mad24(src[cn], dcn, cn)];
+            dst[cn] = lut_l[mad24(src[cn], lcn, cn)];
 #endif
+        if (y < rows - 1)
+        {
+            src = (__global const srcT *)(srcptr + src_index + src_step);
+            dst = (__global dstT *)(dstptr + dst_index + dst_step);
+
+#if lcn == 1
+            #pragma unroll
+            for (int cn = 0; cn < dcn; ++cn)
+                dst[cn] = lut_l[src[cn]];
+#else //lcn == scn == dcn
+            #pragma unroll
+            for (int cn = 0; cn < dcn; ++cn)
+                dst[cn] = lut_l[mad24(src[cn], lcn, cn)];
+#endif
+        }
     }
 }

From 72727111c71404d6597617cca490ea610b7a71e3 Mon Sep 17 00:00:00 2001
From: vbystricky <user@user-pc.(none)>
Date: Fri, 16 May 2014 19:11:58 +0400
Subject: [PATCH 2/9] Use 4 pixels for one unit. Some ocl code refactoring

---
 modules/core/src/convert.cpp   |  23 ++--
 modules/core/src/opencl/lut.cl | 219 ++++++++++++++-------------------
 2 files changed, 102 insertions(+), 140 deletions(-)

diff --git a/modules/core/src/convert.cpp b/modules/core/src/convert.cpp
index 43fdc00b2..6e474d951 100644
--- a/modules/core/src/convert.cpp
+++ b/modules/core/src/convert.cpp
@@ -1547,30 +1547,21 @@ static bool ocl_LUT(InputArray _src, InputArray _lut, OutputArray _dst)
     int sdepth = _src.depth();
 
     UMat src = _src.getUMat(), lut = _lut.getUMat();
-    int dtype = CV_MAKETYPE(ddepth, dcn);
-    _dst.create(src.size(), dtype);
+    _dst.create(src.size(), CV_MAKETYPE(ddepth, dcn));
     UMat dst = _dst.getUMat();
 
-    size_t globalSize[2] = { dst.cols, dst.rows / 2};
+    size_t globalSize[2] = { dst.cols, (dst.rows + 3) / 4};
 
-    cv::String build_opt = format("-D dcn=%d -D lcn=%d -D srcT=%s -D dstT=%s", dcn, lcn,
+    ocl::Kernel k("LUT", ocl::core::lut_oclsrc, format("-D dcn=%d -D lcn=%d -D srcT=%s -D dstT=%s", dcn, lcn,
                          ocl::typeToStr(sdepth), ocl::memopTypeToStr(ddepth)
-                         );
-
-    ocl::Kernel kernel;
-    if ((4 == lcn) && (CV_8U == sdepth))
-        kernel.create("LUTC4", ocl::core::lut_oclsrc, build_opt);
-    else if ((3 == lcn) && (CV_8U == sdepth))
-        kernel.create("LUTC3", ocl::core::lut_oclsrc, build_opt);
-    else
-        kernel.create("LUT", ocl::core::lut_oclsrc, build_opt);
-    if (kernel.empty())
+                         ));
+    if (k.empty())
         return false;
 
-    kernel.args(ocl::KernelArg::ReadOnlyNoSize(src), ocl::KernelArg::ReadOnlyNoSize(lut),
+    k.args(ocl::KernelArg::ReadOnlyNoSize(src), ocl::KernelArg::ReadOnlyNoSize(lut),
            ocl::KernelArg::WriteOnly(dst));
 
-    return kernel.run(2, globalSize, NULL, true);
+    return k.run(2, globalSize, NULL, false);
 }
 
 #endif
diff --git a/modules/core/src/opencl/lut.cl b/modules/core/src/opencl/lut.cl
index 9b0606145..27428ed2b 100644
--- a/modules/core/src/opencl/lut.cl
+++ b/modules/core/src/opencl/lut.cl
@@ -34,149 +34,120 @@
 //
 //
 
-__kernel void LUTC4(__global const uchar * srcptr, int src_step, int src_offset,
-                  __global const uchar * lutptr, int lut_step, int lut_offset,
-                  __global uchar * dstptr, int dst_step, int dst_offset, int rows, int cols)
-{
-    int x = get_global_id(0);
-    int y = 2 * get_global_id(1);
-
-    __global const dstT * lut = (__global const dstT *)(lutptr + lut_offset);
-
-    __local dstT lut_l[256 * lcn];
-    int init = mad24(get_local_id(1), get_local_size(0), get_local_id(0));
-    int step = get_local_size(0) * get_local_size(1);
-
-    for (int i = init; i < 256 * lcn; i += step)
-    {
-        lut_l[i + 0] = lut[i + 0];
-    }
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-    if (x < cols && y < rows)
-    {
-        int src_index = mad24(y, src_step, mad24(x, (int)sizeof(srcT) * dcn, src_offset));
-        int dst_index = mad24(y, dst_step, mad24(x, (int)sizeof(dstT) * dcn, dst_offset));
-
-        __global const uchar4 * src = (__global const uchar4 *)(srcptr + src_index);
-        int4 idx = convert_int4(src[0]) * lcn + (int4)(0, 1, 2, 3);
-        __global dstT * dst = (__global dstT *)(dstptr + dst_index);
-
-        dst[0] = lut_l[idx.x];
-        dst[1] = lut_l[idx.y];
-        dst[2] = lut_l[idx.z];
-        dst[3] = lut_l[idx.w];
-
-        if (y < rows - 1)
-        {
-            src = (__global const uchar4 *)(srcptr + src_index + src_step);
-            idx = convert_int4(src[0]) * lcn + (int4)(0, 1, 2, 3);
-            dst = (__global dstT *)(dstptr + dst_index + dst_step);
-
-            dst[0] = lut_l[idx.x];
-            dst[1] = lut_l[idx.y];
-            dst[2] = lut_l[idx.z];
+#if lcn == 1
+    #if dcn == 4
+        #define LUT_OP(num)\
+            uchar4 idx = vload4(0, (__global const uchar *)(srcptr + src_index  + num * src_step));\
+            dst = (__global dstT *)(dstptr + dst_index + num * dst_step);\
+            dst[0] = lut_l[idx.x];\
+            dst[1] = lut_l[idx.y];\
+            dst[2] = lut_l[idx.z];\
             dst[3] = lut_l[idx.w];
-        }
-    }
-}
-
-__kernel void LUTC3(__global const uchar * srcptr, int src_step, int src_offset,
-                  __global const uchar * lutptr, int lut_step, int lut_offset,
-                  __global uchar * dstptr, int dst_step, int dst_offset, int rows, int cols)
-{
-    int x = get_global_id(0);
-    int y = 2 * get_global_id(1);
-
-    __global const dstT * lut = (__global const dstT *)(lutptr + lut_offset);
-
-    __local dstT lut_l[256 * lcn];
-    int init = mad24(get_local_id(1), get_local_size(0), get_local_id(0));
-    int step = get_local_size(0) * get_local_size(1);
-
-    for (int i = init; i < 256 * lcn; i += step)
-    {
-        lut_l[i + 0] = lut[i + 0];
-    }
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-    if (x < cols && y < rows)
-    {
-        int src_index = mad24(y, src_step, mad24(x, (int)sizeof(srcT) * dcn, src_offset));
-        int dst_index = mad24(y, dst_step, mad24(x, (int)sizeof(dstT) * dcn, dst_offset));
-
-        uchar3 src_pixel = vload3(0, (__global const uchar *)(srcptr + src_index));
-        int3 idx = convert_int3(src_pixel) * lcn + (int3)(0, 1, 2);
-        __global dstT * dst = (__global dstT *)(dstptr + dst_index);
-
-        dst[0] = lut_l[idx.x];
-        dst[1] = lut_l[idx.y];
-        dst[2] = lut_l[idx.z];
-        if (y < rows - 1)
-        {
-            uchar3 src_pixel = vload3(0, (__global const uchar *)(srcptr + src_index + src_step));
-            idx = convert_int3(src_pixel) * lcn + (int3)(0, 1, 2);
-            dst = (__global dstT *)(dstptr + dst_index + dst_step);
-
-            dst[0] = lut_l[idx.x];
-            dst[1] = lut_l[idx.y];
+    #elif dcn == 3
+        #define LUT_OP(num)\
+            uchar3 idx = vload3(0, (__global const uchar *)(srcptr + src_index  + num * src_step));\
+            dst = (__global dstT *)(dstptr + dst_index + num * dst_step);\
+            dst[0] = lut_l[idx.x];\
+            dst[1] = lut_l[idx.y];\
             dst[2] = lut_l[idx.z];
-        }
+    #elif dcn == 2
+        #define LUT_OP(num)\
+            uchar2 idx = vload2(0, (__global const uchar *)(srcptr + src_index  + num * src_step));\
+            dst = (__global dstT *)(dstptr + dst_index + num * dst_step);\
+            dst[0] = lut_l[idx.x];\
+            dst[1] = lut_l[idx.y];
+    #elif dcn == 1
+        #define LUT_OP(num)\
+            uchar idx = (__global const uchar *)(srcptr + src_index  + num * src_step)[0];\
+            dst = (__global dstT *)(dstptr + dst_index + num * dst_step);\
+            dst[0] = lut_l[idx];
+    #else
+        #define LUT_OP(num)\
+            src = (__global const srcT *)(srcptr + src_index + num * src_step);\
+            dst = (__global dstT *)(dstptr + dst_index + num * dst_step);\
+            for (int cn = 0; cn < dcn; ++cn)\
+                dst[cn] = lut_l[src[cn]];
+    #endif
+#else
+    #if dcn == 4
+        #define LUT_OP(num)\
+            uchar4 src_pixel = vload4(0, (__global const uchar *)(srcptr + src_index  + num * src_step));\
+            int4 idx = convert_int4(src_pixel) * lcn + (int4)(0, 1, 2, 3);\
+            dst = (__global dstT *)(dstptr + dst_index + num * dst_step);\
+            dst[0] = lut_l[idx.x];\
+            dst[1] = lut_l[idx.y];\
+            dst[2] = lut_l[idx.z];\
+            dst[3] = lut_l[idx.w];
+    #elif dcn == 3
+        #define LUT_OP(num)\
+            uchar3 src_pixel = vload3(0, (__global const uchar *)(srcptr + src_index  + num * src_step));\
+            int3 idx = convert_int3(src_pixel) * lcn + (int3)(0, 1, 2);\
+            dst = (__global dstT *)(dstptr + dst_index + num * dst_step);\
+            dst[0] = lut_l[idx.x];\
+            dst[1] = lut_l[idx.y];\
+            dst[2] = lut_l[idx.z];
+    #elif dcn == 2
+        #define LUT_OP(num)\
+            uchar2 src_pixel = vload2(0, (__global const uchar *)(srcptr + src_index  + num * src_step));\
+            int2 idx = convert_int2(src_pixel) * lcn + (int2)(0, 1);\
+            dst = (__global dstT *)(dstptr + dst_index + num * dst_step);\
+            dst[0] = lut_l[idx.x];\
+            dst[1] = lut_l[idx.y];
+    #elif dcn == 1 //error case (1 < lcn) ==> lcn == scn == dcn
+        #define LUT_OP(num)\
+            uchar idx = (__global const uchar *)(srcptr + src_index  + num * src_step)[0];\
+            dst = (__global dstT *)(dstptr + dst_index + num * dst_step);\
+            dst[0] = lut_l[idx];
+    #else
+        #define LUT_OP(num)\
+            src = (__global const srcT *)(srcptr + src_index + num * src_step);\
+            dst = (__global dstT *)(dstptr + dst_index + num * dst_step);\
+            for (int cn = 0; cn < dcn; ++cn)\
+                dst[cn] = lut_l[mad24(src[cn], lcn, cn)];
+    #endif
+#endif
+
+#define LOCAL_LUT_INIT\
+    {\
+        __global const dstT * lut = (__global const dstT *)(lutptr + lut_offset);\
+        int init = mad24((int)get_local_id(1), (int)get_local_size(0), (int)get_local_id(0));\
+        int step = get_local_size(0) * get_local_size(1);\
+        for (int i = init; i < 256 * lcn; i += step)\
+        {\
+            lut_l[i] = lut[i];\
+        }\
+        barrier(CLK_LOCAL_MEM_FENCE);\
     }
-}
 
 __kernel void LUT(__global const uchar * srcptr, int src_step, int src_offset,
                   __global const uchar * lutptr, int lut_step, int lut_offset,
                   __global uchar * dstptr, int dst_step, int dst_offset, int rows, int cols)
 {
-    __global const dstT * lut = (__global const dstT *)(lutptr + lut_offset);
-
     __local dstT lut_l[256 * lcn];
-    int init = mad24(get_local_id(1), get_local_size(0), get_local_id(0));
-    int step = get_local_size(0) * get_local_size(1);
-
-    for (int i = init; i < 256 * lcn; i += step)
-    {
-        lut_l[i + 0] = lut[i + 0];
-    }
-    barrier(CLK_LOCAL_MEM_FENCE);
+    LOCAL_LUT_INIT;
 
     int x = get_global_id(0);
-    int y = 2 * get_global_id(1);
+    int y = 4 * get_global_id(1);
 
     if (x < cols && y < rows)
     {
         int src_index = mad24(y, src_step, mad24(x, (int)sizeof(srcT) * dcn, src_offset));
-
-        __global const srcT * src = (__global const srcT *)(srcptr + src_index);
-        __global const dstT * lut = (__global const dstT *)(lutptr + lut_offset);
-
         int dst_index = mad24(y, dst_step, mad24(x, (int)sizeof(dstT) * dcn, dst_offset));
-        __global dstT * dst = (__global dstT *)(dstptr + dst_index);
+        __global const srcT * src; __global dstT * dst;
 
-#if lcn == 1
-        #pragma unroll
-        for (int cn = 0; cn < dcn; ++cn)
-            dst[cn] = lut_l[src[cn]];
-#else //lcn == scn == dcn
-        #pragma unroll
-        for (int cn = 0; cn < dcn; ++cn)
-            dst[cn] = lut_l[mad24(src[cn], lcn, cn)];
-#endif
+        LUT_OP(0);
         if (y < rows - 1)
         {
-            src = (__global const srcT *)(srcptr + src_index + src_step);
-            dst = (__global dstT *)(dstptr + dst_index + dst_step);
-
-#if lcn == 1
-            #pragma unroll
-            for (int cn = 0; cn < dcn; ++cn)
-                dst[cn] = lut_l[src[cn]];
-#else //lcn == scn == dcn
-            #pragma unroll
-            for (int cn = 0; cn < dcn; ++cn)
-                dst[cn] = lut_l[mad24(src[cn], lcn, cn)];
-#endif
+            LUT_OP(1);
+            if (y < rows - 2)
+            {
+                LUT_OP(2);
+                if (y < rows - 3)
+                {
+                    LUT_OP(3);
+                }
+            }
         }
+
     }
 }

From 6667cea0f436386845b1636f12fc86464b427844 Mon Sep 17 00:00:00 2001
From: vbystricky <user@user-pc.(none)>
Date: Mon, 19 May 2014 13:58:14 +0400
Subject: [PATCH 3/9] Optimize OpenCL LUT function

---
 modules/core/src/convert.cpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/modules/core/src/convert.cpp b/modules/core/src/convert.cpp
index 6e474d951..72c2fdcb0 100644
--- a/modules/core/src/convert.cpp
+++ b/modules/core/src/convert.cpp
@@ -1550,9 +1550,8 @@ static bool ocl_LUT(InputArray _src, InputArray _lut, OutputArray _dst)
     _dst.create(src.size(), CV_MAKETYPE(ddepth, dcn));
     UMat dst = _dst.getUMat();
 
-    size_t globalSize[2] = { dst.cols, (dst.rows + 3) / 4};
-
-    ocl::Kernel k("LUT", ocl::core::lut_oclsrc, format("-D dcn=%d -D lcn=%d -D srcT=%s -D dstT=%s", dcn, lcn,
+    ocl::Kernel k("LUT", ocl::core::lut_oclsrc,
+                  format("-D dcn=%d -D lcn=%d -D srcT=%s -D dstT=%s", dcn, lcn,
                          ocl::typeToStr(sdepth), ocl::memopTypeToStr(ddepth)
                          ));
     if (k.empty())
@@ -1561,6 +1560,7 @@ static bool ocl_LUT(InputArray _src, InputArray _lut, OutputArray _dst)
     k.args(ocl::KernelArg::ReadOnlyNoSize(src), ocl::KernelArg::ReadOnlyNoSize(lut),
            ocl::KernelArg::WriteOnly(dst));
 
+    size_t globalSize[2] = { dst.cols, (dst.rows + 3) / 4};
     return k.run(2, globalSize, NULL, false);
 }
 

From a8bfab3cb740283181d42aa852b2bb2561c73619 Mon Sep 17 00:00:00 2001
From: vbystricky <user@user-pc.(none)>
Date: Mon, 19 May 2014 13:59:54 +0400
Subject: [PATCH 4/9] Optimize OpenCL LUT function

---
 modules/core/src/convert.cpp | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/modules/core/src/convert.cpp b/modules/core/src/convert.cpp
index 72c2fdcb0..1f53fa4cb 100644
--- a/modules/core/src/convert.cpp
+++ b/modules/core/src/convert.cpp
@@ -1544,7 +1544,6 @@ static LUTFunc lutTab[] =
 static bool ocl_LUT(InputArray _src, InputArray _lut, OutputArray _dst)
 {
     int lcn = _lut.channels(), dcn = _src.channels(), ddepth = _lut.depth();
-    int sdepth = _src.depth();
 
     UMat src = _src.getUMat(), lut = _lut.getUMat();
     _dst.create(src.size(), CV_MAKETYPE(ddepth, dcn));
@@ -1552,7 +1551,7 @@ static bool ocl_LUT(InputArray _src, InputArray _lut, OutputArray _dst)
 
     ocl::Kernel k("LUT", ocl::core::lut_oclsrc,
                   format("-D dcn=%d -D lcn=%d -D srcT=%s -D dstT=%s", dcn, lcn,
-                         ocl::typeToStr(sdepth), ocl::memopTypeToStr(ddepth)
+                         ocl::typeToStr(src.depth()), ocl::memopTypeToStr(ddepth)
                          ));
     if (k.empty())
         return false;

From 48d82fd911d379574cca55fec1286813ed2b8eb1 Mon Sep 17 00:00:00 2001
From: vbystricky <user@user-pc.(none)>
Date: Mon, 26 May 2014 16:31:42 +0400
Subject: [PATCH 5/9] Fix some errors

---
 modules/core/src/opencl/lut.cl | 58 +++++++++++++++++-----------------
 1 file changed, 29 insertions(+), 29 deletions(-)

diff --git a/modules/core/src/opencl/lut.cl b/modules/core/src/opencl/lut.cl
index 27428ed2b..9646809e3 100644
--- a/modules/core/src/opencl/lut.cl
+++ b/modules/core/src/opencl/lut.cl
@@ -37,71 +37,71 @@
 #if lcn == 1
     #if dcn == 4
         #define LUT_OP(num)\
-            uchar4 idx = vload4(0, (__global const uchar *)(srcptr + src_index  + num * src_step));\
-            dst = (__global dstT *)(dstptr + dst_index + num * dst_step);\
-            dst[0] = lut_l[idx.x];\
-            dst[1] = lut_l[idx.y];\
-            dst[2] = lut_l[idx.z];\
-            dst[3] = lut_l[idx.w];
+            __global const uchar4 *idx = (__global const uchar4 *)(srcptr + mad24(num, src_step, src_index));\
+            dst = (__global dstT *)(dstptr + mad24(num, dst_step, dst_index));\
+            dst[0] = lut_l[idx->x];\
+            dst[1] = lut_l[idx->y];\
+            dst[2] = lut_l[idx->z];\
+            dst[3] = lut_l[idx->w];
     #elif dcn == 3
         #define LUT_OP(num)\
-            uchar3 idx = vload3(0, (__global const uchar *)(srcptr + src_index  + num * src_step));\
-            dst = (__global dstT *)(dstptr + dst_index + num * dst_step);\
+            uchar3 idx = vload3(0, (__global const uchar *)(srcptr + mad24(num, src_step, src_index)));\
+            dst = (__global dstT *)(dstptr + mad24(num, dst_step, dst_index));\
             dst[0] = lut_l[idx.x];\
             dst[1] = lut_l[idx.y];\
             dst[2] = lut_l[idx.z];
     #elif dcn == 2
         #define LUT_OP(num)\
-            uchar2 idx = vload2(0, (__global const uchar *)(srcptr + src_index  + num * src_step));\
-            dst = (__global dstT *)(dstptr + dst_index + num * dst_step);\
-            dst[0] = lut_l[idx.x];\
-            dst[1] = lut_l[idx.y];
+            __global const uchar2 * idx = (__global const uchar *)(srcptr + mad24(num, src_step, src_index));\
+            dst = (__global dstT *)(dstptr + mad24(num, dst_step, dst_index));\
+            dst[0] = lut_l[idx->x];\
+            dst[1] = lut_l[idx->y];
     #elif dcn == 1
         #define LUT_OP(num)\
-            uchar idx = (__global const uchar *)(srcptr + src_index  + num * src_step)[0];\
-            dst = (__global dstT *)(dstptr + dst_index + num * dst_step);\
+            uchar idx = (__global const uchar *)(srcptr + mad24(num, src_step, src_index))[0];\
+            dst = (__global dstT *)(dstptr + mad24(num, dst_step, dst_index));\
             dst[0] = lut_l[idx];
     #else
         #define LUT_OP(num)\
-            src = (__global const srcT *)(srcptr + src_index + num * src_step);\
-            dst = (__global dstT *)(dstptr + dst_index + num * dst_step);\
+            src = (__global const srcT *)(srcptr + mad24(num, src_step, src_index));\
+            dst = (__global dstT *)(dstptr + mad24(num, dst_step, dst_index));\
             for (int cn = 0; cn < dcn; ++cn)\
                 dst[cn] = lut_l[src[cn]];
     #endif
 #else
     #if dcn == 4
         #define LUT_OP(num)\
-            uchar4 src_pixel = vload4(0, (__global const uchar *)(srcptr + src_index  + num * src_step));\
-            int4 idx = convert_int4(src_pixel) * lcn + (int4)(0, 1, 2, 3);\
-            dst = (__global dstT *)(dstptr + dst_index + num * dst_step);\
+            __global const uchar4 *src_pixel = (__global const uchar4 *)(srcptr + mad24(num, src_step, src_index));\
+            int4 idx = convert_int4(src_pixel[0]) * lcn + (int4)(0, 1, 2, 3);\
+            dst = (__global dstT *)(dstptr + mad24(num, dst_step, dst_index));\
             dst[0] = lut_l[idx.x];\
             dst[1] = lut_l[idx.y];\
             dst[2] = lut_l[idx.z];\
             dst[3] = lut_l[idx.w];
     #elif dcn == 3
         #define LUT_OP(num)\
-            uchar3 src_pixel = vload3(0, (__global const uchar *)(srcptr + src_index  + num * src_step));\
+            uchar3 src_pixel = vload3(0, (__global const uchar *)(srcptr + mad24(num, src_step, src_index)));\
             int3 idx = convert_int3(src_pixel) * lcn + (int3)(0, 1, 2);\
-            dst = (__global dstT *)(dstptr + dst_index + num * dst_step);\
+            dst = (__global dstT *)(dstptr + mad24(num, dst_step, dst_index));\
             dst[0] = lut_l[idx.x];\
             dst[1] = lut_l[idx.y];\
             dst[2] = lut_l[idx.z];
     #elif dcn == 2
         #define LUT_OP(num)\
-            uchar2 src_pixel = vload2(0, (__global const uchar *)(srcptr + src_index  + num * src_step));\
-            int2 idx = convert_int2(src_pixel) * lcn + (int2)(0, 1);\
-            dst = (__global dstT *)(dstptr + dst_index + num * dst_step);\
+            __global const uchar2 *src_pixel = (__global const uchar2 *)(srcptr + mad24(num, src_step, src_index));\
+            int2 idx = convert_int2(src_pixel[0]) * lcn + (int2)(0, 1);\
+            dst = (__global dstT *)(dstptr + mad24(num, dst_step, dst_index));\
             dst[0] = lut_l[idx.x];\
             dst[1] = lut_l[idx.y];
     #elif dcn == 1 //error case (1 < lcn) ==> lcn == scn == dcn
         #define LUT_OP(num)\
-            uchar idx = (__global const uchar *)(srcptr + src_index  + num * src_step)[0];\
-            dst = (__global dstT *)(dstptr + dst_index + num * dst_step);\
+            uchar idx = (__global const uchar *)(srcptr + mad24(num, src_step, src_index))[0];\
+            dst = (__global dstT *)(dstptr + mad24(num, dst_step, dst_index));\
             dst[0] = lut_l[idx];
     #else
         #define LUT_OP(num)\
-            src = (__global const srcT *)(srcptr + src_index + num * src_step);\
-            dst = (__global dstT *)(dstptr + dst_index + num * dst_step);\
+            src = (__global const srcT *)(srcptr + mad24(num, src_step, src_index));\
+            dst = (__global dstT *)(dstptr + mad24(num, dst_step, dst_index));\
             for (int cn = 0; cn < dcn; ++cn)\
                 dst[cn] = lut_l[mad24(src[cn], lcn, cn)];
     #endif
@@ -134,7 +134,7 @@ __kernel void LUT(__global const uchar * srcptr, int src_step, int src_offset,
         int src_index = mad24(y, src_step, mad24(x, (int)sizeof(srcT) * dcn, src_offset));
         int dst_index = mad24(y, dst_step, mad24(x, (int)sizeof(dstT) * dcn, dst_offset));
         __global const srcT * src; __global dstT * dst;
-
+        int tmp_idx;
         LUT_OP(0);
         if (y < rows - 1)
         {

From cfabf32492b66cde3a9eb94a4882908498d059e2 Mon Sep 17 00:00:00 2001
From: vbystricky <user@user-pc.(none)>
Date: Mon, 26 May 2014 16:51:48 +0400
Subject: [PATCH 6/9] Fix some errors

---
 modules/core/src/opencl/lut.cl | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/modules/core/src/opencl/lut.cl b/modules/core/src/opencl/lut.cl
index 9646809e3..f6bd367c6 100644
--- a/modules/core/src/opencl/lut.cl
+++ b/modules/core/src/opencl/lut.cl
@@ -45,20 +45,20 @@
             dst[3] = lut_l[idx->w];
     #elif dcn == 3
         #define LUT_OP(num)\
-            uchar3 idx = vload3(0, (__global const uchar *)(srcptr + mad24(num, src_step, src_index)));\
+            uchar3 idx = vload3(0, srcptr + mad24(num, src_step, src_index));\
             dst = (__global dstT *)(dstptr + mad24(num, dst_step, dst_index));\
             dst[0] = lut_l[idx.x];\
             dst[1] = lut_l[idx.y];\
             dst[2] = lut_l[idx.z];
     #elif dcn == 2
         #define LUT_OP(num)\
-            __global const uchar2 * idx = (__global const uchar *)(srcptr + mad24(num, src_step, src_index));\
+            __global const uchar2 * idx = (__global const uchar2 *)(srcptr + mad24(num, src_step, src_index));\
             dst = (__global dstT *)(dstptr + mad24(num, dst_step, dst_index));\
             dst[0] = lut_l[idx->x];\
             dst[1] = lut_l[idx->y];
     #elif dcn == 1
         #define LUT_OP(num)\
-            uchar idx = (__global const uchar *)(srcptr + mad24(num, src_step, src_index))[0];\
+            uchar idx = (srcptr + mad24(num, src_step, src_index))[0];\
             dst = (__global dstT *)(dstptr + mad24(num, dst_step, dst_index));\
             dst[0] = lut_l[idx];
     #else
@@ -80,7 +80,7 @@
             dst[3] = lut_l[idx.w];
     #elif dcn == 3
         #define LUT_OP(num)\
-            uchar3 src_pixel = vload3(0, (__global const uchar *)(srcptr + mad24(num, src_step, src_index)));\
+            uchar3 src_pixel = vload3(0, srcptr + mad24(num, src_step, src_index));\
             int3 idx = convert_int3(src_pixel) * lcn + (int3)(0, 1, 2);\
             dst = (__global dstT *)(dstptr + mad24(num, dst_step, dst_index));\
             dst[0] = lut_l[idx.x];\
@@ -95,7 +95,7 @@
             dst[1] = lut_l[idx.y];
     #elif dcn == 1 //error case (1 < lcn) ==> lcn == scn == dcn
         #define LUT_OP(num)\
-            uchar idx = (__global const uchar *)(srcptr + mad24(num, src_step, src_index))[0];\
+            uchar idx = (srcptr + mad24(num, src_step, src_index))[0];\
             dst = (__global dstT *)(dstptr + mad24(num, dst_step, dst_index));\
             dst[0] = lut_l[idx];
     #else

From 8a5f2781fcafc9259b989d89546d2b2a0120c254 Mon Sep 17 00:00:00 2001
From: VBystricky <vladimir.bystricky@stdutility.com>
Date: Wed, 4 Jun 2014 20:13:42 +0400
Subject: [PATCH 7/9] Fix kernel by comments

---
 modules/core/src/opencl/lut.cl | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/modules/core/src/opencl/lut.cl b/modules/core/src/opencl/lut.cl
index f6bd367c6..a33d50c6f 100644
--- a/modules/core/src/opencl/lut.cl
+++ b/modules/core/src/opencl/lut.cl
@@ -37,12 +37,12 @@
 #if lcn == 1
     #if dcn == 4
         #define LUT_OP(num)\
-            __global const uchar4 *idx = (__global const uchar4 *)(srcptr + mad24(num, src_step, src_index));\
+            int idx = *(__global const int *)(srcptr + mad24(num, src_step, src_index));\
             dst = (__global dstT *)(dstptr + mad24(num, dst_step, dst_index));\
-            dst[0] = lut_l[idx->x];\
-            dst[1] = lut_l[idx->y];\
-            dst[2] = lut_l[idx->z];\
-            dst[3] = lut_l[idx->w];
+            dst[0] = lut_l[idx & 0xff];\
+            dst[1] = lut_l[(idx >> 8) & 0xff];\
+            dst[2] = lut_l[(idx >> 16) & 0xff];\
+            dst[3] = lut_l[(idx >> 24) & 0xff];
     #elif dcn == 3
         #define LUT_OP(num)\
             uchar3 idx = vload3(0, srcptr + mad24(num, src_step, src_index));\

From 0c0ebca85566fb2e327f4c0cd24dc71b98c61ea9 Mon Sep 17 00:00:00 2001
From: VBystricky <vladimir.bystricky@stdutility.com>
Date: Wed, 4 Jun 2014 23:50:23 +0400
Subject: [PATCH 8/9] Read 4 pixel for aligned data with 1 channel

---
 modules/core/src/convert.cpp   |  8 ++++++--
 modules/core/src/opencl/lut.cl | 22 ++++++++++++++++++----
 2 files changed, 24 insertions(+), 6 deletions(-)

diff --git a/modules/core/src/convert.cpp b/modules/core/src/convert.cpp
index 1f53fa4cb..162eaacb9 100644
--- a/modules/core/src/convert.cpp
+++ b/modules/core/src/convert.cpp
@@ -1548,10 +1548,12 @@ static bool ocl_LUT(InputArray _src, InputArray _lut, OutputArray _dst)
     UMat src = _src.getUMat(), lut = _lut.getUMat();
     _dst.create(src.size(), CV_MAKETYPE(ddepth, dcn));
     UMat dst = _dst.getUMat();
+    bool bAligned = (1 == dcn) && (0 == (src.offset % 4)) && (0 == (src.cols % 4));
 
     ocl::Kernel k("LUT", ocl::core::lut_oclsrc,
-                  format("-D dcn=%d -D lcn=%d -D srcT=%s -D dstT=%s", dcn, lcn,
-                         ocl::typeToStr(src.depth()), ocl::memopTypeToStr(ddepth)
+                  format("-D dcn=%d -D lcn=%d -D srcT=%s -D dstT=%s%s", dcn, lcn,
+                         ocl::typeToStr(src.depth()), ocl::memopTypeToStr(ddepth),
+                         bAligned ? " -D USE_ALIGNED" : ""
                          ));
     if (k.empty())
         return false;
@@ -1560,6 +1562,8 @@ static bool ocl_LUT(InputArray _src, InputArray _lut, OutputArray _dst)
            ocl::KernelArg::WriteOnly(dst));
 
     size_t globalSize[2] = { dst.cols, (dst.rows + 3) / 4};
+    if (bAligned)
+        globalSize[0] = (dst.cols + 3) / 4;
     return k.run(2, globalSize, NULL, false);
 }
 
diff --git a/modules/core/src/opencl/lut.cl b/modules/core/src/opencl/lut.cl
index a33d50c6f..295f0ae71 100644
--- a/modules/core/src/opencl/lut.cl
+++ b/modules/core/src/opencl/lut.cl
@@ -57,10 +57,20 @@
             dst[0] = lut_l[idx->x];\
             dst[1] = lut_l[idx->y];
     #elif dcn == 1
-        #define LUT_OP(num)\
-            uchar idx = (srcptr + mad24(num, src_step, src_index))[0];\
-            dst = (__global dstT *)(dstptr + mad24(num, dst_step, dst_index));\
-            dst[0] = lut_l[idx];
+        #ifdef USE_ALIGNED
+            #define LUT_OP(num)\
+                int idx = *(__global const int *)(srcptr + mad24(num, src_step, src_index));\
+                dst = (__global dstT *)(dstptr + mad24(num, dst_step, dst_index));\
+                dst[0] = lut_l[idx & 0xff];\
+                dst[1] = lut_l[(idx >> 8) & 0xff];\
+                dst[2] = lut_l[(idx >> 16) & 0xff];\
+                dst[3] = lut_l[(idx >> 24) & 0xff];
+        #else
+            #define LUT_OP(num)\
+                uchar idx = (srcptr + mad24(num, src_step, src_index))[0];\
+                dst = (__global dstT *)(dstptr + mad24(num, dst_step, dst_index));\
+                dst[0] = lut_l[idx];
+        #endif
     #else
         #define LUT_OP(num)\
             src = (__global const srcT *)(srcptr + mad24(num, src_step, src_index));\
@@ -126,7 +136,11 @@ __kernel void LUT(__global const uchar * srcptr, int src_step, int src_offset,
     __local dstT lut_l[256 * lcn];
     LOCAL_LUT_INIT;
 
+#ifdef USE_ALIGNED
+    int x = 4 * get_global_id(0);
+#else
     int x = get_global_id(0);
+#endif
     int y = 4 * get_global_id(1);
 
     if (x < cols && y < rows)

From 5d924b7a75d9f93aa57371474ad6e40035c41ea7 Mon Sep 17 00:00:00 2001
From: VBystricky <vladimir.bystricky@stdutility.com>
Date: Thu, 5 Jun 2014 19:31:31 +0400
Subject: [PATCH 9/9] If lut table has one channel and src aligned to 4, work
 with src as with one channel matrix

---
 modules/core/src/convert.cpp   | 18 +++++++++---------
 modules/core/src/opencl/lut.cl | 22 ++++------------------
 2 files changed, 13 insertions(+), 27 deletions(-)

diff --git a/modules/core/src/convert.cpp b/modules/core/src/convert.cpp
index 162eaacb9..49eb93a79 100644
--- a/modules/core/src/convert.cpp
+++ b/modules/core/src/convert.cpp
@@ -1548,22 +1548,22 @@ static bool ocl_LUT(InputArray _src, InputArray _lut, OutputArray _dst)
     UMat src = _src.getUMat(), lut = _lut.getUMat();
     _dst.create(src.size(), CV_MAKETYPE(ddepth, dcn));
     UMat dst = _dst.getUMat();
-    bool bAligned = (1 == dcn) && (0 == (src.offset % 4)) && (0 == (src.cols % 4));
+    bool bAligned = (1 == lcn) && (0 == (src.offset % 4)) && (0 == ((dcn * src.cols) % 4));
+    // dst.cols == src.cols by params of dst.create
 
     ocl::Kernel k("LUT", ocl::core::lut_oclsrc,
-                  format("-D dcn=%d -D lcn=%d -D srcT=%s -D dstT=%s%s", dcn, lcn,
-                         ocl::typeToStr(src.depth()), ocl::memopTypeToStr(ddepth),
-                         bAligned ? " -D USE_ALIGNED" : ""
+                  format("-D dcn=%d -D lcn=%d -D srcT=%s -D dstT=%s", bAligned ? 4 : dcn, lcn,
+                         ocl::typeToStr(src.depth()), ocl::memopTypeToStr(ddepth)
                          ));
     if (k.empty())
         return false;
 
-    k.args(ocl::KernelArg::ReadOnlyNoSize(src), ocl::KernelArg::ReadOnlyNoSize(lut),
-           ocl::KernelArg::WriteOnly(dst));
+    int cols = bAligned ? dcn * dst.cols / 4 : dst.cols;
 
-    size_t globalSize[2] = { dst.cols, (dst.rows + 3) / 4};
-    if (bAligned)
-        globalSize[0] = (dst.cols + 3) / 4;
+    k.args(ocl::KernelArg::ReadOnlyNoSize(src), ocl::KernelArg::ReadOnlyNoSize(lut),
+        ocl::KernelArg::WriteOnlyNoSize(dst), dst.rows, cols);
+
+    size_t globalSize[2] = { cols, (dst.rows + 3) / 4 };
     return k.run(2, globalSize, NULL, false);
 }
 
diff --git a/modules/core/src/opencl/lut.cl b/modules/core/src/opencl/lut.cl
index 295f0ae71..a33d50c6f 100644
--- a/modules/core/src/opencl/lut.cl
+++ b/modules/core/src/opencl/lut.cl
@@ -57,20 +57,10 @@
             dst[0] = lut_l[idx->x];\
             dst[1] = lut_l[idx->y];
     #elif dcn == 1
-        #ifdef USE_ALIGNED
-            #define LUT_OP(num)\
-                int idx = *(__global const int *)(srcptr + mad24(num, src_step, src_index));\
-                dst = (__global dstT *)(dstptr + mad24(num, dst_step, dst_index));\
-                dst[0] = lut_l[idx & 0xff];\
-                dst[1] = lut_l[(idx >> 8) & 0xff];\
-                dst[2] = lut_l[(idx >> 16) & 0xff];\
-                dst[3] = lut_l[(idx >> 24) & 0xff];
-        #else
-            #define LUT_OP(num)\
-                uchar idx = (srcptr + mad24(num, src_step, src_index))[0];\
-                dst = (__global dstT *)(dstptr + mad24(num, dst_step, dst_index));\
-                dst[0] = lut_l[idx];
-        #endif
+        #define LUT_OP(num)\
+            uchar idx = (srcptr + mad24(num, src_step, src_index))[0];\
+            dst = (__global dstT *)(dstptr + mad24(num, dst_step, dst_index));\
+            dst[0] = lut_l[idx];
     #else
         #define LUT_OP(num)\
             src = (__global const srcT *)(srcptr + mad24(num, src_step, src_index));\
@@ -136,11 +126,7 @@ __kernel void LUT(__global const uchar * srcptr, int src_step, int src_offset,
     __local dstT lut_l[256 * lcn];
     LOCAL_LUT_INIT;
 
-#ifdef USE_ALIGNED
-    int x = 4 * get_global_id(0);
-#else
     int x = get_global_id(0);
-#endif
     int y = 4 * get_global_id(1);
 
     if (x < cols && y < rows)