diff --git a/modules/core/include/opencv2/core/ocl.hpp b/modules/core/include/opencv2/core/ocl.hpp
index 311276679..e3805bcdc 100644
--- a/modules/core/include/opencv2/core/ocl.hpp
+++ b/modules/core/include/opencv2/core/ocl.hpp
@@ -489,6 +489,7 @@ public:
     bool runTask(bool sync, const Queue& q=Queue());
 
     size_t workGroupSize() const;
+    size_t preferedWorkGroupSizeMultiple() const;
     bool compileWorkGroupSize(size_t wsz[]) const;
     size_t localMemSize() const;
 
diff --git a/modules/core/src/ocl.cpp b/modules/core/src/ocl.cpp
index 9b7564250..cf3b1dcab 100644
--- a/modules/core/src/ocl.cpp
+++ b/modules/core/src/ocl.cpp
@@ -2817,6 +2817,16 @@ size_t Kernel::workGroupSize() const
                                     sizeof(val), &val, &retsz) >= 0 ? val : 0;
 }
 
+size_t Kernel::preferedWorkGroupSizeMultiple() const
+{
+    if(!p)
+        return 0;
+    size_t val = 0, retsz = 0;
+    cl_device_id dev = (cl_device_id)Device::getDefault().ptr();
+    return clGetKernelWorkGroupInfo(p->handle, dev, CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE,
+                                    sizeof(val), &val, &retsz) >= 0 ? val : 0;
+}
+
 bool Kernel::compileWorkGroupSize(size_t wsz[]) const
 {
     if(!p || !wsz)
diff --git a/modules/imgproc/src/clahe.cpp b/modules/imgproc/src/clahe.cpp
index 89fb62bd0..079e635f9 100644
--- a/modules/imgproc/src/clahe.cpp
+++ b/modules/imgproc/src/clahe.cpp
@@ -40,10 +40,90 @@
 //M*/
 
 #include "precomp.hpp"
+#include "opencl_kernels.hpp"
 
 // ----------------------------------------------------------------------
 // CLAHE
 
+namespace clahe
+{
+    static bool calcLut(cv::InputArray _src, cv::OutputArray _dst,
+        const int tilesX, const int tilesY, const cv::Size tileSize,
+        const int clipLimit, const float lutScale)
+    {
+        cv::ocl::Kernel _k("calcLut", cv::ocl::imgproc::clahe_oclsrc);
+
+        bool is_cpu = cv::ocl::Device::getDefault().type() == cv::ocl::Device::TYPE_CPU;
+        cv::String opts;
+        if(is_cpu)
+            opts = "-D CPU ";
+        else
+            opts = cv::format("-D WAVE_SIZE=%d", _k.preferedWorkGroupSizeMultiple());
+
+        cv::ocl::Kernel k("calcLut", cv::ocl::imgproc::clahe_oclsrc, opts);
+        if(k.empty())
+            return false;
+
+        cv::UMat src = _src.getUMat();
+        _dst.create(tilesX * tilesY, 256, CV_8UC1);
+        cv::UMat dst = _dst.getUMat();
+
+        int tile_size[2];
+        tile_size[0] = tileSize.width;
+        tile_size[1] = tileSize.height;
+
+        size_t localThreads[3]  = { 32, 8, 1 };
+        size_t globalThreads[3] = { tilesX * localThreads[0], tilesY * localThreads[1], 1 };
+
+        int idx = 0;
+        idx = k.set(idx, cv::ocl::KernelArg::ReadOnlyNoSize(src));
+        idx = k.set(idx, cv::ocl::KernelArg::WriteOnlyNoSize(dst));
+        idx = k.set(idx, tile_size);
+        idx = k.set(idx, tilesX);
+        idx = k.set(idx, clipLimit);
+        idx = k.set(idx, lutScale);
+
+        if (!k.run(2, globalThreads, localThreads, false))
+            return false;
+        return true;
+    }
+
+    static bool transform(const cv::InputArray _src, cv::OutputArray _dst, const cv::InputArray _lut,
+        const int tilesX, const int tilesY, const cv::Size & tileSize)
+    {
+
+        cv::ocl::Kernel k("transform", cv::ocl::imgproc::clahe_oclsrc);
+        if(k.empty())
+            return false;
+
+        int tile_size[2];
+        tile_size[0] = tileSize.width;
+        tile_size[1] = tileSize.height;
+
+        cv::UMat src = _src.getUMat();
+        _dst.create(src.size(), src.type());
+        cv::UMat dst = _dst.getUMat();
+        cv::UMat lut = _lut.getUMat();
+
+        size_t localThreads[3]  = { 32, 8, 1 };
+        size_t globalThreads[3] = { src.cols, src.rows, 1 };
+
+        int idx = 0;
+        idx = k.set(idx, cv::ocl::KernelArg::ReadOnlyNoSize(src));
+        idx = k.set(idx, cv::ocl::KernelArg::WriteOnlyNoSize(dst));
+        idx = k.set(idx, cv::ocl::KernelArg::ReadOnlyNoSize(lut));
+        idx = k.set(idx, src.cols);
+        idx = k.set(idx, src.rows);
+        idx = k.set(idx, tile_size);
+        idx = k.set(idx, tilesX);
+        idx = k.set(idx, tilesY);
+
+        if (!k.run(2, globalThreads, localThreads, false))
+            return false;
+        return true;
+    }
+}
+
 namespace
 {
     class CLAHE_CalcLut_Body : public cv::ParallelLoopBody
@@ -241,7 +321,9 @@ namespace
         int tilesY_;
 
         cv::Mat srcExt_;
+        cv::UMat usrcExt_;
         cv::Mat lut_;
+        cv::UMat ulut_;
     };
 
     CLAHE_Impl::CLAHE_Impl(double clipLimit, int tilesX, int tilesY) :
@@ -256,31 +338,34 @@ namespace
 
     void CLAHE_Impl::apply(cv::InputArray _src, cv::OutputArray _dst)
     {
-        cv::Mat src = _src.getMat();
+        CV_Assert( _src.type() == CV_8UC1 );
 
-        CV_Assert( src.type() == CV_8UC1 );
-
-        _dst.create( src.size(), src.type() );
-        cv::Mat dst = _dst.getMat();
+        bool useOpenCL = cv::ocl::useOpenCL() && _src.isUMat() && _src.dims()<=2;
 
         const int histSize = 256;
 
-        lut_.create(tilesX_ * tilesY_, histSize, CV_8UC1);
-
         cv::Size tileSize;
-        cv::Mat srcForLut;
+        cv::_InputArray _srcForLut;
 
-        if (src.cols % tilesX_ == 0 && src.rows % tilesY_ == 0)
+        if (_src.size().width % tilesX_ == 0 && _src.size().height % tilesY_ == 0)
         {
-            tileSize = cv::Size(src.cols / tilesX_, src.rows / tilesY_);
-            srcForLut = src;
+            tileSize = cv::Size(_src.size().width / tilesX_, _src.size().height / tilesY_);
+            _srcForLut = _src;
         }
         else
         {
-            cv::copyMakeBorder(src, srcExt_, 0, tilesY_ - (src.rows % tilesY_), 0, tilesX_ - (src.cols % tilesX_), cv::BORDER_REFLECT_101);
-
-            tileSize = cv::Size(srcExt_.cols / tilesX_, srcExt_.rows / tilesY_);
-            srcForLut = srcExt_;
+            if(useOpenCL)
+            {
+                cv::copyMakeBorder(_src, usrcExt_, 0, tilesY_ - (_src.size().height % tilesY_), 0, tilesX_ - (_src.size().width % tilesX_), cv::BORDER_REFLECT_101);
+                tileSize = cv::Size(usrcExt_.size().width / tilesX_, usrcExt_.size().height / tilesY_);
+                _srcForLut = usrcExt_;
+            }
+            else
+            {
+                cv::copyMakeBorder(_src, srcExt_, 0, tilesY_ - (_src.size().height % tilesY_), 0, tilesX_ - (_src.size().width % tilesX_), cv::BORDER_REFLECT_101);
+                tileSize = cv::Size(srcExt_.size().width / tilesX_, srcExt_.size().height / tilesY_);
+                _srcForLut = srcExt_;
+            }
         }
 
         const int tileSizeTotal = tileSize.area();
@@ -293,6 +378,16 @@ namespace
             clipLimit = std::max(clipLimit, 1);
         }
 
+        if(useOpenCL && clahe::calcLut(_srcForLut, ulut_, tilesX_, tilesY_, tileSize, clipLimit, lutScale) )
+            if( clahe::transform(_src, _dst, ulut_, tilesX_, tilesY_, tileSize) )
+                return;
+
+        cv::Mat src = _src.getMat();
+        _dst.create( src.size(), src.type() );
+        cv::Mat dst = _dst.getMat();
+        cv::Mat srcForLut = _srcForLut.getMat();
+        lut_.create(tilesX_ * tilesY_, histSize, CV_8UC1);
+
         CLAHE_CalcLut_Body calcLutBody(srcForLut, lut_, tileSize, tilesX_, tilesY_, clipLimit, lutScale);
         cv::parallel_for_(cv::Range(0, tilesX_ * tilesY_), calcLutBody);
 
@@ -325,6 +420,8 @@ namespace
     {
         srcExt_.release();
         lut_.release();
+        usrcExt_.release();
+        ulut_.release();
     }
 }
 
diff --git a/modules/imgproc/src/opencl/clahe.cl b/modules/imgproc/src/opencl/clahe.cl
new file mode 100644
index 000000000..9f88b20bf
--- /dev/null
+++ b/modules/imgproc/src/opencl/clahe.cl
@@ -0,0 +1,252 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
+// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// @Authors
+//    Sen Liu, swjtuls1987@126.com
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors as is and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef WAVE_SIZE
+#define WAVE_SIZE 1
+#endif
+
+inline int calc_lut(__local int* smem, int val, int tid)
+{
+    smem[tid] = val;
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    if (tid == 0)
+        for (int i = 1; i < 256; ++i)
+            smem[i] += smem[i - 1];
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    return smem[tid];
+}
+
+#ifdef CPU
+inline void reduce(volatile __local int* smem, int val, int tid)
+{
+    smem[tid] = val;
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    if (tid < 128)
+        smem[tid] = val += smem[tid + 128];
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    if (tid < 64)
+        smem[tid] = val += smem[tid + 64];
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    if (tid < 32)
+        smem[tid] += smem[tid + 32];
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    if (tid < 16)
+        smem[tid] += smem[tid + 16];
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    if (tid < 8)
+        smem[tid] += smem[tid + 8];
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    if (tid < 4)
+        smem[tid] += smem[tid + 4];
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    if (tid < 2)
+        smem[tid] += smem[tid + 2];
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    if (tid < 1)
+        smem[256] = smem[tid] + smem[tid + 1];
+    barrier(CLK_LOCAL_MEM_FENCE);
+}
+
+#else
+
+inline void reduce(__local volatile int* smem, int val, int tid)
+{
+    smem[tid] = val;
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    if (tid < 128)
+        smem[tid] = val += smem[tid + 128];
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    if (tid < 64)
+        smem[tid] = val += smem[tid + 64];
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    if (tid < 32)
+    {
+        smem[tid] += smem[tid + 32];
+#if WAVE_SIZE < 32
+    } barrier(CLK_LOCAL_MEM_FENCE);
+
+    if (tid < 16)
+    {
+#endif
+        smem[tid] += smem[tid + 16];
+#if WAVE_SIZE < 16
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    if (tid < 8)
+    {
+#endif
+        smem[tid] += smem[tid + 8];
+        smem[tid] += smem[tid + 4];
+        smem[tid] += smem[tid + 2];
+        smem[tid] += smem[tid + 1];
+    }
+}
+#endif
+
+__kernel void calcLut(__global __const uchar * src, const int srcStep,
+                      const int src_offset, __global uchar * lut,
+                      const int dstStep, const int dst_offset,
+                      const int2 tileSize, const int tilesX,
+                      const int clipLimit, const float lutScale)
+{
+    __local int smem[512];
+
+    int tx = get_group_id(0);
+    int ty = get_group_id(1);
+    int tid = get_local_id(1) * get_local_size(0)
+                             + get_local_id(0);
+    smem[tid] = 0;
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    for (int i = get_local_id(1); i < tileSize.y; i += get_local_size(1))
+    {
+        __global const uchar* srcPtr = src + mad24(ty * tileSize.y + i, srcStep, tx * tileSize.x + src_offset);
+        for (int j = get_local_id(0); j < tileSize.x; j += get_local_size(0))
+        {
+            const int data = srcPtr[j];
+            atomic_inc(&smem[data]);
+        }
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    int tHistVal = smem[tid];
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    if (clipLimit > 0)
+    {
+        // clip histogram bar
+        int clipped = 0;
+        if (tHistVal > clipLimit)
+        {
+            clipped = tHistVal - clipLimit;
+            tHistVal = clipLimit;
+        }
+
+        // find number of overall clipped samples
+        reduce(smem, clipped, tid);
+        barrier(CLK_LOCAL_MEM_FENCE);
+#ifdef CPU
+        clipped = smem[256];
+#else
+        clipped = smem[0];
+#endif
+
+        // broadcast evaluated value
+
+        __local int totalClipped;
+
+        if (tid == 0)
+            totalClipped = clipped;
+        barrier(CLK_LOCAL_MEM_FENCE);
+
+        // redistribute clipped samples evenly
+
+        int redistBatch = totalClipped / 256;
+        tHistVal += redistBatch;
+
+        int residual = totalClipped - redistBatch * 256;
+        if (tid < residual)
+            ++tHistVal;
+    }
+
+    const int lutVal = calc_lut(smem, tHistVal, tid);
+    uint ires = (uint)convert_int_rte(lutScale * lutVal);
+    lut[(ty * tilesX + tx) * dstStep + tid + dst_offset] =
+        convert_uchar(clamp(ires, (uint)0, (uint)255));
+}
+
+__kernel void transform(__global __const uchar * src, const int srcStep, const int src_offset,
+                        __global uchar * dst, const int dstStep, const int dst_offset,
+                        __global uchar * lut, const int lutStep, int lut_offset,
+                        const int cols, const int rows,
+                        const int2 tileSize,
+                        const int tilesX, const int tilesY)
+{
+    const int x = get_global_id(0);
+    const int y = get_global_id(1);
+
+    if (x >= cols || y >= rows)
+        return;
+
+    const float tyf = (convert_float(y) / tileSize.y) - 0.5f;
+    int ty1 = convert_int_rtn(tyf);
+    int ty2 = ty1 + 1;
+    const float ya = tyf - ty1;
+    ty1 = max(ty1, 0);
+    ty2 = min(ty2, tilesY - 1);
+
+    const float txf = (convert_float(x) / tileSize.x) - 0.5f;
+    int tx1 = convert_int_rtn(txf);
+    int tx2 = tx1 + 1;
+    const float xa = txf - tx1;
+    tx1 = max(tx1, 0);
+    tx2 = min(tx2, tilesX - 1);
+
+    const int srcVal = src[mad24(y, srcStep, x + src_offset)];
+
+    float res = 0;
+
+    res += lut[mad24(ty1 * tilesX + tx1, lutStep, srcVal + lut_offset)] * ((1.0f - xa) * (1.0f - ya));
+    res += lut[mad24(ty1 * tilesX + tx2, lutStep, srcVal + lut_offset)] * ((xa) * (1.0f - ya));
+    res += lut[mad24(ty2 * tilesX + tx1, lutStep, srcVal + lut_offset)] * ((1.0f - xa) * (ya));
+    res += lut[mad24(ty2 * tilesX + tx2, lutStep, srcVal + lut_offset)] * ((xa) * (ya));
+
+    uint ires = (uint)convert_int_rte(res);
+    dst[mad24(y, dstStep, x + dst_offset)] = convert_uchar(clamp(ires, (uint)0, (uint)255));
+}