diff --git a/modules/imgproc/doc/miscellaneous_transformations.rst b/modules/imgproc/doc/miscellaneous_transformations.rst
index 4ebf6d5ee..9fd8df517 100644
--- a/modules/imgproc/doc/miscellaneous_transformations.rst
+++ b/modules/imgproc/doc/miscellaneous_transformations.rst
@@ -116,6 +116,7 @@ If you use ``cvtColor`` with 8-bit images, the conversion will have some informa
 The function can do the following transformations:
 
 *
+    RGB :math:`\leftrightarrow` GRAY ( ``CV_BGR2GRAY, CV_RGB2GRAY, CV_GRAY2BGR, CV_GRAY2RGB``     )
     Transformations within RGB space like adding/removing the alpha channel, reversing the channel order, conversion to/from 16-bit RGB color (R5:G6:B5 or R5:G5:B5), as well as conversion to/from grayscale using:
 
     .. math::
@@ -765,7 +766,7 @@ Runs the GrabCut algorithm.
 
         * **GC_PR_BGD** defines a possible background pixel.
 
-        * **GC_PR_BGD** defines a possible foreground pixel.
+        * **GC_PR_FGD** defines a possible foreground pixel.
 
     :param rect: ROI containing a segmented object. The pixels outside of the ROI are marked as "obvious background". The parameter is only used when  ``mode==GC_INIT_WITH_RECT`` .
 
diff --git a/modules/java/generator/src/java/android+CameraBridgeViewBase.java b/modules/java/generator/src/java/android+CameraBridgeViewBase.java
index 6c5c3294f..b15ae2bd8 100644
--- a/modules/java/generator/src/java/android+CameraBridgeViewBase.java
+++ b/modules/java/generator/src/java/android+CameraBridgeViewBase.java
@@ -80,6 +80,14 @@ public abstract class CameraBridgeViewBase extends SurfaceView implements Surfac
         mMaxHeight = MAX_UNSPECIFIED;
         styledAttrs.recycle();
     }
+    
+    /**
+     * Sets the camera index by storing it in {@code mCameraIndex}.
+     * @param cameraIndex the camera index to store
+     */
+    public void setCameraIndex(int cameraIndex) {
+        this.mCameraIndex = cameraIndex;
+    }
 
     public interface CvCameraViewListener {
         /**
diff --git a/modules/ocl/include/opencv2/ocl/private/util.hpp b/modules/ocl/include/opencv2/ocl/private/util.hpp
index 23a3ad468..634f2f2b1 100644
--- a/modules/ocl/include/opencv2/ocl/private/util.hpp
+++ b/modules/ocl/include/opencv2/ocl/private/util.hpp
@@ -49,7 +49,7 @@
 #include "opencv2/ocl/ocl.hpp"
 
 #if defined __APPLE__
-#include <OpenCL/OpenCL.h>
+#include <OpenCL/opencl.h>
 #else
 #include <CL/opencl.h>
 #endif
diff --git a/modules/ocl/src/filtering.cpp b/modules/ocl/src/filtering.cpp
index cc07209b1..56a70ae53 100644
--- a/modules/ocl/src/filtering.cpp
+++ b/modules/ocl/src/filtering.cpp
@@ -356,8 +356,7 @@ static void GPUDilate(const oclMat &src, oclMat &dst, oclMat &mat_kernel,
     char compile_option[128];
     sprintf(compile_option, "-D RADIUSX=%d -D RADIUSY=%d -D LSIZE0=%d -D LSIZE1=%d -D DILATE %s %s", 
         anchor.x, anchor.y, (int)localThreads[0], (int)localThreads[1], 
-        rectKernel?"-D RECTKERNEL":"",
-        s);
+        s, rectKernel?"-D RECTKERNEL":"");
     vector< pair<size_t, const void *> > args;
     args.push_back(make_pair(sizeof(cl_mem), (void *)&src.data));
     args.push_back(make_pair(sizeof(cl_mem), (void *)&dst.data));
diff --git a/modules/ocl/src/hog.cpp b/modules/ocl/src/hog.cpp
index 9c8f315ec..a3514586f 100644
--- a/modules/ocl/src/hog.cpp
+++ b/modules/ocl/src/hog.cpp
@@ -1578,7 +1578,9 @@ static void openCLExecuteKernel_hog(Context *clCxt , const char **source, string
                                     size_t globalThreads[3], size_t localThreads[3], 
                                     vector< pair<size_t, const void *> > &args)
 {
-    size_t wave_size = queryDeviceInfo<WAVEFRONT_SIZE, size_t>();
+    cl_kernel kernel = openCLGetKernelFromSource(clCxt, source, kernelName);
+    size_t wave_size = queryDeviceInfo<WAVEFRONT_SIZE, size_t>(kernel);
+    openCLSafeCall(clReleaseKernel(kernel));
     if (wave_size <= 16)
     {
         char build_options[64];
diff --git a/modules/ocl/src/mcwutil.cpp b/modules/ocl/src/mcwutil.cpp
index b1f8eebf6..75314fb49 100644
--- a/modules/ocl/src/mcwutil.cpp
+++ b/modules/ocl/src/mcwutil.cpp
@@ -43,9 +43,28 @@
 //
 //M*/
 
-#define CL_USE_DEPRECATED_OPENCL_1_1_APIS
 #include "precomp.hpp"
 
+#ifdef __GNUC__ /* GCC_DIAG_OFF(x) / GCC_DIAG_ON(x): emit "GCC diagnostic" pragmas for warning -W<x> */
+#if ((__GNUC__ * 100) + __GNUC_MINOR__) >= 402 /* GCC 4.2 or newer */
+#define GCC_DIAG_STR(s) #s
+#define GCC_DIAG_JOINSTR(x,y) GCC_DIAG_STR(x ## y)
+# define GCC_DIAG_DO_PRAGMA(x) _Pragma (#x)
+# define GCC_DIAG_PRAGMA(x) GCC_DIAG_DO_PRAGMA(GCC diagnostic x)
+# if ((__GNUC__ * 100) + __GNUC_MINOR__) >= 406 /* GCC 4.6 or newer: OFF pushes state then ignores, ON pops */
+#  define GCC_DIAG_OFF(x) GCC_DIAG_PRAGMA(push) \
+GCC_DIAG_PRAGMA(ignored GCC_DIAG_JOINSTR(-W,x))
+#  define GCC_DIAG_ON(x) GCC_DIAG_PRAGMA(pop)
+# else /* GCC 4.2 - 4.5: no push/pop; ON sets the diagnostic to "warning" instead of restoring the prior state */
+#  define GCC_DIAG_OFF(x) GCC_DIAG_PRAGMA(ignored GCC_DIAG_JOINSTR(-W,x))
+#  define GCC_DIAG_ON(x)  GCC_DIAG_PRAGMA(warning GCC_DIAG_JOINSTR(-W,x))
+# endif /* >= 406 */
+#else /* GCC older than 4.2: both macros expand to nothing */
+# define GCC_DIAG_OFF(x)
+# define GCC_DIAG_ON(x)
+#endif /* >= 402 */
+#endif /* __GNUC__ */
+
 using namespace std;
 
 namespace cv
@@ -121,6 +140,9 @@ namespace cv
                                   build_options, finish_mode);
         }
 
+#ifdef __GNUC__
+        GCC_DIAG_OFF(deprecated-declarations)
+#endif
         cl_mem bindTexture(const oclMat &mat)
         {
             cl_mem texture;
@@ -180,10 +202,6 @@ namespace cv
             else
 #endif
             {
-#ifdef __GNUC__
-#pragma GCC diagnostic push
-#pragma GCC diagnostic ignored "-Wdeprecated-declarations"
-#endif
                 texture = clCreateImage2D(
                     (cl_context)mat.clCxt->oclContext(),
                     CL_MEM_READ_WRITE,
@@ -193,9 +211,6 @@ namespace cv
                     0,
                     NULL,
                     &err);
-#ifdef __GNUC__
-#pragma GCC diagnostic pop
-#endif
             }
             size_t origin[] = { 0, 0, 0 };
             size_t region[] = { mat.cols, mat.rows, 1 };
@@ -225,11 +240,14 @@ namespace cv
             openCLSafeCall(err);
             return texture;
         }
+#ifdef __GNUC__
+        GCC_DIAG_ON(deprecated-declarations)
+#endif
+
         Ptr<TextureCL> bindTexturePtr(const oclMat &mat)
         {
             return Ptr<TextureCL>(new TextureCL(bindTexture(mat), mat.rows, mat.cols, mat.type()));
         }
-
         void releaseTexture(cl_mem& texture)
         {
             openCLFree(texture);
diff --git a/modules/ocl/src/opencl/arithm_add.cl b/modules/ocl/src/opencl/arithm_add.cl
index 7d4b0a765..070ced473 100644
--- a/modules/ocl/src/opencl/arithm_add.cl
+++ b/modules/ocl/src/opencl/arithm_add.cl
@@ -127,7 +127,7 @@ __kernel void arithm_add_D2 (__global ushort *src1, int src1_step, int src1_offs
 #ifdef dst_align
 #undef dst_align
 #endif
-#define dst_align ((dst_offset >> 1) & 3)
+#define dst_align ((dst_offset / 2) & 3)
         int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
         int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
 
@@ -165,7 +165,7 @@ __kernel void arithm_add_D3 (__global short *src1, int src1_step, int src1_offse
 #ifdef dst_align
 #undef dst_align
 #endif
-#define dst_align ((dst_offset >> 1) & 3)
+#define dst_align ((dst_offset / 2) & 3)
         int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
         int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
 
@@ -335,7 +335,7 @@ __kernel void arithm_add_with_mask_C1_D2 (__global ushort *src1, int src1_step,
 #ifdef dst_align
 #undef dst_align
 #endif
-#define dst_align ((dst_offset >> 1) & 1)
+#define dst_align ((dst_offset / 2) & 1)
         int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
         int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
         int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
@@ -375,7 +375,7 @@ __kernel void arithm_add_with_mask_C1_D3 (__global short *src1, int src1_step, i
 #ifdef dst_align
 #undef dst_align
 #endif
-#define dst_align ((dst_offset >> 1) & 1)
+#define dst_align ((dst_offset / 2) & 1)
         int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
         int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
         int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
@@ -507,7 +507,7 @@ __kernel void arithm_add_with_mask_C2_D0 (__global uchar *src1, int src1_step, i
 #ifdef dst_align
 #undef dst_align
 #endif
-#define dst_align ((dst_offset >> 1) & 1)
+#define dst_align ((dst_offset / 2) & 1)
         int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
         int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
         int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
diff --git a/modules/ocl/src/opencl/arithm_add_scalar_mask.cl b/modules/ocl/src/opencl/arithm_add_scalar_mask.cl
index fdf65923c..3dbd376ec 100644
--- a/modules/ocl/src/opencl/arithm_add_scalar_mask.cl
+++ b/modules/ocl/src/opencl/arithm_add_scalar_mask.cl
@@ -126,7 +126,7 @@ __kernel void arithm_s_add_with_mask_C1_D2 (__global   ushort *src1, int src1_st
 #ifdef dst_align
 #undef dst_align
 #endif
-#define dst_align ((dst_offset >> 1) & 1)
+#define dst_align ((dst_offset / 2) & 1)
         int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
         int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
 
@@ -164,7 +164,7 @@ __kernel void arithm_s_add_with_mask_C1_D3 (__global   short *src1, int src1_ste
 #ifdef dst_align
 #undef dst_align
 #endif
-#define dst_align ((dst_offset >> 1) & 1)
+#define dst_align ((dst_offset / 2) & 1)
         int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
         int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
 
@@ -288,7 +288,7 @@ __kernel void arithm_s_add_with_mask_C2_D0 (__global   uchar *src1, int src1_ste
 #ifdef dst_align
 #undef dst_align
 #endif
-#define dst_align ((dst_offset >> 1) & 1)
+#define dst_align ((dst_offset / 2) & 1)
         int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
         int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
 
diff --git a/modules/ocl/src/opencl/arithm_mul.cl b/modules/ocl/src/opencl/arithm_mul.cl
index e1cc9f6ab..40988f5fe 100644
--- a/modules/ocl/src/opencl/arithm_mul.cl
+++ b/modules/ocl/src/opencl/arithm_mul.cl
@@ -277,9 +277,15 @@ __kernel void arithm_mul_D6 (__global double *src1, int src1_step, int src1_offs
 }
 #endif
 
+#ifdef DOUBLE_SUPPORT
+#define SCALAR_TYPE double
+#else
+#define SCALAR_TYPE float
+#endif
+
 __kernel void arithm_muls_D5 (__global float *src1, int src1_step, int src1_offset,
                               __global float *dst,  int dst_step,  int dst_offset,
-                              int rows, int cols, int dst_step1, float scalar)
+                              int rows, int cols, int dst_step1, SCALAR_TYPE scalar)
 {
     int x = get_global_id(0);
     int y = get_global_id(1);
diff --git a/modules/ocl/src/opencl/filtering_morph.cl b/modules/ocl/src/opencl/filtering_morph.cl
index 49640008f..e659a59f5 100644
--- a/modules/ocl/src/opencl/filtering_morph.cl
+++ b/modules/ocl/src/opencl/filtering_morph.cl
@@ -120,7 +120,7 @@ __kernel void morph_C1_D0(__global const uchar * restrict src,
     int gidy = get_global_id(1);
     int out_addr = mad24(gidy,dst_step_in_pixel,gidx+dst_offset_in_pixel);
 
-    if(gidx+3<cols && gidy<rows && (dst_offset_in_pixel&3)==0)
+    if(gidx+3<cols && gidy<rows && ((dst_offset_in_pixel&3)==0))
     {
         *(__global uchar4*)&dst[out_addr] = res;
     }
diff --git a/modules/ocl/src/opencl/imgproc_threshold.cl b/modules/ocl/src/opencl/imgproc_threshold.cl
index 8ad501f7c..9162abb7e 100644
--- a/modules/ocl/src/opencl/imgproc_threshold.cl
+++ b/modules/ocl/src/opencl/imgproc_threshold.cl
@@ -143,7 +143,7 @@ __kernel void threshold_C1_D5(__global const float * restrict src, __global floa
         int4 dpos = (int4)(dstart, dstart+1, dstart+2, dstart+3);
         float4 dVal = *(__global float4*)(dst+dst_offset+gy*dst_step+dstart);
         int4 con = dpos >= 0 && dpos < dst_cols;
-        ddata = convert_float4(con) != 0 ? ddata : dVal;
+        ddata = convert_float4(con) != (float4)(0) ? ddata : dVal;
         if(dstart < dst_cols)
         {
             *(__global float4*)(dst+dst_offset+gy*dst_step+dstart) = ddata;
diff --git a/modules/ocl/src/opencl/pyrlk.cl b/modules/ocl/src/opencl/pyrlk.cl
index 1043b8410..40a199395 100644
--- a/modules/ocl/src/opencl/pyrlk.cl
+++ b/modules/ocl/src/opencl/pyrlk.cl
@@ -46,145 +46,10 @@
 
 //#pragma OPENCL EXTENSION cl_amd_printf : enable
 
-__kernel void calcSharrDeriv_vertical_C1_D0(__global const uchar* src, int srcStep, int rows, int cols, int cn, __global short* dx_buf, int dx_bufStep, __global short* dy_buf, int dy_bufStep)
-{
-    const int x = get_global_id(0);
-    const int y = get_global_id(1);
-
-    if (y < rows && x < cols * cn)
-    {
-        const uchar src_val0 = (src + (y > 0 ? y-1 : rows > 1 ? 1 : 0) * srcStep)[x];
-        const uchar src_val1 = (src + y * srcStep)[x];
-        const uchar src_val2 = (src + (y < rows-1 ? y+1 : rows > 1 ? rows-2 : 0) * srcStep)[x];
-
-        ((__global short*)((__global char*)dx_buf + y * dx_bufStep / 2))[x] = (src_val0 + src_val2) * 3 + src_val1 * 10;
-        ((__global short*)((__global char*)dy_buf + y * dy_bufStep / 2))[x] = src_val2 - src_val0;
-    }
-}
-
-__kernel void calcSharrDeriv_vertical_C4_D0(__global const uchar* src, int srcStep, int rows, int cols, int cn, __global short* dx_buf, int dx_bufStep, __global short* dy_buf, int dy_bufStep)
-{
-    const int x = get_global_id(0);
-    const int y = get_global_id(1);
-
-    if (y < rows && x < cols * cn)
-    {
-        const uchar src_val0 = (src + (y > 0 ? y - 1 : 1) * srcStep)[x];
-        const uchar src_val1 = (src + y * srcStep)[x];
-        const uchar src_val2 = (src + (y < rows - 1 ? y + 1 : rows - 2) * srcStep)[x];
-
-        ((__global short*)((__global char*)dx_buf + y * dx_bufStep / 2))[x] = (src_val0 + src_val2) * 3 + src_val1 * 10;
-        ((__global short*)((__global char*)dy_buf + y * dy_bufStep / 2))[x] = src_val2 - src_val0;
-    }
-}
-
-__kernel void calcSharrDeriv_horizontal_C1_D0(int rows, int cols, int cn, __global const short* dx_buf, int dx_bufStep, __global const short* dy_buf, int dy_bufStep, __global short* dIdx, int dIdxStep, __global short* dIdy, int dIdyStep)
-{
-    const int x = get_global_id(0);
-    const int y = get_global_id(1);
-
-    const int colsn = cols * cn;
-
-    if (y < rows && x < colsn)
-    {
-        __global const short* dx_buf_row = dx_buf + y * dx_bufStep;
-        __global const short* dy_buf_row = dy_buf + y * dy_bufStep;
-
-        const int xr = x + cn < colsn ? x + cn : (cols - 2) * cn + x + cn - colsn;
-        const int xl = x - cn >= 0 ? x - cn : cn + x;
-
-        ((__global short*)((__global char*)dIdx + y * dIdxStep / 2))[x] = dx_buf_row[xr] - dx_buf_row[xl];
-        ((__global short*)((__global char*)dIdy + y * dIdyStep / 2))[x] = (dy_buf_row[xr] + dy_buf_row[xl]) * 3 + dy_buf_row[x] * 10;
-    }
-}
-
-__kernel void calcSharrDeriv_horizontal_C4_D0(int rows, int cols, int cn, __global const short* dx_buf, int dx_bufStep, __global const short* dy_buf, int dy_bufStep, __global short* dIdx, int dIdxStep, __global short* dIdy, int dIdyStep)
-{
-    const int x = get_global_id(0);
-    const int y = get_global_id(1);
-
-    const int colsn = cols * cn;
-
-    if (y < rows && x < colsn)
-    {
-        __global const short* dx_buf_row = dx_buf + y * dx_bufStep;
-        __global const short* dy_buf_row = dy_buf + y * dy_bufStep;
-
-        const int xr = x + cn < colsn ? x + cn : (cols - 2) * cn + x + cn - colsn;
-        const int xl = x - cn >= 0 ? x - cn : cn + x;
-
-        ((__global short*)((__global char*)dIdx + y * dIdxStep / 2))[x] = dx_buf_row[xr] - dx_buf_row[xl];
-        ((__global short*)((__global char*)dIdy + y * dIdyStep / 2))[x] = (dy_buf_row[xr] + dy_buf_row[xl]) * 3 + dy_buf_row[x] * 10;
-    }
-}
-
-#define W_BITS 14
-#define W_BITS1 14
-
-#define  CV_DESCALE(x, n)     (((x) + (1 << ((n)-1))) >> (n))
-
-int linearFilter_uchar(__global const uchar* src, int srcStep, int cn, float2 pt, int x, int y)
-{
-    int2 ipt;
-    ipt.x = convert_int_sat_rtn(pt.x);
-    ipt.y = convert_int_sat_rtn(pt.y);
-
-    float a = pt.x - ipt.x;
-    float b = pt.y - ipt.y;
-
-    int iw00 = convert_int_sat_rte((1.0f - a) * (1.0f - b) * (1 << W_BITS));
-    int iw01 = convert_int_sat_rte(a * (1.0f - b) * (1 << W_BITS));
-    int iw10 = convert_int_sat_rte((1.0f - a) * b * (1 << W_BITS));
-    int iw11 = (1 << W_BITS) - iw00 - iw01 - iw10;
-
-    __global const uchar* src_row = src + (ipt.y + y) * srcStep + ipt.x * cn;
-    __global const uchar* src_row1 = src + (ipt.y + y + 1) * srcStep + ipt.x * cn;
-
-    return CV_DESCALE(src_row[x] * iw00 + src_row[x + cn] * iw01 + src_row1[x] * iw10 + src_row1[x + cn] * iw11, W_BITS1 - 5);
-}
-
-int linearFilter_short(__global const short* src, int srcStep, int cn, float2 pt, int x, int y)
-{
-    int2 ipt;
-    ipt.x = convert_int_sat_rtn(pt.x);
-    ipt.y = convert_int_sat_rtn(pt.y);
-
-    float a = pt.x - ipt.x;
-    float b = pt.y - ipt.y;
-
-    int iw00 = convert_int_sat_rte((1.0f - a) * (1.0f - b) * (1 << W_BITS));
-    int iw01 = convert_int_sat_rte(a * (1.0f - b) * (1 << W_BITS));
-    int iw10 = convert_int_sat_rte((1.0f - a) * b * (1 << W_BITS));
-    int iw11 = (1 << W_BITS) - iw00 - iw01 - iw10;
-
-    __global const short* src_row = src + (ipt.y + y) * srcStep + ipt.x * cn;
-    __global const short* src_row1 = src + (ipt.y + y + 1) * srcStep + ipt.x * cn;
-
-    return CV_DESCALE(src_row[x] * iw00 + src_row[x + cn] * iw01 + src_row1[x] * iw10 + src_row1[x + cn] * iw11, W_BITS1);
-}
-
-float linearFilter_float(__global const float* src, int srcStep, int cn, float2 pt, float x, float y)
-{
-    int2 ipt;
-    ipt.x = convert_int_sat_rtn(pt.x);
-    ipt.y = convert_int_sat_rtn(pt.y);
-
-    float a = pt.x - ipt.x;
-    float b = pt.y - ipt.y;
-
-    float iw00 = ((1.0f - a) * (1.0f - b) * (1 << W_BITS));
-    float iw01 = (a * (1.0f - b) * (1 << W_BITS));
-    float iw10 = ((1.0f - a) * b * (1 << W_BITS));
-    float iw11 = (1 << W_BITS) - iw00 - iw01 - iw10;
-
-    __global const float* src_row = src + (int)(ipt.y + y) * srcStep / 4 + ipt.x * cn;
-    __global const float* src_row1 = src + (int)(ipt.y + y + 1) * srcStep / 4 + ipt.x * cn;
-
-    return src_row[(int)x] * iw00 + src_row[(int)x + cn] * iw01 + src_row1[(int)x] * iw10 + src_row1[(int)x + cn] * iw11, W_BITS1 - 5;
-}
-
 #define	BUFFER	64
-
+#ifndef WAVE_SIZE /* may be predefined externally; otherwise fall back to 1 */
+#define WAVE_SIZE 1
+#endif /* NOTE(review): with the default 1 the non-CPU reduce1/2/3 add barriers only before the +16 and +8 steps; the +4/+2/+1 steps stay unsynchronized - confirm that is safe for every target */
 #ifdef CPU
 void reduce3(float val1, float val2, float val3,  __local float* smem1,  __local float* smem2,  __local float* smem3, int tid)
 {
@@ -193,71 +58,51 @@ void reduce3(float val1, float val2, float val3,  __local float* smem1,  __local
     smem3[tid] = val3;
     barrier(CLK_LOCAL_MEM_FENCE);
 
-#if	BUFFER > 128
-    if (tid < 128)
-    {
-        smem1[tid] = val1 += smem1[tid + 128];
-        smem2[tid] = val2 += smem2[tid + 128];
-        smem3[tid] = val3 += smem3[tid + 128];
-    }
-    barrier(CLK_LOCAL_MEM_FENCE);
-#endif
-
-#if	BUFFER > 64
-    if (tid < 64)
-    {
-        smem1[tid] = val1 += smem1[tid + 64];
-        smem2[tid] = val2 += smem2[tid + 64];
-        smem3[tid] = val3 += smem3[tid + 64];
-    }
-    barrier(CLK_LOCAL_MEM_FENCE);
-#endif
-
     if (tid < 32)
     {
-        smem1[tid] = val1 += smem1[tid + 32];
-        smem2[tid] = val2 += smem2[tid + 32];
-        smem3[tid] = val3 += smem3[tid + 32];
+        smem1[tid] += smem1[tid + 32];
+        smem2[tid] += smem2[tid + 32];
+        smem3[tid] += smem3[tid + 32];
     }
     barrier(CLK_LOCAL_MEM_FENCE);
 
     if (tid < 16)
     {
-        smem1[tid] = val1 += smem1[tid + 16];
-        smem2[tid] = val2 += smem2[tid + 16];
-        smem3[tid] = val3 += smem3[tid + 16];
+        smem1[tid] += smem1[tid + 16];
+        smem2[tid] += smem2[tid + 16];
+        smem3[tid] += smem3[tid + 16];
     }
     barrier(CLK_LOCAL_MEM_FENCE);
 
     if (tid < 8)
     {
-        smem1[tid] = val1 += smem1[tid + 8];
-        smem2[tid] = val2 += smem2[tid + 8];
-        smem3[tid] = val3 += smem3[tid + 8];
+        smem1[tid] += smem1[tid + 8];
+        smem2[tid] += smem2[tid + 8];
+        smem3[tid] += smem3[tid + 8];
     }
     barrier(CLK_LOCAL_MEM_FENCE);
 
     if (tid < 4)
     {
-        smem1[tid] = val1 += smem1[tid + 4];
-        smem2[tid] = val2 += smem2[tid + 4];
-        smem3[tid] = val3 += smem3[tid + 4];
+        smem1[tid] += smem1[tid + 4];
+        smem2[tid] += smem2[tid + 4];
+        smem3[tid] += smem3[tid + 4];
     }
     barrier(CLK_LOCAL_MEM_FENCE);
 
     if (tid < 2)
     {
-        smem1[tid] = val1 += smem1[tid + 2];
-        smem2[tid] = val2 += smem2[tid + 2];
-        smem3[tid] = val3 += smem3[tid + 2];
+        smem1[tid] += smem1[tid + 2];
+        smem2[tid] += smem2[tid + 2];
+        smem3[tid] += smem3[tid + 2];
     }
     barrier(CLK_LOCAL_MEM_FENCE);
 
     if (tid < 1)
     {
-        smem1[BUFFER] = val1 += smem1[tid + 1];
-        smem2[BUFFER] = val2 += smem2[tid + 1];
-        smem3[BUFFER] = val3 += smem3[tid + 1];
+        smem1[BUFFER] = smem1[tid] + smem1[tid + 1];
+        smem2[BUFFER] = smem2[tid] + smem2[tid + 1];
+        smem3[BUFFER] = smem3[tid] + smem3[tid + 1];
     }
     barrier(CLK_LOCAL_MEM_FENCE);
 }
@@ -268,63 +113,45 @@ void reduce2(float val1, float val2, volatile __local float* smem1, volatile __l
     smem2[tid] = val2;
     barrier(CLK_LOCAL_MEM_FENCE);
 
-#if	BUFFER > 128
-    if (tid < 128)
-    {
-        smem1[tid] = (val1 += smem1[tid + 128]);
-        smem2[tid] = (val2 += smem2[tid + 128]);
-    }
-    barrier(CLK_LOCAL_MEM_FENCE);
-#endif
-
-#if	BUFFER > 64
-    if (tid < 64)
-    {
-        smem1[tid] = (val1 += smem1[tid + 64]);
-        smem2[tid] = (val2 += smem2[tid + 64]);
-    }
-    barrier(CLK_LOCAL_MEM_FENCE);
-#endif
-
     if (tid < 32)
     {
-        smem1[tid] = (val1 += smem1[tid + 32]);
-        smem2[tid] = (val2 += smem2[tid + 32]);
+        smem1[tid] += smem1[tid + 32];
+        smem2[tid] += smem2[tid + 32];
     }
     barrier(CLK_LOCAL_MEM_FENCE);
 
     if (tid < 16)
     {
-        smem1[tid] = (val1 += smem1[tid + 16]);
-        smem2[tid] = (val2 += smem2[tid + 16]);
+        smem1[tid] += smem1[tid + 16];
+        smem2[tid] += smem2[tid + 16];
     }
     barrier(CLK_LOCAL_MEM_FENCE);
 
     if (tid < 8)
     {
-        smem1[tid] = (val1 += smem1[tid + 8]);
-        smem2[tid] = (val2 += smem2[tid + 8]);
+        smem1[tid] += smem1[tid + 8];
+        smem2[tid] += smem2[tid + 8];
     }
     barrier(CLK_LOCAL_MEM_FENCE);
 
     if (tid < 4)
     {
-        smem1[tid] = (val1 += smem1[tid + 4]);
-        smem2[tid] = (val2 += smem2[tid + 4]);
+        smem1[tid] += smem1[tid + 4];
+        smem2[tid] += smem2[tid + 4];
     }
     barrier(CLK_LOCAL_MEM_FENCE);
 
     if (tid < 2)
     {
-        smem1[tid] = (val1 += smem1[tid + 2]);
-        smem2[tid] = (val2 += smem2[tid + 2]);
+        smem1[tid] += smem1[tid + 2];
+        smem2[tid] += smem2[tid + 2];
     }
     barrier(CLK_LOCAL_MEM_FENCE);
 
     if (tid < 1)
     {
-        smem1[BUFFER] = (val1 += smem1[tid + 1]);
-        smem2[BUFFER] = (val2 += smem2[tid + 1]);
+        smem1[BUFFER] = smem1[tid] + smem1[tid + 1];
+        smem2[BUFFER] = smem2[tid] + smem2[tid + 1];
     }
     barrier(CLK_LOCAL_MEM_FENCE);
 }
@@ -334,205 +161,146 @@ void reduce1(float val1, volatile __local float* smem1, int tid)
     smem1[tid] = val1;
     barrier(CLK_LOCAL_MEM_FENCE);
 
-#if	BUFFER > 128
-    if (tid < 128)
-    {
-        smem1[tid] = (val1 += smem1[tid + 128]);
-    }
-    barrier(CLK_LOCAL_MEM_FENCE);
-#endif
-
-#if	BUFFER > 64
-    if (tid < 64)
-    {
-        smem1[tid] = (val1 += smem1[tid + 64]);
-    }
-    barrier(CLK_LOCAL_MEM_FENCE);
-#endif
-
     if (tid < 32)
     {
-        smem1[tid] = (val1 += smem1[tid + 32]);
+        smem1[tid] += smem1[tid + 32];
     }
     barrier(CLK_LOCAL_MEM_FENCE);
 
     if (tid < 16)
     {
-        smem1[tid] = (val1 += smem1[tid + 16]);
+        smem1[tid] += smem1[tid + 16];
     }
     barrier(CLK_LOCAL_MEM_FENCE);
 
     if (tid < 8)
     {
-        smem1[tid] = (val1 += smem1[tid + 8]);
+        smem1[tid] += smem1[tid + 8];
     }
     barrier(CLK_LOCAL_MEM_FENCE);
 
     if (tid < 4)
     {
-        smem1[tid] = (val1 += smem1[tid + 4]);
+        smem1[tid] += smem1[tid + 4];
     }
     barrier(CLK_LOCAL_MEM_FENCE);
 
     if (tid < 2)
     {
-        smem1[tid] = (val1 += smem1[tid + 2]);
+        smem1[tid] += smem1[tid + 2];
     }
     barrier(CLK_LOCAL_MEM_FENCE);
 
     if (tid < 1)
     {
-        smem1[BUFFER] = (val1 += smem1[tid + 1]);
+        smem1[BUFFER] = smem1[tid] + smem1[tid + 1];
     }
     barrier(CLK_LOCAL_MEM_FENCE);
 }
 #else
-void reduce3(float val1, float val2, float val3, __local float* smem1, __local float* smem2, __local float* smem3, int tid)
+void reduce3(float val1, float val2, float val3, 
+__local volatile float* smem1, __local volatile float* smem2, __local volatile float* smem3, int tid)
 {
     smem1[tid] = val1;
     smem2[tid] = val2;
     smem3[tid] = val3;
     barrier(CLK_LOCAL_MEM_FENCE);
 
-#if	BUFFER > 128
-    if (tid < 128)
-    {
-        smem1[tid] = val1 += smem1[tid + 128];
-        smem2[tid] = val2 += smem2[tid + 128];
-        smem3[tid] = val3 += smem3[tid + 128];
-    }
-    barrier(CLK_LOCAL_MEM_FENCE);
-#endif
-
-#if	BUFFER > 64
-    if (tid < 64)
-    {
-        smem1[tid] = val1 += smem1[tid + 64];
-        smem2[tid] = val2 += smem2[tid + 64];
-        smem3[tid] = val3 += smem3[tid + 64];
-    }
-    barrier(CLK_LOCAL_MEM_FENCE);
-#endif
-
     if (tid < 32)
     {
-        volatile __local float* vmem1 = smem1;
-        volatile __local float* vmem2 = smem2;
-        volatile __local float* vmem3 = smem3;
+        smem1[tid] += smem1[tid + 32];
+        smem2[tid] += smem2[tid + 32];
+        smem3[tid] += smem3[tid + 32];
+#if WAVE_SIZE < 32
+	} barrier(CLK_LOCAL_MEM_FENCE);
+	if (tid < 16) {
+#endif
+        smem1[tid] += smem1[tid + 16];
+        smem2[tid] += smem2[tid + 16];
+        smem3[tid] += smem3[tid + 16];
+#if WAVE_SIZE <16
+	} barrier(CLK_LOCAL_MEM_FENCE);
+	if (tid < 8) {
+#endif
+        smem1[tid] += smem1[tid + 8];
+        smem2[tid] += smem2[tid + 8];
+        smem3[tid] += smem3[tid + 8];
 
-        vmem1[tid] = val1 += vmem1[tid + 32];
-        vmem2[tid] = val2 += vmem2[tid + 32];
-        vmem3[tid] = val3 += vmem3[tid + 32];
+        smem1[tid] += smem1[tid + 4];
+        smem2[tid] += smem2[tid + 4];
+        smem3[tid] += smem3[tid + 4];
 
-        vmem1[tid] = val1 += vmem1[tid + 16];
-        vmem2[tid] = val2 += vmem2[tid + 16];
-        vmem3[tid] = val3 += vmem3[tid + 16];
+        smem1[tid] += smem1[tid + 2];
+        smem2[tid] += smem2[tid + 2];
+        smem3[tid] += smem3[tid + 2];
 
-        vmem1[tid] = val1 += vmem1[tid + 8];
-        vmem2[tid] = val2 += vmem2[tid + 8];
-        vmem3[tid] = val3 += vmem3[tid + 8];
-
-        vmem1[tid] = val1 += vmem1[tid + 4];
-        vmem2[tid] = val2 += vmem2[tid + 4];
-        vmem3[tid] = val3 += vmem3[tid + 4];
-
-        vmem1[tid] = val1 += vmem1[tid + 2];
-        vmem2[tid] = val2 += vmem2[tid + 2];
-        vmem3[tid] = val3 += vmem3[tid + 2];
-
-        vmem1[tid] = val1 += vmem1[tid + 1];
-        vmem2[tid] = val2 += vmem2[tid + 1];
-        vmem3[tid] = val3 += vmem3[tid + 1];
+        smem1[tid] += smem1[tid + 1];
+        smem2[tid] += smem2[tid + 1];
+        smem3[tid] += smem3[tid + 1];
     }
 }
 
-void reduce2(float val1, float val2, __local float* smem1, __local float* smem2, int tid)
+void reduce2(float val1, float val2, __local volatile float* smem1, __local volatile float* smem2, int tid)
 {
     smem1[tid] = val1;
     smem2[tid] = val2;
     barrier(CLK_LOCAL_MEM_FENCE);
 
-#if	BUFFER > 128
-    if (tid < 128)
-    {
-        smem1[tid] = val1 += smem1[tid + 128];
-        smem2[tid] = val2 += smem2[tid + 128];
-    }
-    barrier(CLK_LOCAL_MEM_FENCE);
-#endif
-
-#if	BUFFER > 64
-    if (tid < 64)
-    {
-        smem1[tid] = val1 += smem1[tid + 64];
-        smem2[tid] = val2 += smem2[tid + 64];
-    }
-    barrier(CLK_LOCAL_MEM_FENCE);
-#endif
-
     if (tid < 32)
     {
-        volatile __local float* vmem1 = smem1;
-        volatile __local float* vmem2 = smem2;
+        smem1[tid] += smem1[tid + 32];
+        smem2[tid] += smem2[tid + 32];
+#if WAVE_SIZE < 32
+	} barrier(CLK_LOCAL_MEM_FENCE);
+	if (tid < 16) {
+#endif
+        smem1[tid] += smem1[tid + 16];
+        smem2[tid] += smem2[tid + 16];
+#if WAVE_SIZE <16
+	} barrier(CLK_LOCAL_MEM_FENCE);
+	if (tid < 8) {
+#endif
+        smem1[tid] += smem1[tid + 8];
+        smem2[tid] += smem2[tid + 8];
 
-        vmem1[tid] = val1 += vmem1[tid + 32];
-        vmem2[tid] = val2 += vmem2[tid + 32];
+        smem1[tid] += smem1[tid + 4];
+        smem2[tid] += smem2[tid + 4];
 
-        vmem1[tid] = val1 += vmem1[tid + 16];
-        vmem2[tid] = val2 += vmem2[tid + 16];
+        smem1[tid] += smem1[tid + 2];
+        smem2[tid] += smem2[tid + 2];
 
-        vmem1[tid] = val1 += vmem1[tid + 8];
-        vmem2[tid] = val2 += vmem2[tid + 8];
-
-        vmem1[tid] = val1 += vmem1[tid + 4];
-        vmem2[tid] = val2 += vmem2[tid + 4];
-
-        vmem1[tid] = val1 += vmem1[tid + 2];
-        vmem2[tid] = val2 += vmem2[tid + 2];
-
-        vmem1[tid] = val1 += vmem1[tid + 1];
-        vmem2[tid] = val2 += vmem2[tid + 1];
+        smem1[tid] += smem1[tid + 1];
+        smem2[tid] += smem2[tid + 1];
     }
 }
 
-void reduce1(float val1, __local float* smem1, int tid)
+void reduce1(float val1, __local volatile float* smem1, int tid)
 {
     smem1[tid] = val1;
     barrier(CLK_LOCAL_MEM_FENCE);
 
-#if	BUFFER > 128
-    if (tid < 128)
-    {
-        smem1[tid] = val1 += smem1[tid + 128];
-    }
-    barrier(CLK_LOCAL_MEM_FENCE);
-#endif
-
-#if	BUFFER > 64
-    if (tid < 64)
-    {
-        smem1[tid] = val1 += smem1[tid + 64];
-    }
-    barrier(CLK_LOCAL_MEM_FENCE);
-#endif
-
     if (tid < 32)
     {
-        volatile __local float* vmem1 = smem1;
-
-        vmem1[tid] = val1 += vmem1[tid + 32];
-        vmem1[tid] = val1 += vmem1[tid + 16];
-        vmem1[tid] = val1 += vmem1[tid + 8];
-        vmem1[tid] = val1 += vmem1[tid + 4];
-        vmem1[tid] = val1 += vmem1[tid + 2];
-        vmem1[tid] = val1 += vmem1[tid + 1];
+        smem1[tid] += smem1[tid + 32];
+#if WAVE_SIZE < 32
+	} barrier(CLK_LOCAL_MEM_FENCE);
+	if (tid < 16) {
+#endif
+        smem1[tid] += smem1[tid + 16];
+#if WAVE_SIZE <16
+	} barrier(CLK_LOCAL_MEM_FENCE);
+	if (tid < 8) {
+#endif
+        smem1[tid] += smem1[tid + 8];
+        smem1[tid] += smem1[tid + 4];
+        smem1[tid] += smem1[tid + 2];
+        smem1[tid] += smem1[tid + 1];
     }
 }
 #endif
 
 #define SCALE (1.0f / (1 << 20))
 #define	THRESHOLD	0.01f
-#define	DIMENSION	21
 
 // Image read mode
 __constant sampler_t sampler    = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_LINEAR;
diff --git a/modules/ocl/src/precomp.hpp b/modules/ocl/src/precomp.hpp
index b2a3e41c6..4f93eac42 100644
--- a/modules/ocl/src/precomp.hpp
+++ b/modules/ocl/src/precomp.hpp
@@ -78,6 +78,7 @@
 
 #if defined (HAVE_OPENCL)
 
+#define CL_USE_DEPRECATED_OPENCL_1_1_APIS
 #include "opencv2/ocl/private/util.hpp"
 #include "safe_call.hpp"
 
diff --git a/modules/ocl/src/pyrlk.cpp b/modules/ocl/src/pyrlk.cpp
index 6de4f9786..a3e65dde3 100644
--- a/modules/ocl/src/pyrlk.cpp
+++ b/modules/ocl/src/pyrlk.cpp
@@ -15,8 +15,8 @@
 // Third party copyrights are property of their respective owners.
 //
 // @Authors
-//		Dachuan Zhao, dachuan@multicorewareinc.com
-//		Yao Wang, bitwangyaoyao@gmail.com
+//      Dachuan Zhao, dachuan@multicorewareinc.com
+//      Yao Wang, bitwangyaoyao@gmail.com
 //      Nathan, liujun@multicorewareinc.com
 //
 // Redistribution and use in source and binary forms, with or without modification,
@@ -56,31 +56,16 @@ namespace cv
 {
 namespace ocl
 {
-///////////////////////////OpenCL kernel strings///////////////////////////
 extern const char *pyrlk;
 extern const char *pyrlk_no_image;
-extern const char *arithm_mul;
 }
 }
-
 struct dim3
 {
     unsigned int x, y, z;
 };
 
-struct float2
-{
-    float x, y;
-};
-
-struct int2
-{
-    int x, y;
-};
-
-namespace
-{
-void calcPatchSize(cv::Size winSize, int cn, dim3 &block, dim3 &patch, bool isDeviceArch11)
+static void calcPatchSize(cv::Size winSize, int cn, dim3 &block, dim3 &patch, bool isDeviceArch11)
 {
     winSize.width *= cn;
 
@@ -100,45 +85,6 @@ void calcPatchSize(cv::Size winSize, int cn, dim3 &block, dim3 &patch, bool isDe
 
     block.z = patch.z = 1;
 }
-}
-
-static void multiply_cus(const oclMat &src1, oclMat &dst, float scalar)
-{
-    if(!src1.clCxt->supportsFeature(Context::CL_DOUBLE) && src1.type() == CV_64F)
-    {
-        CV_Error(CV_GpuNotSupported, "Selected device don't support double\r\n");
-        return;
-    }
-
-    CV_Assert(src1.cols == dst.cols &&
-              src1.rows == dst.rows);
-
-    CV_Assert(src1.type() == dst.type());
-    CV_Assert(src1.depth() != CV_8S);
-
-    Context  *clCxt = src1.clCxt;
-
-    size_t localThreads[3]  = { 16, 16, 1 };
-    size_t globalThreads[3] = { src1.cols,
-                                src1.rows,
-                                1
-                              };
-
-    int dst_step1 = dst.cols * dst.elemSize();
-    vector<pair<size_t , const void *> > args;
-    args.push_back( make_pair( sizeof(cl_mem), (void *)&src1.data ));
-    args.push_back( make_pair( sizeof(cl_int), (void *)&src1.step ));
-    args.push_back( make_pair( sizeof(cl_int), (void *)&src1.offset ));
-    args.push_back( make_pair( sizeof(cl_mem), (void *)&dst.data ));
-    args.push_back( make_pair( sizeof(cl_int), (void *)&dst.step ));
-    args.push_back( make_pair( sizeof(cl_int), (void *)&dst.offset ));
-    args.push_back( make_pair( sizeof(cl_int), (void *)&src1.rows ));
-    args.push_back( make_pair( sizeof(cl_int), (void *)&src1.cols ));
-    args.push_back( make_pair( sizeof(cl_int), (void *)&dst_step1 ));
-    args.push_back( make_pair( sizeof(float), (float *)&scalar ));
-
-    openCLExecuteKernel(clCxt, &arithm_mul, "arithm_muls", globalThreads, localThreads, args, -1, src1.depth());
-}
 
 static void lkSparse_run(oclMat &I, oclMat &J,
                          const oclMat &prevPts, oclMat &nextPts, oclMat &status, oclMat& err, bool /*GET_MIN_EIGENVALS*/, int ptcount,
@@ -151,15 +97,7 @@ static void lkSparse_run(oclMat &I, oclMat &J,
     size_t localThreads[3]  = { 8, isImageSupported ? 8 : 32, 1 };
     size_t globalThreads[3] = { 8 * ptcount, isImageSupported ? 8 : 32, 1};
     int cn = I.oclchannels();
-    char calcErr;
-    if (level == 0)
-    {
-        calcErr = 1;
-    }
-    else
-    {
-        calcErr = 0;
-    }
+    char calcErr = level==0?1:0;
 
     vector<pair<size_t , const void *> > args;
 
@@ -198,7 +136,17 @@ static void lkSparse_run(oclMat &I, oclMat &J,
     {
         if(isImageSupported)
         {
-            openCLExecuteKernel(clCxt, &pyrlk, kernelName, globalThreads, localThreads, args, I.oclchannels(), I.depth());
+            stringstream idxStr;
+            idxStr << kernelName << "_C" << I.oclchannels() << "_D" << I.depth();
+            cl_kernel kernel = openCLGetKernelFromSource(clCxt, &pyrlk, idxStr.str());
+            int wave_size = queryDeviceInfo<WAVEFRONT_SIZE, int>(kernel);
+            openCLSafeCall(clReleaseKernel(kernel));
+            // Hand the queried wavefront size to the kernel build as -D WAVE_SIZE=<n>.
+            static char opt[32] = {0}; // 14-char prefix + up to 11 chars for an int + NUL (16 overflowed for n >= 10)
+            sprintf(opt, " -D WAVE_SIZE=%d", wave_size);
+
+            openCLExecuteKernel(clCxt, &pyrlk, kernelName, globalThreads, localThreads,
+                                args, I.oclchannels(), I.depth(), opt);
             releaseTexture(ITex);
             releaseTexture(JTex);
         }
@@ -241,8 +189,7 @@ void cv::ocl::PyrLKOpticalFlow::sparse(const oclMat &prevImg, const oclMat &next
 
     oclMat temp1 = (useInitialFlow ? nextPts : prevPts).reshape(1);
     oclMat temp2 = nextPts.reshape(1);
-    multiply_cus(temp1, temp2, 1.0f / (1 << maxLevel) / 2.0f);
-    //::multiply(temp1, 1.0f / (1 << maxLevel) / 2.0f, temp2);
+    multiply(1.0f/(1<<maxLevel)/2.0f, temp1, temp2);
 
     ensureSizeIsEnough(1, prevPts.cols, CV_8UC1, status);
     status.setTo(Scalar::all(1));
@@ -257,7 +204,6 @@ void cv::ocl::PyrLKOpticalFlow::sparse(const oclMat &prevImg, const oclMat &next
         ensureSizeIsEnough(1, prevPts.cols, CV_32FC1, *err);
 
     // build the image pyramids.
-
     prevPyr_.resize(maxLevel + 1);
     nextPyr_.resize(maxLevel + 1);
 
@@ -274,7 +220,6 @@ void cv::ocl::PyrLKOpticalFlow::sparse(const oclMat &prevImg, const oclMat &next
     }
 
     // dI/dx ~ Ix, dI/dy ~ Iy
-
     for (int level = maxLevel; level >= 0; level--)
     {
         lkSparse_run(prevPyr_[level], nextPyr_[level],
diff --git a/modules/ocl/src/safe_call.hpp b/modules/ocl/src/safe_call.hpp
index 441495f86..ba36cabd3 100644
--- a/modules/ocl/src/safe_call.hpp
+++ b/modules/ocl/src/safe_call.hpp
@@ -47,7 +47,7 @@
 #define __OPENCV_OPENCL_SAFE_CALL_HPP__
 
 #if defined __APPLE__
-#include <OpenCL/OpenCL.h>
+#include <OpenCL/opencl.h>
 #else
 #include <CL/cl.h>
 #endif
diff --git a/modules/ocl/src/tvl1flow.cpp b/modules/ocl/src/tvl1flow.cpp
index 8182f41a1..a322f62a4 100644
--- a/modules/ocl/src/tvl1flow.cpp
+++ b/modules/ocl/src/tvl1flow.cpp
@@ -472,4 +472,8 @@ void ocl_tvl1flow::warpBackward(const oclMat &I0, const oclMat &I1, oclMat &I1x,
     args.push_back( make_pair( sizeof(cl_int), (void*)&u2_offset_y));
 
     openCLExecuteKernel(clCxt, &tvl1flow, kernelName, globalThread, localThread, args, -1, -1);
+
+    releaseTexture(I1_tex);
+    releaseTexture(I1x_tex);
+    releaseTexture(I1y_tex);
 }
\ No newline at end of file
diff --git a/samples/python2/grabcut.py b/samples/python2/grabcut.py
new file mode 100644
index 000000000..9fc1280ac
--- /dev/null
+++ b/samples/python2/grabcut.py
@@ -0,0 +1,174 @@
+#!/usr/bin/env python
+'''
+===============================================================================
+Interactive Image Segmentation using GrabCut algorithm.
+
+This sample shows interactive image segmentation using grabcut algorithm.
+
+USAGE :
+    python grabcut.py <filename>
+
+README FIRST:    
+    Two windows will show up, one for input and one for output.
+    
+    At first, in input window, draw a rectangle around the object using 
+mouse right button. Then press 'n' to segment the object (once or a few times)
+For any finer touch-ups, you can press any of the keys below and draw lines on 
+the areas you want. Then again press 'n' for updating the output.
+
+Key '0' - To select areas of sure background
+Key '1' - To select areas of sure foreground
+Key '2' - To select areas of probable background
+Key '3' - To select areas of probable foreground
+
+Key 'n' - To update the segmentation
+Key 'r' - To reset the setup
+Key 's' - To save the results
+===============================================================================
+'''
+
+import numpy as np
+import cv2
+import sys
+
+BLUE = [255,0,0]        # rectangle color (channel order here is B,G,R)
+RED = [0,0,255]         # brush color for probable background
+GREEN = [0,255,0]       # brush color for probable foreground
+BLACK = [0,0,0]         # brush color for sure background
+WHITE = [255,255,255]   # brush color for sure foreground
+
+DRAW_BG = {'color' : BLACK, 'val' : 0}      # 'color' is painted on img, 'val' is written into mask
+DRAW_FG = {'color' : WHITE, 'val' : 1}      # sure foreground brush (key '1')
+DRAW_PR_FG = {'color' : GREEN, 'val' : 3}   # probable foreground brush (key '3')
+DRAW_PR_BG = {'color' : RED, 'val' : 2}     # probable background brush (key '2')
+
+# module-level state shared between the mouse callback and the key loop
+rect = (0,0,1,1)        # selection rectangle as (x, y, width, height)
+drawing = False         # True while a touch-up stroke is in progress (left button held)
+rectangle = False       # True while the rectangle is being dragged (right button held)
+rect_over = False       # True once a rectangle has been drawn; touch-ups require it
+rect_or_mask = 100      # 100: no rectangle yet, 0: next 'n' uses GC_INIT_WITH_RECT, 1: GC_INIT_WITH_MASK
+value = DRAW_FG         # currently selected brush; starts as sure foreground
+thickness = 3           # brush radius passed to cv2.circle
+
+def onmouse(event,x,y,flags,param):
+    '''Mouse callback for the 'input' window: right-drag selects the rectangle, left-drag paints touch-ups.'''
+    global img,img2,drawing,value,mask,rectangle,rect,rect_or_mask,ix,iy,rect_over
+    # Draw Rectangle
+    if event == cv2.EVENT_RBUTTONDOWN:
+        rectangle = True
+        ix,iy = x,y
+
+    elif event == cv2.EVENT_MOUSEMOVE:
+        if rectangle == True:
+            img = img2.copy()
+            cv2.rectangle(img,(ix,iy),(x,y),BLUE,2)
+            rect = (min(ix,x),min(iy,y),abs(ix-x),abs(iy-y))   # top-left corner + size for any drag direction
+            rect_or_mask = 0
+
+    elif event == cv2.EVENT_RBUTTONUP:
+        rectangle = False
+        rect_over = True
+        cv2.rectangle(img,(ix,iy),(x,y),BLUE,2)
+        rect = (min(ix,x),min(iy,y),abs(ix-x),abs(iy-y))       # top-left corner + size for any drag direction
+        rect_or_mask = 0
+        print " Now press the key 'n' a few times until no further change \n"
+
+    # draw touchup curves; this is a second, independent if-chain so that a
+    # MOUSEMOVE event is seen by both the rectangle logic above and the brush logic below
+    if event == cv2.EVENT_LBUTTONDOWN:
+        if rect_over == False:
+            print "first draw rectangle \n"
+        else:
+            drawing = True
+            cv2.circle(img,(x,y),thickness,value['color'],-1)
+            cv2.circle(mask,(x,y),thickness,value['val'],-1)
+
+    elif event == cv2.EVENT_MOUSEMOVE:
+        if drawing == True:
+            cv2.circle(img,(x,y),thickness,value['color'],-1)
+            cv2.circle(mask,(x,y),thickness,value['val'],-1)
+
+    elif event == cv2.EVENT_LBUTTONUP:
+        if drawing == True:
+            drawing = False
+            cv2.circle(img,(x,y),thickness,value['color'],-1)
+            cv2.circle(mask,(x,y),thickness,value['val'],-1)
+        
+# print documentation
+print __doc__
+
+# Loading images
+if len(sys.argv) == 2:
+    filename = sys.argv[1] # image path given on the command line
+else:
+    print "No input image given, so loading default image, lena.jpg \n"
+    print "Correct Usage : python grabcut.py <filename> \n"
+    filename = '../cpp/lena.jpg'
+
+img = cv2.imread(filename)
+if img is None:                                 # imread returns None when the file cannot be read
+    sys.exit("Could not read image: " + filename)
+img2 = img.copy()                               # a copy of original image
+mask = np.zeros(img.shape[:2],dtype = np.uint8) # mask initialized to sure BG (0)
+output = np.zeros(img.shape,np.uint8)           # output image to be shown
+
+# input and output windows
+cv2.namedWindow('output')
+cv2.namedWindow('input')
+cv2.setMouseCallback('input',onmouse)
+cv2.moveWindow('input',img.shape[1]+10,90)
+
+print " Instructions : \n"
+print " Draw a rectangle around the object using right mouse button \n"
+
+while(1):
+    cv2.imshow('output',output)
+    cv2.imshow('input',img)
+    k = 0xFF & cv2.waitKey(1)
+    # key bindings
+    if k == 27:         # esc to exit
+        break
+    elif k == ord('0'): # BG drawing
+        print " mark background regions with left mouse button \n"
+        value = DRAW_BG
+    elif k == ord('1'): # FG drawing
+        print " mark foreground regions with left mouse button \n"
+        value = DRAW_FG
+    elif k == ord('2'): # PR_BG drawing
+        value = DRAW_PR_BG
+    elif k == ord('3'): # PR_FG drawing
+        value = DRAW_PR_FG
+    elif k == ord('s'): # save image
+        bar = np.zeros((img.shape[0],5,3),np.uint8)
+        res = np.hstack((img2,bar,img,bar,output))
+        cv2.imwrite('grabcut_output.png',res)
+        print " Result saved as image \n"
+    elif k == ord('r'): # reset everything
+        print "resetting \n"
+        rect = (0,0,1,1)
+        drawing = False
+        rectangle = False
+        rect_or_mask = 100
+        rect_over = False
+        value = DRAW_FG
+        img = img2.copy()
+        mask = np.zeros(img.shape[:2],dtype = np.uint8) # mask initialized to sure BG (0)
+        output = np.zeros(img.shape,np.uint8)           # output image to be shown
+    elif k == ord('n'): # segment the image
+        print """ For finer touchups, mark foreground and background after pressing keys 0-3
+        and again press 'n' \n"""
+        if (rect_or_mask == 0):         # grabcut with rect
+            bgdmodel = np.zeros((1,65),np.float64)
+            fgdmodel = np.zeros((1,65),np.float64)
+            cv2.grabCut(img2,mask,rect,bgdmodel,fgdmodel,1,cv2.GC_INIT_WITH_RECT)
+            rect_or_mask = 1
+        elif rect_or_mask == 1:         # grabcut with mask
+            bgdmodel = np.zeros((1,65),np.float64)
+            fgdmodel = np.zeros((1,65),np.float64)
+            cv2.grabCut(img2,mask,rect,bgdmodel,fgdmodel,1,cv2.GC_INIT_WITH_MASK)
+    # pixels marked sure FG (1) or probable FG (3) are kept in the output
+    mask2 = np.where((mask==1) + (mask==3),255,0).astype('uint8')
+    output = cv2.bitwise_and(img2,img2,mask=mask2)
+
+cv2.destroyAllWindows()