Normalize line endings and whitespace

This commit is contained in:
OpenCV Buildbot
2012-10-17 03:18:30 +04:00
committed by Andrey Kamaev
parent 69020da607
commit 04384a71e4
1516 changed files with 258846 additions and 258162 deletions

View File

@@ -40,123 +40,123 @@
__kernel
void LUT_C1_D0( __global uchar *dst,
__global const uchar *src,
__constant uchar *table,
int rows,
int cols,
int channels,
int whole_rows,
int whole_cols,
int src_offset,
int dst_offset,
int lut_offset,
int src_step,
int dst_step)
__global const uchar *src,
__constant uchar *table,
int rows,
int cols,
int channels,
int whole_rows,
int whole_cols,
int src_offset,
int dst_offset,
int lut_offset,
int src_step,
int dst_step)
{
int gidx = get_global_id(0)<<2;
int gidy = get_global_id(1);
int lidx = get_local_id(0);
int lidy = get_local_id(1);
int gidx = get_global_id(0)<<2;
int gidy = get_global_id(1);
int lidx = get_local_id(0);
int lidy = get_local_id(1);
__local uchar l[256];
l[(lidy<<4)+lidx] = table[(lidy<<4)+lidx+lut_offset];
//mem_fence(CLK_LOCAL_MEM_FENCE);
__local uchar l[256];
l[(lidy<<4)+lidx] = table[(lidy<<4)+lidx+lut_offset];
//mem_fence(CLK_LOCAL_MEM_FENCE);
//clamp(gidx,mask,cols-1);
gidx = gidx >= cols-4?cols-4:gidx;
gidy = gidy >= rows?rows-1:gidy;
//clamp(gidx,mask,cols-1);
gidx = gidx >= cols-4?cols-4:gidx;
gidy = gidy >= rows?rows-1:gidy;
int src_index = src_offset + mad24(gidy,src_step,gidx);
int dst_index = dst_offset + mad24(gidy,dst_step,gidx);
uchar4 p,q;
barrier(CLK_LOCAL_MEM_FENCE);
p.x = src[src_index];
p.y = src[src_index+1];
p.z = src[src_index+2];
p.w = src[src_index+3];
int src_index = src_offset + mad24(gidy,src_step,gidx);
int dst_index = dst_offset + mad24(gidy,dst_step,gidx);
uchar4 p,q;
barrier(CLK_LOCAL_MEM_FENCE);
p.x = src[src_index];
p.y = src[src_index+1];
p.z = src[src_index+2];
p.w = src[src_index+3];
q.x = l[p.x];
q.y = l[p.y];
q.z = l[p.z];
q.w = l[p.w];
*(__global uchar4*)(dst + dst_index) = q;
q.x = l[p.x];
q.y = l[p.y];
q.z = l[p.z];
q.w = l[p.w];
*(__global uchar4*)(dst + dst_index) = q;
}
__kernel
void LUT2_C1_D0( __global uchar *dst,
__global const uchar *src,
__constant uchar *table,
int rows,
int precols,
int channels,
int whole_rows,
int cols,
int src_offset,
int dst_offset,
int lut_offset,
int src_step,
int dst_step)
__global const uchar *src,
__constant uchar *table,
int rows,
int precols,
int channels,
int whole_rows,
int cols,
int src_offset,
int dst_offset,
int lut_offset,
int src_step,
int dst_step)
{
int gidx = get_global_id(0);
int gidy = get_global_id(1);
//int lidx = get_local_id(0);
int lidy = get_local_id(1);
int gidx = get_global_id(0);
int gidy = get_global_id(1);
//int lidx = get_local_id(0);
int lidy = get_local_id(1);
__local uchar l[256];
l[lidy] = table[lidy+lut_offset];
//mem_fence(CLK_LOCAL_MEM_FENCE);
__local uchar l[256];
l[lidy] = table[lidy+lut_offset];
//mem_fence(CLK_LOCAL_MEM_FENCE);
//clamp(gidx,mask,cols-1);
gidx = gidx >= precols ? cols+gidx : gidx;
gidy = gidy >= rows?rows-1:gidy;
//clamp(gidx,mask,cols-1);
gidx = gidx >= precols ? cols+gidx : gidx;
gidy = gidy >= rows?rows-1:gidy;
int src_index = src_offset + mad24(gidy,src_step,gidx);
int dst_index = dst_offset + mad24(gidy,dst_step,gidx);
//uchar4 p,q;
barrier(CLK_LOCAL_MEM_FENCE);
uchar p = src[src_index];
uchar q = l[p];
dst[dst_index] = q;
int src_index = src_offset + mad24(gidy,src_step,gidx);
int dst_index = dst_offset + mad24(gidy,dst_step,gidx);
//uchar4 p,q;
barrier(CLK_LOCAL_MEM_FENCE);
uchar p = src[src_index];
uchar q = l[p];
dst[dst_index] = q;
}
__kernel
void LUT_C4_D0( __global uchar4 *dst,
__global uchar4 *src,
__constant uchar *table,
int rows,
int cols,
int channels,
int whole_rows,
int whole_cols,
int src_offset,
int dst_offset,
int lut_offset,
int src_step,
int dst_step)
__global uchar4 *src,
__constant uchar *table,
int rows,
int cols,
int channels,
int whole_rows,
int whole_cols,
int src_offset,
int dst_offset,
int lut_offset,
int src_step,
int dst_step)
{
int gidx = get_global_id(0);
int gidy = get_global_id(1);
int lidx = get_local_id(0);
int lidy = get_local_id(1);
int src_index = mad24(gidy,src_step,gidx+src_offset);
int dst_index = mad24(gidy,dst_step,gidx+dst_offset);
__local uchar l[256];
l[lidy*16+lidx] = table[lidy*16+lidx+lut_offset];
//mem_fence(CLK_LOCAL_MEM_FENCE);
barrier(CLK_LOCAL_MEM_FENCE);
int gidx = get_global_id(0);
int gidy = get_global_id(1);
if(gidx<cols && gidy<rows)
{
uchar4 p = src[src_index];
uchar4 q;
q.x = l[p.x];
q.y = l[p.y];
q.z = l[p.z];
q.w = l[p.w];
dst[dst_index] = q;
}
int lidx = get_local_id(0);
int lidy = get_local_id(1);
int src_index = mad24(gidy,src_step,gidx+src_offset);
int dst_index = mad24(gidy,dst_step,gidx+dst_offset);
__local uchar l[256];
l[lidy*16+lidx] = table[lidy*16+lidx+lut_offset];
//mem_fence(CLK_LOCAL_MEM_FENCE);
barrier(CLK_LOCAL_MEM_FENCE);
if(gidx<cols && gidy<rows)
{
uchar4 p = src[src_index];
uchar4 q;
q.x = l[p.x];
q.y = l[p.y];
q.z = l[p.z];
q.w = l[p.w];
dst[dst_index] = q;
}
}

View File

@@ -64,28 +64,28 @@ __kernel void arithm_absdiff_D0 (__global uchar *src1, int src1_step, int src1_o
x = x << 2;
#define dst_align (dst_offset & 3)
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
uchar4 src1_data = vload4(0, src1 + src1_index_fix);
uchar4 src2_data = vload4(0, src2 + src2_index_fix);
if(src1_index < 0)
{
uchar4 tmp;
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
}
if(src2_index < 0)
{
uchar4 tmp;
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
}
if(src1_index < 0)
{
uchar4 tmp;
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
}
if(src2_index < 0)
{
uchar4 tmp;
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
}
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
uchar4 tmp_data = abs_diff(src1_data, src2_data);
@@ -112,8 +112,8 @@ __kernel void arithm_absdiff_D2 (__global ushort *src1, int src1_step, int src1_
x = x << 2;
#define dst_align ((dst_offset >> 1) & 3)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
@@ -146,8 +146,8 @@ __kernel void arithm_absdiff_D3 (__global short *src1, int src1_step, int src1_o
x = x << 2;
#define dst_align ((dst_offset >> 1) & 3)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
@@ -250,20 +250,20 @@ __kernel void arithm_s_absdiff_C1_D0 (__global uchar *src1, int src1_step, int
x = x << 2;
#define dst_align (dst_offset & 3)
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
uchar4 src1_data = vload4(0, src1 + src1_index_fix);
int4 src2_data = (int4)(src2.x, src2.x, src2.x, src2.x);
if(src1_index < 0)
{
uchar4 tmp;
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
}
if(src1_index < 0)
{
uchar4 tmp;
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
}
uchar4 data = *((__global uchar4 *)(dst + dst_index));
uchar4 tmp_data = convert_uchar4_sat(abs_diff(convert_int4_sat(src1_data), src2_data));
@@ -289,7 +289,7 @@ __kernel void arithm_s_absdiff_C1_D2 (__global ushort *src1, int src1_step, in
x = x << 1;
#define dst_align ((dst_offset >> 1) & 1)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
@@ -320,7 +320,7 @@ __kernel void arithm_s_absdiff_C1_D3 (__global short *src1, int src1_step, int
x = x << 1;
#define dst_align ((dst_offset >> 1) & 1)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
@@ -423,7 +423,7 @@ __kernel void arithm_s_absdiff_C2_D0 (__global uchar *src1, int src1_step, int
x = x << 1;
#define dst_align ((dst_offset >> 1) & 1)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
@@ -565,7 +565,7 @@ __kernel void arithm_s_absdiff_C3_D0 (__global uchar *src1, int src1_step, int
x = x << 2;
#define dst_align (((dst_offset % dst_step) / 3 ) & 3)
int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));
int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
@@ -575,9 +575,9 @@ __kernel void arithm_s_absdiff_C3_D0 (__global uchar *src1, int src1_step, int
uchar4 src1_data_1 = vload4(0, src1 + src1_index + 4);
uchar4 src1_data_2 = vload4(0, src1 + src1_index + 8);
int4 src2_data_0 = (int4)(src2.x, src2.y, src2.z, src2.x);
int4 src2_data_0 = (int4)(src2.x, src2.y, src2.z, src2.x);
int4 src2_data_1 = (int4)(src2.y, src2.z, src2.x, src2.y);
int4 src2_data_2 = (int4)(src2.z, src2.x, src2.y, src2.z);
int4 src2_data_2 = (int4)(src2.z, src2.x, src2.y, src2.z);
uchar4 data_0 = *((__global uchar4 *)(dst + dst_index + 0));
uchar4 data_1 = *((__global uchar4 *)(dst + dst_index + 4));
@@ -588,17 +588,17 @@ __kernel void arithm_s_absdiff_C3_D0 (__global uchar *src1, int src1_step, int
uchar4 tmp_data_2 = convert_uchar4_sat(abs_diff(convert_int4_sat(src1_data_2), src2_data_2));
data_0.xyz = ((dst_index + 0 >= dst_start)) ? tmp_data_0.xyz : data_0.xyz;
data_0.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
data_0.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
? tmp_data_0.w : data_0.w;
data_1.xy = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
data_1.xy = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
? tmp_data_1.xy : data_1.xy;
data_1.zw = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
data_1.zw = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_1.zw : data_1.zw;
data_2.x = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
data_2.x = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_2.x : data_2.x;
data_2.yzw = ((dst_index + 9 >= dst_start) && (dst_index + 9 < dst_end))
data_2.yzw = ((dst_index + 9 >= dst_start) && (dst_index + 9 < dst_end))
? tmp_data_2.yzw : data_2.yzw;
*((__global uchar4 *)(dst + dst_index + 0)) = data_0;
@@ -619,7 +619,7 @@ __kernel void arithm_s_absdiff_C3_D2 (__global ushort *src1, int src1_step, in
x = x << 1;
#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
@@ -643,12 +643,12 @@ __kernel void arithm_s_absdiff_C3_D2 (__global ushort *src1, int src1_step, in
data_0.xy = ((dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;
data_1.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
data_1.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
? tmp_data_1.x : data_1.x;
data_1.y = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
data_1.y = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_1.y : data_1.y;
data_2.xy = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
data_2.xy = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_2.xy : data_2.xy;
*((__global ushort2 *)((__global char *)dst + dst_index + 0))= data_0;
@@ -669,7 +669,7 @@ __kernel void arithm_s_absdiff_C3_D3 (__global short *src1, int src1_step, int
x = x << 1;
#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
@@ -693,12 +693,12 @@ __kernel void arithm_s_absdiff_C3_D3 (__global short *src1, int src1_step, int
data_0.xy = ((dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;
data_1.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
data_1.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
? tmp_data_1.x : data_1.x;
data_1.y = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
data_1.y = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_1.y : data_1.y;
data_2.xy = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
data_2.xy = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_2.xy : data_2.xy;
*((__global short2 *)((__global char *)dst + dst_index + 0))= data_0;
@@ -716,7 +716,7 @@ __kernel void arithm_s_absdiff_C3_D4 (__global int *src1, int src1_step, int s
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x * 12) + src1_offset);
int src1_index = mad24(y, src1_step, (x * 12) + src1_offset);
int dst_index = mad24(y, dst_step, dst_offset + (x * 12));
int src1_data_0 = *((__global int *)((__global char *)src1 + src1_index + 0));
@@ -750,13 +750,13 @@ __kernel void arithm_s_absdiff_C3_D5 (__global float *src1, int src1_step, int
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x * 12) + src1_offset);
int src1_index = mad24(y, src1_step, (x * 12) + src1_offset);
int dst_index = mad24(y, dst_step, dst_offset + (x * 12));
float src1_data_0 = *((__global float *)((__global char *)src1 + src1_index + 0));
float src1_data_1 = *((__global float *)((__global char *)src1 + src1_index + 4));
float src1_data_2 = *((__global float *)((__global char *)src1 + src1_index + 8));
float src2_data_0 = src2.x;
float src2_data_1 = src2.y;
float src2_data_2 = src2.z;
@@ -786,13 +786,13 @@ __kernel void arithm_s_absdiff_C3_D6 (__global double *src1, int src1_step, in
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x * 24) + src1_offset);
int src1_index = mad24(y, src1_step, (x * 24) + src1_offset);
int dst_index = mad24(y, dst_step, dst_offset + (x * 24));
double src1_data_0 = *((__global double *)((__global char *)src1 + src1_index + 0 ));
double src1_data_1 = *((__global double *)((__global char *)src1 + src1_index + 8 ));
double src1_data_2 = *((__global double *)((__global char *)src1 + src1_index + 16));
double src2_data_0 = src2.x;
double src2_data_1 = src2.y;
double src2_data_2 = src2.z;

View File

@@ -65,28 +65,28 @@ __kernel void arithm_add_D0 (__global uchar *src1, int src1_step, int src1_offse
x = x << 2;
#define dst_align (dst_offset & 3)
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
uchar4 src1_data = vload4(0, src1 + src1_index_fix);
uchar4 src2_data = vload4(0, src2 + src2_index_fix);
if(src1_index < 0)
{
uchar4 tmp;
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
}
if(src2_index < 0)
{
uchar4 tmp;
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
}
if(src1_index < 0)
{
uchar4 tmp;
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
}
if(src2_index < 0)
{
uchar4 tmp;
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
}
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
short4 tmp = convert_short4_sat(src1_data) + convert_short4_sat(src2_data);
uchar4 tmp_data = convert_uchar4_sat(tmp);
@@ -113,8 +113,8 @@ __kernel void arithm_add_D2 (__global ushort *src1, int src1_step, int src1_offs
x = x << 2;
#define dst_align ((dst_offset >> 1) & 3)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
@@ -148,8 +148,8 @@ __kernel void arithm_add_D3 (__global short *src1, int src1_step, int src1_offse
x = x << 2;
#define dst_align ((dst_offset >> 1) & 3)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
@@ -253,38 +253,38 @@ __kernel void arithm_add_with_mask_C1_D0 (__global uchar *src1, int src1_step, i
x = x << 2;
#define dst_align (dst_offset & 3)
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
int mask_index_fix = mask_index < 0 ? 0 : mask_index;
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
int mask_index_fix = mask_index < 0 ? 0 : mask_index;
uchar4 src1_data = vload4(0, src1 + src1_index_fix);
uchar4 src2_data = vload4(0, src2 + src2_index_fix);
uchar4 mask_data = vload4(0, mask + mask_index_fix);
if(src1_index < 0)
{
uchar4 tmp;
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
}
if(src2_index < 0)
{
uchar4 tmp;
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
}
if(mask_index < 0)
{
uchar4 tmp;
tmp.xyzw = (mask_index == -2) ? mask_data.zwxy:mask_data.yzwx;
mask_data.xyzw = (mask_index == -1) ? mask_data.wxyz:tmp.xyzw;
}
uchar4 mask_data = vload4(0, mask + mask_index_fix);
if(src1_index < 0)
{
uchar4 tmp;
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
}
if(src2_index < 0)
{
uchar4 tmp;
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
}
if(mask_index < 0)
{
uchar4 tmp;
tmp.xyzw = (mask_index == -2) ? mask_data.zwxy:mask_data.yzwx;
mask_data.xyzw = (mask_index == -1) ? mask_data.wxyz:tmp.xyzw;
}
uchar4 data = *((__global uchar4 *)(dst + dst_index));
short4 tmp = convert_short4_sat(src1_data) + convert_short4_sat(src2_data);
uchar4 tmp_data = convert_uchar4_sat(tmp);
@@ -312,8 +312,8 @@ __kernel void arithm_add_with_mask_C1_D2 (__global ushort *src1, int src1_step,
x = x << 1;
#define dst_align ((dst_offset >> 1) & 1)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
@@ -349,8 +349,8 @@ __kernel void arithm_add_with_mask_C1_D3 (__global short *src1, int src1_step, i
x = x << 1;
#define dst_align ((dst_offset >> 1) & 1)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
@@ -395,7 +395,7 @@ __kernel void arithm_add_with_mask_C1_D4 (__global int *src1, int src1_step, i
int dst_data = *((__global int *)((__global char *)dst + dst_index));
int data = convert_int_sat((long)src_data1 + (long)src_data2);
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;
*((__global int *)((__global char *)dst + dst_index)) = data;
}
@@ -425,7 +425,7 @@ __kernel void arithm_add_with_mask_C1_D5 (__global float *src1, int src1_step, i
float dst_data = *((__global float *)((__global char *)dst + dst_index));
float data = src_data1 + src_data2;
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;
*((__global float *)((__global char *)dst + dst_index)) = data;
}
@@ -456,7 +456,7 @@ __kernel void arithm_add_with_mask_C1_D6 (__global double *src1, int src1_step,
double dst_data = *((__global double *)((__global char *)dst + dst_index));
double data = src_data1 + src_data2;
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;
*((__global double *)((__global char *)dst + dst_index)) = data;
}
@@ -478,8 +478,8 @@ __kernel void arithm_add_with_mask_C2_D0 (__global uchar *src1, int src1_step, i
x = x << 1;
#define dst_align ((dst_offset >> 1) & 1)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
@@ -525,7 +525,7 @@ __kernel void arithm_add_with_mask_C2_D2 (__global ushort *src1, int src1_step,
int2 tmp = convert_int2_sat(src_data1) + convert_int2_sat(src_data2);
ushort2 data = convert_ushort2_sat(tmp);
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;
*((__global ushort2 *)((__global char *)dst + dst_index)) = data;
}
@@ -555,7 +555,7 @@ __kernel void arithm_add_with_mask_C2_D3 (__global short *src1, int src1_step, i
int2 tmp = convert_int2_sat(src_data1) + convert_int2_sat(src_data2);
short2 data = convert_short2_sat(tmp);
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;
*((__global short2 *)((__global char *)dst + dst_index)) = data;
}
@@ -584,7 +584,7 @@ __kernel void arithm_add_with_mask_C2_D4 (__global int *src1, int src1_step, i
int2 dst_data = *((__global int2 *)((__global char *)dst + dst_index));
int2 data = convert_int2_sat(convert_long2_sat(src_data1) + convert_long2_sat(src_data2));
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;
*((__global int2 *)((__global char *)dst + dst_index)) = data;
}
@@ -613,7 +613,7 @@ __kernel void arithm_add_with_mask_C2_D5 (__global float *src1, int src1_step, i
float2 dst_data = *((__global float2 *)((__global char *)dst + dst_index));
float2 data = src_data1 + src_data2;
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;
*((__global float2 *)((__global char *)dst + dst_index)) = data;
}
@@ -644,7 +644,7 @@ __kernel void arithm_add_with_mask_C2_D6 (__global double *src1, int src1_step,
double2 dst_data = *((__global double2 *)((__global char *)dst + dst_index));
double2 data = src_data1 + src_data2;
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;
*((__global double2 *)((__global char *)dst + dst_index)) = data;
}
@@ -665,8 +665,8 @@ __kernel void arithm_add_with_mask_C3_D0 (__global uchar *src1, int src1_step, i
x = x << 2;
#define dst_align (((dst_offset % dst_step) / 3 ) & 3)
int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));
int src2_index = mad24(y, src2_step, (x * 3) + src2_offset - (dst_align * 3));
int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));
int src2_index = mad24(y, src2_step, (x * 3) + src2_offset - (dst_align * 3));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
@@ -692,17 +692,17 @@ __kernel void arithm_add_with_mask_C3_D0 (__global uchar *src1, int src1_step, i
uchar4 tmp_data_2 = convert_uchar4_sat(convert_short4_sat(src1_data_2) + convert_short4_sat(src2_data_2));
data_0.xyz = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xyz : data_0.xyz;
data_0.w = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
data_0.w = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
? tmp_data_0.w : data_0.w;
data_1.xy = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
data_1.xy = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
? tmp_data_1.xy : data_1.xy;
data_1.zw = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
data_1.zw = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_1.zw : data_1.zw;
data_2.x = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
data_2.x = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_2.x : data_2.x;
data_2.yzw = ((mask_data.w) && (dst_index + 9 >= dst_start) && (dst_index + 9 < dst_end))
data_2.yzw = ((mask_data.w) && (dst_index + 9 >= dst_start) && (dst_index + 9 < dst_end))
? tmp_data_2.yzw : data_2.yzw;
*((__global uchar4 *)(dst + dst_index + 0)) = data_0;
@@ -725,8 +725,8 @@ __kernel void arithm_add_with_mask_C3_D2 (__global ushort *src1, int src1_step,
x = x << 1;
#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
int src2_index = mad24(y, src2_step, (x * 6) + src2_offset - (dst_align * 6));
int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
int src2_index = mad24(y, src2_step, (x * 6) + src2_offset - (dst_align * 6));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
@@ -753,12 +753,12 @@ __kernel void arithm_add_with_mask_C3_D2 (__global ushort *src1, int src1_step,
data_0.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;
data_1.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
data_1.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
? tmp_data_1.x : data_1.x;
data_1.y = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
data_1.y = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_1.y : data_1.y;
data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_2.xy : data_2.xy;
*((__global ushort2 *)((__global char *)dst + dst_index + 0))= data_0;
@@ -781,8 +781,8 @@ __kernel void arithm_add_with_mask_C3_D3 (__global short *src1, int src1_step, i
x = x << 1;
#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
int src2_index = mad24(y, src2_step, (x * 6) + src2_offset - (dst_align * 6));
int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
int src2_index = mad24(y, src2_step, (x * 6) + src2_offset - (dst_align * 6));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
@@ -809,12 +809,12 @@ __kernel void arithm_add_with_mask_C3_D3 (__global short *src1, int src1_step, i
data_0.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;
data_1.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
data_1.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
? tmp_data_1.x : data_1.x;
data_1.y = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
data_1.y = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_1.y : data_1.y;
data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_2.xy : data_2.xy;
*((__global short2 *)((__global char *)dst + dst_index + 0))= data_0;
@@ -834,8 +834,8 @@ __kernel void arithm_add_with_mask_C3_D4 (__global int *src1, int src1_step, i
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x * 12) + src1_offset);
int src2_index = mad24(y, src2_step, (x * 12) + src2_offset);
int src1_index = mad24(y, src1_step, (x * 12) + src1_offset);
int src2_index = mad24(y, src2_step, (x * 12) + src2_offset);
int mask_index = mad24(y, mask_step, x + mask_offset);
int dst_index = mad24(y, dst_step, dst_offset + (x * 12));
@@ -878,15 +878,15 @@ __kernel void arithm_add_with_mask_C3_D5 (__global float *src1, int src1_step, i
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x * 12) + src1_offset);
int src2_index = mad24(y, src2_step, (x * 12) + src2_offset);
int src1_index = mad24(y, src1_step, (x * 12) + src1_offset);
int src2_index = mad24(y, src2_step, (x * 12) + src2_offset);
int mask_index = mad24(y, mask_step, x + mask_offset);
int dst_index = mad24(y, dst_step, dst_offset + (x * 12));
float src1_data_0 = *((__global float *)((__global char *)src1 + src1_index + 0));
float src1_data_1 = *((__global float *)((__global char *)src1 + src1_index + 4));
float src1_data_2 = *((__global float *)((__global char *)src1 + src1_index + 8));
float src2_data_0 = *((__global float *)((__global char *)src2 + src2_index + 0));
float src2_data_1 = *((__global float *)((__global char *)src2 + src2_index + 4));
float src2_data_2 = *((__global float *)((__global char *)src2 + src2_index + 8));
@@ -924,15 +924,15 @@ __kernel void arithm_add_with_mask_C3_D6 (__global double *src1, int src1_step,
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x * 24) + src1_offset);
int src2_index = mad24(y, src2_step, (x * 24) + src2_offset);
int src1_index = mad24(y, src1_step, (x * 24) + src1_offset);
int src2_index = mad24(y, src2_step, (x * 24) + src2_offset);
int mask_index = mad24(y, mask_step, x + mask_offset);
int dst_index = mad24(y, dst_step, dst_offset + (x * 24));
double src1_data_0 = *((__global double *)((__global char *)src1 + src1_index + 0 ));
double src1_data_1 = *((__global double *)((__global char *)src1 + src1_index + 8 ));
double src1_data_2 = *((__global double *)((__global char *)src1 + src1_index + 16));
double src2_data_0 = *((__global double *)((__global char *)src2 + src2_index + 0 ));
double src2_data_1 = *((__global double *)((__global char *)src2 + src2_index + 8 ));
double src2_data_2 = *((__global double *)((__global char *)src2 + src2_index + 16));
@@ -981,7 +981,7 @@ __kernel void arithm_add_with_mask_C4_D0 (__global uchar *src1, int src1_step, i
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
uchar4 data = convert_uchar4_sat(convert_ushort4_sat(src_data1) + convert_ushort4_sat(src_data2));
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;
*((__global uchar4 *)(dst + dst_index)) = data;
}
@@ -1010,7 +1010,7 @@ __kernel void arithm_add_with_mask_C4_D2 (__global ushort *src1, int src1_step,
ushort4 dst_data = *((__global ushort4 *)((__global char *)dst + dst_index));
ushort4 data = convert_ushort4_sat(convert_int4_sat(src_data1) + convert_int4_sat(src_data2));
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;
*((__global ushort4 *)((__global char *)dst + dst_index)) = data;
}
@@ -1039,7 +1039,7 @@ __kernel void arithm_add_with_mask_C4_D3 (__global short *src1, int src1_step, i
short4 dst_data = *((__global short4 *)((__global char *)dst + dst_index));
short4 data = convert_short4_sat(convert_int4_sat(src_data1) + convert_int4_sat(src_data2));
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;
*((__global short4 *)((__global char *)dst + dst_index)) = data;
}
@@ -1068,7 +1068,7 @@ __kernel void arithm_add_with_mask_C4_D4 (__global int *src1, int src1_step, i
int4 dst_data = *((__global int4 *)((__global char *)dst + dst_index));
int4 data = convert_int4_sat(convert_long4_sat(src_data1) + convert_long4_sat(src_data2));
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;
*((__global int4 *)((__global char *)dst + dst_index)) = data;
}
@@ -1097,7 +1097,7 @@ __kernel void arithm_add_with_mask_C4_D5 (__global float *src1, int src1_step, i
float4 dst_data = *((__global float4 *)((__global char *)dst + dst_index));
float4 data = src_data1 + src_data2;
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;
*((__global float4 *)((__global char *)dst + dst_index)) = data;
}
@@ -1128,7 +1128,7 @@ __kernel void arithm_add_with_mask_C4_D6 (__global double *src1, int src1_step,
double4 dst_data = *((__global double4 *)((__global char *)dst + dst_index));
double4 data = src_data1 + src_data2;
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;
*((__global double4 *)((__global char *)dst + dst_index)) = data;
}

View File

@@ -61,30 +61,30 @@ __kernel void addWeighted_D0 (__global uchar *src1,int src1_step,int src1_offset
int y = get_global_id(1);
if (x < cols && y < rows)
{
x = x << 2;
#define dst_align (dst_offset & 3)
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
uchar4 src1_data ,src2_data;
uchar4 src1_data ,src2_data;
src1_data.x= src1_index+0 >= 0 ? src1[src1_index+0] : 0;
src1_data.y= src1_index+1 >= 0 ? src1[src1_index+1] : 0;
src1_data.z= src1_index+2 >= 0 ? src1[src1_index+2] : 0;
src1_data.w= src1_index+3 >= 0 ? src1[src1_index+3] : 0;
src1_data.x= src1_index+0 >= 0 ? src1[src1_index+0] : 0;
src1_data.y= src1_index+1 >= 0 ? src1[src1_index+1] : 0;
src1_data.z= src1_index+2 >= 0 ? src1[src1_index+2] : 0;
src1_data.w= src1_index+3 >= 0 ? src1[src1_index+3] : 0;
src2_data.x= src2_index+0 >= 0 ? src2[src2_index+0] : 0;
src2_data.y= src2_index+1 >= 0 ? src2[src2_index+1] : 0;
src2_data.z= src2_index+2 >= 0 ? src2[src2_index+2] : 0;
src2_data.w= src2_index+3 >= 0 ? src2[src2_index+3] : 0;
src2_data.x= src2_index+0 >= 0 ? src2[src2_index+0] : 0;
src2_data.y= src2_index+1 >= 0 ? src2[src2_index+1] : 0;
src2_data.z= src2_index+2 >= 0 ? src2[src2_index+2] : 0;
src2_data.w= src2_index+3 >= 0 ? src2[src2_index+3] : 0;
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
// short4 tmp = convert_short4_sat(src1_data) * alpha + convert_short4_sat(src2_data) * beta + gama;
@@ -118,14 +118,14 @@ __kernel void addWeighted_D2 (__global ushort *src1, int src1_step,int src1_offs
int y = get_global_id(1);
if (x < cols && y < rows)
{
x = x << 2;
#define dst_align ((dst_offset >> 1) & 3)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset -( dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset -( dst_align << 1));
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset -( dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset -( dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
@@ -164,14 +164,14 @@ __kernel void addWeighted_D3 (__global short *src1, int src1_step,int src1_offse
int y = get_global_id(1);
if (x < cols && y < rows)
{
x = x << 2;
#define dst_align ((dst_offset >> 1) & 3)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset -( dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset -( dst_align << 1));
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset -( dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset -( dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
@@ -209,18 +209,18 @@ __kernel void addWeighted_D4 (__global int *src1, int src1_step,int src1_offset,
int y = get_global_id(1);
if (x < cols && y < rows)
{
x = x << 2;
#define bitOfInt (sizeof(int)== 4 ? 2: 3)
#define dst_align ((dst_offset >> bitOfInt) & 3)
int src1_index = mad24(y, src1_step, (x << bitOfInt) + src1_offset - (dst_align << bitOfInt));
int src2_index = mad24(y, src2_step, (x << bitOfInt) + src2_offset - (dst_align << bitOfInt));
int src1_index = mad24(y, src1_step, (x << bitOfInt) + src1_offset - (dst_align << bitOfInt));
int src2_index = mad24(y, src2_step, (x << bitOfInt) + src2_offset - (dst_align << bitOfInt));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + (x << bitOfInt) -(dst_align << bitOfInt));
@@ -257,16 +257,16 @@ __kernel void addWeighted_D5 (__global float *src1,int src1_step,int src1_offset
int y = get_global_id(1);
if (x < cols && y < rows)
{
x = x << 2;
#define dst_align ((dst_offset >> 2) & 3)
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + (x << 2) -(dst_align << 2));
@@ -305,16 +305,16 @@ __kernel void addWeighted_D6 (__global double *src1, int src1_step,int src1_offs
int y = get_global_id(1);
if (x < cols && y < rows)
{
x = x << 2;
#define dst_align ((dst_offset >> 3) & 3)
int src1_index = mad24(y, src1_step, (x << 3) + src1_offset - (dst_align << 3));
int src2_index = mad24(y, src2_step, (x << 3) + src2_offset - (dst_align << 3));
int src1_index = mad24(y, src1_step, (x << 3) + src1_offset - (dst_align << 3));
int src2_index = mad24(y, src2_step, (x << 3) + src2_offset - (dst_align << 3));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + (x << 3) -(dst_align << 3));

View File

@@ -60,21 +60,21 @@ __kernel void arithm_s_add_C1_D0 (__global uchar *src1, int src1_step, int src
x = x << 2;
#define dst_align (dst_offset & 3)
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
uchar4 src1_data = vload4(0, src1 + src1_index_fix);
int4 src2_data = (int4)(src2.x, src2.x, src2.x, src2.x);
if(src1_index < 0)
{
uchar4 tmp;
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
}
if(src1_index < 0)
{
uchar4 tmp;
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
}
uchar4 data = *((__global uchar4 *)(dst + dst_index));
int4 tmp = convert_int4_sat(src1_data) + src2_data;
uchar4 tmp_data = convert_uchar4_sat(tmp);
@@ -100,7 +100,7 @@ __kernel void arithm_s_add_C1_D2 (__global ushort *src1, int src1_step, int sr
x = x << 1;
#define dst_align ((dst_offset >> 1) & 1)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
@@ -132,7 +132,7 @@ __kernel void arithm_s_add_C1_D3 (__global short *src1, int src1_step, int src
x = x << 1;
#define dst_align ((dst_offset >> 1) & 1)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
@@ -234,7 +234,7 @@ __kernel void arithm_s_add_C2_D0 (__global uchar *src1, int src1_step, int src
x = x << 1;
#define dst_align ((dst_offset >> 1) & 1)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
@@ -379,7 +379,7 @@ __kernel void arithm_s_add_C3_D0 (__global uchar *src1, int src1_step, int src
x = x << 2;
#define dst_align (((dst_offset % dst_step) / 3 ) & 3)
int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));
int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
@@ -389,9 +389,9 @@ __kernel void arithm_s_add_C3_D0 (__global uchar *src1, int src1_step, int src
uchar4 src1_data_1 = vload4(0, src1 + src1_index + 4);
uchar4 src1_data_2 = vload4(0, src1 + src1_index + 8);
int4 src2_data_0 = (int4)(src2.x, src2.y, src2.z, src2.x);
int4 src2_data_0 = (int4)(src2.x, src2.y, src2.z, src2.x);
int4 src2_data_1 = (int4)(src2.y, src2.z, src2.x, src2.y);
int4 src2_data_2 = (int4)(src2.z, src2.x, src2.y, src2.z);
int4 src2_data_2 = (int4)(src2.z, src2.x, src2.y, src2.z);
uchar4 data_0 = *((__global uchar4 *)(dst + dst_index + 0));
uchar4 data_1 = *((__global uchar4 *)(dst + dst_index + 4));
@@ -402,17 +402,17 @@ __kernel void arithm_s_add_C3_D0 (__global uchar *src1, int src1_step, int src
uchar4 tmp_data_2 = convert_uchar4_sat(convert_int4_sat(src1_data_2) + src2_data_2);
data_0.xyz = ((dst_index + 0 >= dst_start)) ? tmp_data_0.xyz : data_0.xyz;
data_0.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
data_0.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
? tmp_data_0.w : data_0.w;
data_1.xy = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
data_1.xy = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
? tmp_data_1.xy : data_1.xy;
data_1.zw = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
data_1.zw = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_1.zw : data_1.zw;
data_2.x = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
data_2.x = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_2.x : data_2.x;
data_2.yzw = ((dst_index + 9 >= dst_start) && (dst_index + 9 < dst_end))
data_2.yzw = ((dst_index + 9 >= dst_start) && (dst_index + 9 < dst_end))
? tmp_data_2.yzw : data_2.yzw;
*((__global uchar4 *)(dst + dst_index + 0)) = data_0;
@@ -433,7 +433,7 @@ __kernel void arithm_s_add_C3_D2 (__global ushort *src1, int src1_step, int sr
x = x << 1;
#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
@@ -457,12 +457,12 @@ __kernel void arithm_s_add_C3_D2 (__global ushort *src1, int src1_step, int sr
data_0.xy = ((dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;
data_1.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
data_1.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
? tmp_data_1.x : data_1.x;
data_1.y = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
data_1.y = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_1.y : data_1.y;
data_2.xy = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
data_2.xy = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_2.xy : data_2.xy;
*((__global ushort2 *)((__global char *)dst + dst_index + 0))= data_0;
@@ -483,7 +483,7 @@ __kernel void arithm_s_add_C3_D3 (__global short *src1, int src1_step, int src
x = x << 1;
#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
@@ -507,12 +507,12 @@ __kernel void arithm_s_add_C3_D3 (__global short *src1, int src1_step, int src
data_0.xy = ((dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;
data_1.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
data_1.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
? tmp_data_1.x : data_1.x;
data_1.y = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
data_1.y = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_1.y : data_1.y;
data_2.xy = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
data_2.xy = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_2.xy : data_2.xy;
*((__global short2 *)((__global char *)dst + dst_index + 0))= data_0;
@@ -530,7 +530,7 @@ __kernel void arithm_s_add_C3_D4 (__global int *src1, int src1_step, int src1_
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x * 12) + src1_offset);
int src1_index = mad24(y, src1_step, (x * 12) + src1_offset);
int dst_index = mad24(y, dst_step, dst_offset + (x * 12));
int src1_data_0 = *((__global int *)((__global char *)src1 + src1_index + 0));
@@ -564,13 +564,13 @@ __kernel void arithm_s_add_C3_D5 (__global float *src1, int src1_step, int src
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x * 12) + src1_offset);
int src1_index = mad24(y, src1_step, (x * 12) + src1_offset);
int dst_index = mad24(y, dst_step, dst_offset + (x * 12));
float src1_data_0 = *((__global float *)((__global char *)src1 + src1_index + 0));
float src1_data_1 = *((__global float *)((__global char *)src1 + src1_index + 4));
float src1_data_2 = *((__global float *)((__global char *)src1 + src1_index + 8));
float src2_data_0 = src2.x;
float src2_data_1 = src2.y;
float src2_data_2 = src2.z;
@@ -600,13 +600,13 @@ __kernel void arithm_s_add_C3_D6 (__global double *src1, int src1_step, int sr
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x * 24) + src1_offset);
int src1_index = mad24(y, src1_step, (x * 24) + src1_offset);
int dst_index = mad24(y, dst_step, dst_offset + (x * 24));
double src1_data_0 = *((__global double *)((__global char *)src1 + src1_index + 0 ));
double src1_data_1 = *((__global double *)((__global char *)src1 + src1_index + 8 ));
double src1_data_2 = *((__global double *)((__global char *)src1 + src1_index + 16));
double src2_data_0 = src2.x;
double src2_data_1 = src2.y;
double src2_data_2 = src2.z;

View File

@@ -62,29 +62,29 @@ __kernel void arithm_s_add_with_mask_C1_D0 (__global uchar *src1, int src1_ste
x = x << 2;
#define dst_align (dst_offset & 3)
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
int mask_index_fix = mask_index < 0 ? 0 : mask_index;
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
int mask_index_fix = mask_index < 0 ? 0 : mask_index;
uchar4 src1_data = vload4(0, src1 + src1_index_fix);
int4 src2_data = (int4)(src2.x, src2.x, src2.x, src2.x);
uchar4 mask_data = vload4(0, mask + mask_index_fix);
if(src1_index < 0)
{
uchar4 tmp;
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
}
if(mask_index < 0)
{
uchar4 tmp;
tmp.xyzw = (mask_index == -2) ? mask_data.zwxy:mask_data.yzwx;
mask_data.xyzw = (mask_index == -1) ? mask_data.wxyz:tmp.xyzw;
}
uchar4 mask_data = vload4(0, mask + mask_index_fix);
if(src1_index < 0)
{
uchar4 tmp;
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
}
if(mask_index < 0)
{
uchar4 tmp;
tmp.xyzw = (mask_index == -2) ? mask_data.zwxy:mask_data.yzwx;
mask_data.xyzw = (mask_index == -1) ? mask_data.wxyz:tmp.xyzw;
}
uchar4 data = *((__global uchar4 *)(dst + dst_index));
int4 tmp = convert_int4_sat(src1_data) + src2_data;
@@ -112,7 +112,7 @@ __kernel void arithm_s_add_with_mask_C1_D2 (__global ushort *src1, int src1_st
x = x << 1;
#define dst_align ((dst_offset >> 1) & 1)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
@@ -147,7 +147,7 @@ __kernel void arithm_s_add_with_mask_C1_D3 (__global short *src1, int src1_ste
x = x << 1;
#define dst_align ((dst_offset >> 1) & 1)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
@@ -190,7 +190,7 @@ __kernel void arithm_s_add_with_mask_C1_D4 (__global int *src1, int src1_ste
int dst_data = *((__global int *)((__global char *)dst + dst_index));
int data = convert_int_sat((long)src_data1 + (long)src_data2);
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;
*((__global int *)((__global char *)dst + dst_index)) = data;
}
@@ -218,7 +218,7 @@ __kernel void arithm_s_add_with_mask_C1_D5 (__global float *src1, int src1_s
float dst_data = *((__global float *)((__global char *)dst + dst_index));
float data = src_data1 + src_data2;
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;
*((__global float *)((__global char *)dst + dst_index)) = data;
}
@@ -248,7 +248,7 @@ __kernel void arithm_s_add_with_mask_C1_D6 (__global double *src1, int src1_
double dst_data = *((__global double *)((__global char *)dst + dst_index));
double data = src_data1 + src_data2;
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;
*((__global double *)((__global char *)dst + dst_index)) = data;
}
@@ -268,7 +268,7 @@ __kernel void arithm_s_add_with_mask_C2_D0 (__global uchar *src1, int src1_ste
x = x << 1;
#define dst_align ((dst_offset >> 1) & 1)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
@@ -307,12 +307,12 @@ __kernel void arithm_s_add_with_mask_C2_D2 (__global ushort *src1, int src1_st
uchar mask_data = *(mask + mask_index);
ushort2 src_data1 = *((__global ushort2 *)((__global char *)src1 + src1_index));
int2 src_data2 = (int2)(src2.x, src2.y);
int2 src_data2 = (int2)(src2.x, src2.y);
ushort2 dst_data = *((__global ushort2 *)((__global char *)dst + dst_index));
int2 tmp = convert_int2_sat(src_data1) + src_data2;
ushort2 data = convert_ushort2_sat(tmp);
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;
*((__global ushort2 *)((__global char *)dst + dst_index)) = data;
}
@@ -335,12 +335,12 @@ __kernel void arithm_s_add_with_mask_C2_D3 (__global short *src1, int src1_ste
uchar mask_data = *(mask + mask_index);
short2 src_data1 = *((__global short2 *)((__global char *)src1 + src1_index));
int2 src_data2 = (int2)(src2.x, src2.y);
int2 src_data2 = (int2)(src2.x, src2.y);
short2 dst_data = *((__global short2 *)((__global char *)dst + dst_index));
int2 tmp = convert_int2_sat(src_data1) + src_data2;
short2 data = convert_short2_sat(tmp);
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;
*((__global short2 *)((__global char *)dst + dst_index)) = data;
}
@@ -363,11 +363,11 @@ __kernel void arithm_s_add_with_mask_C2_D4 (__global int *src1, int src1_step,
uchar mask_data = *(mask + mask_index);
int2 src_data1 = *((__global int2 *)((__global char *)src1 + src1_index));
int2 src_data2 = (int2)(src2.x, src2.y);
int2 src_data2 = (int2)(src2.x, src2.y);
int2 dst_data = *((__global int2 *)((__global char *)dst + dst_index));
int2 data = convert_int2_sat(convert_long2_sat(src_data1) + convert_long2_sat(src_data2));
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;
*((__global int2 *)((__global char *)dst + dst_index)) = data;
}
@@ -390,11 +390,11 @@ __kernel void arithm_s_add_with_mask_C2_D5 (__global float *src1, int src1_ste
uchar mask_data = *(mask + mask_index);
float2 src_data1 = *((__global float2 *)((__global char *)src1 + src1_index));
float2 src_data2 = (float2)(src2.x, src2.y);
float2 src_data2 = (float2)(src2.x, src2.y);
float2 dst_data = *((__global float2 *)((__global char *)dst + dst_index));
float2 data = src_data1 + src_data2;
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;
*((__global float2 *)((__global char *)dst + dst_index)) = data;
}
@@ -419,11 +419,11 @@ __kernel void arithm_s_add_with_mask_C2_D6 (__global double *src1, int src1_st
uchar mask_data = *(mask + mask_index);
double2 src_data1 = *((__global double2 *)((__global char *)src1 + src1_index));
double2 src_data2 = (double2)(src2.x, src2.y);
double2 src_data2 = (double2)(src2.x, src2.y);
double2 dst_data = *((__global double2 *)((__global char *)dst + dst_index));
double2 data = src_data1 + src_data2;
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;
*((__global double2 *)((__global char *)dst + dst_index)) = data;
}
@@ -444,7 +444,7 @@ __kernel void arithm_s_add_with_mask_C3_D0 (__global uchar *src1, int src1_ste
x = x << 2;
#define dst_align (((dst_offset % dst_step) / 3 ) & 3)
int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));
int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
@@ -470,17 +470,17 @@ __kernel void arithm_s_add_with_mask_C3_D0 (__global uchar *src1, int src1_ste
uchar4 tmp_data_2 = convert_uchar4_sat(convert_int4_sat(src1_data_2) + src2_data_2);
data_0.xyz = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xyz : data_0.xyz;
data_0.w = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
data_0.w = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
? tmp_data_0.w : data_0.w;
data_1.xy = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
data_1.xy = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
? tmp_data_1.xy : data_1.xy;
data_1.zw = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
data_1.zw = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_1.zw : data_1.zw;
data_2.x = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
data_2.x = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_2.x : data_2.x;
data_2.yzw = ((mask_data.w) && (dst_index + 9 >= dst_start) && (dst_index + 9 < dst_end))
data_2.yzw = ((mask_data.w) && (dst_index + 9 >= dst_start) && (dst_index + 9 < dst_end))
? tmp_data_2.yzw : data_2.yzw;
*((__global uchar4 *)(dst + dst_index + 0)) = data_0;
@@ -502,7 +502,7 @@ __kernel void arithm_s_add_with_mask_C3_D2 (__global ushort *src1, int src1_st
x = x << 1;
#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
@@ -513,9 +513,9 @@ __kernel void arithm_s_add_with_mask_C3_D2 (__global ushort *src1, int src1_st
ushort2 src1_data_1 = vload2(0, (__global ushort *)((__global char *)src1 + src1_index + 4));
ushort2 src1_data_2 = vload2(0, (__global ushort *)((__global char *)src1 + src1_index + 8));
int2 src2_data_0 = (int2)(src2.x, src2.y);
int2 src2_data_1 = (int2)(src2.z, src2.x);
int2 src2_data_2 = (int2)(src2.y, src2.z);
int2 src2_data_0 = (int2)(src2.x, src2.y);
int2 src2_data_1 = (int2)(src2.z, src2.x);
int2 src2_data_2 = (int2)(src2.y, src2.z);
uchar2 mask_data = vload2(0, mask + mask_index);
@@ -529,12 +529,12 @@ __kernel void arithm_s_add_with_mask_C3_D2 (__global ushort *src1, int src1_st
data_0.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;
data_1.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
data_1.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
? tmp_data_1.x : data_1.x;
data_1.y = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
data_1.y = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_1.y : data_1.y;
data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_2.xy : data_2.xy;
*((__global ushort2 *)((__global char *)dst + dst_index + 0))= data_0;
@@ -556,7 +556,7 @@ __kernel void arithm_s_add_with_mask_C3_D3 (__global short *src1, int src1_ste
x = x << 1;
#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
@@ -567,9 +567,9 @@ __kernel void arithm_s_add_with_mask_C3_D3 (__global short *src1, int src1_ste
short2 src1_data_1 = vload2(0, (__global short *)((__global char *)src1 + src1_index + 4));
short2 src1_data_2 = vload2(0, (__global short *)((__global char *)src1 + src1_index + 8));
int2 src2_data_0 = (int2)(src2.x, src2.y);
int2 src2_data_1 = (int2)(src2.z, src2.x);
int2 src2_data_2 = (int2)(src2.y, src2.z);
int2 src2_data_0 = (int2)(src2.x, src2.y);
int2 src2_data_1 = (int2)(src2.z, src2.x);
int2 src2_data_2 = (int2)(src2.y, src2.z);
uchar2 mask_data = vload2(0, mask + mask_index);
@@ -583,12 +583,12 @@ __kernel void arithm_s_add_with_mask_C3_D3 (__global short *src1, int src1_ste
data_0.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;
data_1.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
data_1.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
? tmp_data_1.x : data_1.x;
data_1.y = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
data_1.y = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_1.y : data_1.y;
data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_2.xy : data_2.xy;
*((__global short2 *)((__global char *)dst + dst_index + 0))= data_0;
@@ -607,7 +607,7 @@ __kernel void arithm_s_add_with_mask_C3_D4 (__global int *src1, int src1_step,
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x * 12) + src1_offset);
int src1_index = mad24(y, src1_step, (x * 12) + src1_offset);
int mask_index = mad24(y, mask_step, x + mask_offset);
int dst_index = mad24(y, dst_step, dst_offset + (x * 12));
@@ -615,9 +615,9 @@ __kernel void arithm_s_add_with_mask_C3_D4 (__global int *src1, int src1_step,
int src1_data_1 = *((__global int *)((__global char *)src1 + src1_index + 4));
int src1_data_2 = *((__global int *)((__global char *)src1 + src1_index + 8));
int src2_data_0 = src2.x;
int src2_data_0 = src2.x;
int src2_data_1 = src2.y;
int src2_data_2 = src2.z;
int src2_data_2 = src2.z;
uchar mask_data = * (mask + mask_index);
@@ -649,17 +649,17 @@ __kernel void arithm_s_add_with_mask_C3_D5 (__global float *src1, int src1_ste
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x * 12) + src1_offset);
int src1_index = mad24(y, src1_step, (x * 12) + src1_offset);
int mask_index = mad24(y, mask_step, x + mask_offset);
int dst_index = mad24(y, dst_step, dst_offset + (x * 12));
float src1_data_0 = *((__global float *)((__global char *)src1 + src1_index + 0));
float src1_data_1 = *((__global float *)((__global char *)src1 + src1_index + 4));
float src1_data_2 = *((__global float *)((__global char *)src1 + src1_index + 8));
float src2_data_0 = src2.x;
float src2_data_0 = src2.x;
float src2_data_1 = src2.y;
float src2_data_2 = src2.z;
float src2_data_2 = src2.z;
uchar mask_data = * (mask + mask_index);
@@ -693,17 +693,17 @@ __kernel void arithm_s_add_with_mask_C3_D6 (__global double *src1, int src1_st
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x * 24) + src1_offset);
int src1_index = mad24(y, src1_step, (x * 24) + src1_offset);
int mask_index = mad24(y, mask_step, x + mask_offset);
int dst_index = mad24(y, dst_step, dst_offset + (x * 24));
double src1_data_0 = *((__global double *)((__global char *)src1 + src1_index + 0 ));
double src1_data_1 = *((__global double *)((__global char *)src1 + src1_index + 8 ));
double src1_data_2 = *((__global double *)((__global char *)src1 + src1_index + 16));
double src2_data_0 = src2.x;
double src2_data_0 = src2.x;
double src2_data_1 = src2.y;
double src2_data_2 = src2.z;
double src2_data_2 = src2.z;
uchar mask_data = * (mask + mask_index);
@@ -747,7 +747,7 @@ __kernel void arithm_s_add_with_mask_C4_D0 (__global uchar *src1, int src1_ste
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
uchar4 data = convert_uchar4_sat(convert_int4_sat(src_data1) + src2);
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;
*((__global uchar4 *)(dst + dst_index)) = data;
}
@@ -773,7 +773,7 @@ __kernel void arithm_s_add_with_mask_C4_D2 (__global ushort *src1, int src1_st
ushort4 dst_data = *((__global ushort4 *)((__global char *)dst + dst_index));
ushort4 data = convert_ushort4_sat(convert_int4_sat(src_data1) + src2);
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;
*((__global ushort4 *)((__global char *)dst + dst_index)) = data;
}
@@ -799,7 +799,7 @@ __kernel void arithm_s_add_with_mask_C4_D3 (__global short *src1, int src1_ste
short4 dst_data = *((__global short4 *)((__global char *)dst + dst_index));
short4 data = convert_short4_sat(convert_int4_sat(src_data1) + src2);
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;
*((__global short4 *)((__global char *)dst + dst_index)) = data;
}
@@ -825,7 +825,7 @@ __kernel void arithm_s_add_with_mask_C4_D4 (__global int *src1, int src1_step,
int4 dst_data = *((__global int4 *)((__global char *)dst + dst_index));
int4 data = convert_int4_sat(convert_long4_sat(src_data1) + convert_long4_sat(src2));
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;
*((__global int4 *)((__global char *)dst + dst_index)) = data;
}
@@ -851,7 +851,7 @@ __kernel void arithm_s_add_with_mask_C4_D5 (__global float *src1, int src1_ste
float4 dst_data = *((__global float4 *)((__global char *)dst + dst_index));
float4 data = src_data1 + src2;
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;
*((__global float4 *)((__global char *)dst + dst_index)) = data;
}
@@ -879,7 +879,7 @@ __kernel void arithm_s_add_with_mask_C4_D6 (__global double *src1, int src1_st
double4 dst_data = *((__global double4 *)((__global char *)dst + dst_index));
double4 data = src_data1 + src2;
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;
*((__global double4 *)((__global char *)dst + dst_index)) = data;
}

View File

@@ -63,8 +63,8 @@ __kernel void arithm_bitwise_and_D0 (__global uchar *src1, int src1_step, int sr
x = x << 2;
#define dst_align (dst_offset & 3)
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
@@ -99,8 +99,8 @@ __kernel void arithm_bitwise_and_D1 (__global char *src1, int src1_step, int src
x = x << 2;
#define dst_align (dst_offset & 3)
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
@@ -136,8 +136,8 @@ __kernel void arithm_bitwise_and_D2 (__global ushort *src1, int src1_step, int s
x = x << 2;
#define dst_align ((dst_offset >> 1) & 3)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
@@ -174,8 +174,8 @@ __kernel void arithm_bitwise_and_D3 (__global short *src1, int src1_step, int sr
x = x << 2;
#define dst_align ((dst_offset >> 1) & 3)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);

View File

@@ -65,8 +65,8 @@ __kernel void arithm_bitwise_and_with_mask_C1_D0 (__global uchar *src1, int src1
x = x << 2;
#define dst_align (dst_offset & 3)
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
@@ -106,8 +106,8 @@ __kernel void arithm_bitwise_and_with_mask_C1_D1 (__global char *src1, int src1_
x = x << 2;
#define dst_align (dst_offset & 3)
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
@@ -147,8 +147,8 @@ __kernel void arithm_bitwise_and_with_mask_C1_D2 (__global ushort *src1, int src
x = x << 1;
#define dst_align ((dst_offset >> 1) & 1)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
@@ -186,8 +186,8 @@ __kernel void arithm_bitwise_and_with_mask_C1_D3 (__global short *src1, int src1
x = x << 1;
#define dst_align ((dst_offset >> 1) & 1)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
@@ -198,8 +198,8 @@ __kernel void arithm_bitwise_and_with_mask_C1_D3 (__global short *src1, int src1
short2 src2_data = vload2(0, (__global short *)((__global char *)src2 + src2_index));
uchar2 mask_data = vload2(0, mask + mask_index);
short2 data = *((__global short2 *)((__global uchar *)dst + dst_index));
short2 tmp_data = src1_data & src2_data;
short2 data = *((__global short2 *)((__global uchar *)dst + dst_index));
short2 tmp_data = src1_data & src2_data;
data.x = convert_short((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : data.x;
data.y = convert_short((mask_data.y) && (dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.y : data.y;
@@ -234,7 +234,7 @@ __kernel void arithm_bitwise_and_with_mask_C1_D4 (__global int *src1, int src1
int dst_data = *((__global int *)((__global char *)dst + dst_index));
int data = src_data1 & src_data2;
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;
*((__global int *)((__global char *)dst + dst_index)) = data;
}
@@ -266,7 +266,7 @@ __kernel void arithm_bitwise_and_with_mask_C1_D5 (__global char *src1, int src1_
char4 dst_data = *((__global char4 *)((__global char *)dst + dst_index));
char4 data = src_data1 & src_data2;
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;
*((__global char4 *)((__global char *)dst + dst_index)) = data;
}
@@ -299,7 +299,7 @@ __kernel void arithm_bitwise_and_with_mask_C1_D6 (__global char *src1, int src1_
char8 dst_data = *((__global char8 *)((__global char *)dst + dst_index));
char8 data = src_data1 & src_data2;
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;
*((__global char8 *)((__global char *)dst + dst_index)) = data;
}
@@ -324,8 +324,8 @@ __kernel void arithm_bitwise_and_with_mask_C2_D0 (__global uchar *src1, int src1
x = x << 1;
#define dst_align ((dst_offset >> 1) & 1)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
@@ -362,8 +362,8 @@ __kernel void arithm_bitwise_and_with_mask_C2_D1 (__global char *src1, int src1_
x = x << 1;
#define dst_align ((dst_offset >> 1) & 1)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
@@ -408,7 +408,7 @@ __kernel void arithm_bitwise_and_with_mask_C2_D2 (__global ushort *src1, int src
ushort2 dst_data = *((__global ushort2 *)((__global char *)dst + dst_index));
ushort2 data = src_data1 & src_data2;
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;
*((__global ushort2 *)((__global char *)dst + dst_index)) = data;
}
@@ -437,7 +437,7 @@ __kernel void arithm_bitwise_and_with_mask_C2_D3 (__global short *src1, int src1
short2 dst_data = *((__global short2 *)((__global char *)dst + dst_index));
short2 data = src_data1 & src_data2;
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;
*((__global short2 *)((__global char *)dst + dst_index)) = data;
}
@@ -466,7 +466,7 @@ __kernel void arithm_bitwise_and_with_mask_C2_D4 (__global int *src1, int src1
int2 dst_data = *((__global int2 *)((__global char *)dst + dst_index));
int2 data = src_data1 & src_data2;
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;
*((__global int2 *)((__global char *)dst + dst_index)) = data;
}
@@ -495,7 +495,7 @@ __kernel void arithm_bitwise_and_with_mask_C2_D5 (__global char *src1, int src1_
char8 dst_data = *((__global char8 *)((__global char *)dst + dst_index));
char8 data = src_data1 & src_data2;
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;
*((__global char8 *)((__global char *)dst + dst_index)) = data;
}
@@ -525,7 +525,7 @@ __kernel void arithm_bitwise_and_with_mask_C2_D6 (__global char *src1, int src1_
char16 dst_data = *((__global char16 *)((__global char *)dst + dst_index));
char16 data = src_data1 & src_data2;
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;
*((__global char16 *)((__global char *)dst + dst_index)) = data;
}
@@ -549,8 +549,8 @@ __kernel void arithm_bitwise_and_with_mask_C3_D0 (__global uchar *src1, int src1
x = x << 2;
#define dst_align (((dst_offset % dst_step) / 3 ) & 3)
int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));
int src2_index = mad24(y, src2_step, (x * 3) + src2_offset - (dst_align * 3));
int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));
int src2_index = mad24(y, src2_step, (x * 3) + src2_offset - (dst_align * 3));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
@@ -576,17 +576,17 @@ __kernel void arithm_bitwise_and_with_mask_C3_D0 (__global uchar *src1, int src1
uchar4 tmp_data_2 = src1_data_2 & src2_data_2;
data_0.xyz = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xyz : data_0.xyz;
data_0.w = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
data_0.w = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
? tmp_data_0.w : data_0.w;
data_1.xy = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
data_1.xy = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
? tmp_data_1.xy : data_1.xy;
data_1.zw = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
data_1.zw = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_1.zw : data_1.zw;
data_2.x = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
data_2.x = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_2.x : data_2.x;
data_2.yzw = ((mask_data.w) && (dst_index + 9 >= dst_start) && (dst_index + 9 < dst_end))
data_2.yzw = ((mask_data.w) && (dst_index + 9 >= dst_start) && (dst_index + 9 < dst_end))
? tmp_data_2.yzw : data_2.yzw;
*((__global uchar4 *)(dst + dst_index + 0)) = data_0;
@@ -611,8 +611,8 @@ __kernel void arithm_bitwise_and_with_mask_C3_D1 (__global char *src1, int src1_
x = x << 2;
#define dst_align (((dst_offset % dst_step) / 3 ) & 3)
int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));
int src2_index = mad24(y, src2_step, (x * 3) + src2_offset - (dst_align * 3));
int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));
int src2_index = mad24(y, src2_step, (x * 3) + src2_offset - (dst_align * 3));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
@@ -638,17 +638,17 @@ __kernel void arithm_bitwise_and_with_mask_C3_D1 (__global char *src1, int src1_
char4 tmp_data_2 = src1_data_2 & src2_data_2;
data_0.xyz = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xyz : data_0.xyz;
data_0.w = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
data_0.w = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
? tmp_data_0.w : data_0.w;
data_1.xy = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
data_1.xy = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
? tmp_data_1.xy : data_1.xy;
data_1.zw = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
data_1.zw = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_1.zw : data_1.zw;
data_2.x = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
data_2.x = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_2.x : data_2.x;
data_2.yzw = ((mask_data.w) && (dst_index + 9 >= dst_start) && (dst_index + 9 < dst_end))
data_2.yzw = ((mask_data.w) && (dst_index + 9 >= dst_start) && (dst_index + 9 < dst_end))
? tmp_data_2.yzw : data_2.yzw;
*((__global char4 *)(dst + dst_index + 0)) = data_0;
@@ -672,8 +672,8 @@ __kernel void arithm_bitwise_and_with_mask_C3_D2 (__global ushort *src1, int src
x = x << 1;
#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
int src2_index = mad24(y, src2_step, (x * 6) + src2_offset - (dst_align * 6));
int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
int src2_index = mad24(y, src2_step, (x * 6) + src2_offset - (dst_align * 6));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
@@ -700,12 +700,12 @@ __kernel void arithm_bitwise_and_with_mask_C3_D2 (__global ushort *src1, int src
data_0.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;
data_1.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
data_1.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
? tmp_data_1.x : data_1.x;
data_1.y = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
data_1.y = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_1.y : data_1.y;
data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_2.xy : data_2.xy;
*((__global ushort2 *)((__global char *)dst + dst_index + 0))= data_0;
@@ -728,8 +728,8 @@ __kernel void arithm_bitwise_and_with_mask_C3_D3 (__global short *src1, int src1
x = x << 1;
#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
int src2_index = mad24(y, src2_step, (x * 6) + src2_offset - (dst_align * 6));
int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
int src2_index = mad24(y, src2_step, (x * 6) + src2_offset - (dst_align * 6));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
@@ -756,12 +756,12 @@ __kernel void arithm_bitwise_and_with_mask_C3_D3 (__global short *src1, int src1
data_0.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;
data_1.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
data_1.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
? tmp_data_1.x : data_1.x;
data_1.y = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
data_1.y = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_1.y : data_1.y;
data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_2.xy : data_2.xy;
*((__global short2 *)((__global char *)dst + dst_index + 0))= data_0;
@@ -781,8 +781,8 @@ __kernel void arithm_bitwise_and_with_mask_C3_D4 (__global int *src1, int src1
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x * 12) + src1_offset);
int src2_index = mad24(y, src2_step, (x * 12) + src2_offset);
int src1_index = mad24(y, src1_step, (x * 12) + src1_offset);
int src2_index = mad24(y, src2_step, (x * 12) + src2_offset);
int mask_index = mad24(y, mask_step, x + mask_offset);
int dst_index = mad24(y, dst_step, dst_offset + (x * 12));
@@ -825,15 +825,15 @@ __kernel void arithm_bitwise_and_with_mask_C3_D5 (__global char *src1, int src1_
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x * 12) + src1_offset);
int src2_index = mad24(y, src2_step, (x * 12) + src2_offset);
int src1_index = mad24(y, src1_step, (x * 12) + src1_offset);
int src2_index = mad24(y, src2_step, (x * 12) + src2_offset);
int mask_index = mad24(y, mask_step, x + mask_offset);
int dst_index = mad24(y, dst_step, dst_offset + (x * 12));
char4 src1_data_0 = *((__global char4 *)((__global char *)src1 + src1_index + 0));
char4 src1_data_1 = *((__global char4 *)((__global char *)src1 + src1_index + 4));
char4 src1_data_2 = *((__global char4 *)((__global char *)src1 + src1_index + 8));
char4 src2_data_0 = *((__global char4 *)((__global char *)src2 + src2_index + 0));
char4 src2_data_1 = *((__global char4 *)((__global char *)src2 + src2_index + 4));
char4 src2_data_2 = *((__global char4 *)((__global char *)src2 + src2_index + 8));
@@ -870,15 +870,15 @@ __kernel void arithm_bitwise_and_with_mask_C3_D6 (__global char *src1, int src1_
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x * 24) + src1_offset);
int src2_index = mad24(y, src2_step, (x * 24) + src2_offset);
int src1_index = mad24(y, src1_step, (x * 24) + src1_offset);
int src2_index = mad24(y, src2_step, (x * 24) + src2_offset);
int mask_index = mad24(y, mask_step, x + mask_offset);
int dst_index = mad24(y, dst_step, dst_offset + (x * 24));
char8 src1_data_0 = *((__global char8 *)((__global char *)src1 + src1_index + 0 ));
char8 src1_data_1 = *((__global char8 *)((__global char *)src1 + src1_index + 8 ));
char8 src1_data_2 = *((__global char8 *)((__global char *)src1 + src1_index + 16));
char8 src2_data_0 = *((__global char8 *)((__global char *)src2 + src2_index + 0 ));
char8 src2_data_1 = *((__global char8 *)((__global char *)src2 + src2_index + 8 ));
char8 src2_data_2 = *((__global char8 *)((__global char *)src2 + src2_index + 16));
@@ -930,7 +930,7 @@ __kernel void arithm_bitwise_and_with_mask_C4_D0 (__global uchar *src1, int src1
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
uchar4 data = src_data1 & src_data2;
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;
*((__global uchar4 *)(dst + dst_index)) = data;
}
@@ -961,7 +961,7 @@ __kernel void arithm_bitwise_and_with_mask_C4_D1 (__global char *src1, int src1_
char4 dst_data = *((__global char4 *)(dst + dst_index));
char4 data = src_data1 & src_data2;
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;
*((__global char4 *)(dst + dst_index)) = data;
}
@@ -991,7 +991,7 @@ __kernel void arithm_bitwise_and_with_mask_C4_D2 (__global ushort *src1, int src
ushort4 dst_data = *((__global ushort4 *)((__global char *)dst + dst_index));
ushort4 data = src_data1 & src_data2;
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;
*((__global ushort4 *)((__global char *)dst + dst_index)) = data;
}
@@ -1020,7 +1020,7 @@ __kernel void arithm_bitwise_and_with_mask_C4_D3 (__global short *src1, int src1
short4 dst_data = *((__global short4 *)((__global char *)dst + dst_index));
short4 data = src_data1 & src_data2;
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;
*((__global short4 *)((__global char *)dst + dst_index)) = data;
}
@@ -1049,7 +1049,7 @@ __kernel void arithm_bitwise_and_with_mask_C4_D4 (__global int *src1, int src1
int4 dst_data = *((__global int4 *)((__global char *)dst + dst_index));
int4 data = src_data1 & src_data2;
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;
*((__global int4 *)((__global char *)dst + dst_index)) = data;
}
@@ -1078,7 +1078,7 @@ __kernel void arithm_bitwise_and_with_mask_C4_D5 (__global char *src1, int src1_
char16 dst_data = *((__global char16 *)((__global char *)dst + dst_index));
char16 data = src_data1 & src_data2;
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;
*((__global char16 *)((__global char *)dst + dst_index)) = data;
}
@@ -1123,10 +1123,10 @@ __kernel void arithm_bitwise_and_with_mask_C4_D6 (__global char *src1, int src1_
char8 data_2 = src_data1_2 & src_data2_2;
char8 data_3 = src_data1_3 & src_data2_3;
data_0 = mask_data ? data_0 : dst_data_0;
data_1 = mask_data ? data_1 : dst_data_1;
data_2 = mask_data ? data_2 : dst_data_2;
data_3 = mask_data ? data_3 : dst_data_3;
data_0 = mask_data ? data_0 : dst_data_0;
data_1 = mask_data ? data_1 : dst_data_1;
data_2 = mask_data ? data_2 : dst_data_2;
data_3 = mask_data ? data_3 : dst_data_3;
*((__global char8 *)((__global char *)dst + dst_index + 0)) = data_0;
*((__global char8 *)((__global char *)dst + dst_index + 8)) = data_1;

View File

@@ -64,7 +64,7 @@ __kernel void arithm_s_bitwise_and_C1_D0 (__global uchar *src1, int src1_step,
x = x << 2;
#define dst_align (dst_offset & 3)
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
@@ -98,7 +98,7 @@ __kernel void arithm_s_bitwise_and_C1_D1 (__global char *src1, int src1_step,
x = x << 2;
#define dst_align (dst_offset & 3)
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
@@ -132,7 +132,7 @@ __kernel void arithm_s_bitwise_and_C1_D2 (__global ushort *src1, int src1_step
x = x << 1;
#define dst_align ((dst_offset >> 1) & 1)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
@@ -163,7 +163,7 @@ __kernel void arithm_s_bitwise_and_C1_D3 (__global short *src1, int src1_step,
x = x << 1;
#define dst_align ((dst_offset >> 1) & 1)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
@@ -249,7 +249,7 @@ __kernel void arithm_s_bitwise_and_C1_D6 (__global short *src1, int src1_step, i
short4 src1_data = *((__global short4 *)((__global char *)src1 + src1_index));
short4 src2_data = (short4)(src2.s0, src2.s1, src2.s2, src2.s3);
short4 tmp_data = src1_data & src2_data;
*((__global short4 *)((__global char *)dst + dst_index)) = tmp_data;
@@ -269,7 +269,7 @@ __kernel void arithm_s_bitwise_and_C2_D0 (__global uchar *src1, int src1_step,
x = x << 1;
#define dst_align ((dst_offset >> 1) & 1)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
@@ -280,7 +280,7 @@ __kernel void arithm_s_bitwise_and_C2_D0 (__global uchar *src1, int src1_step,
uchar4 data = *((__global uchar4 *)(dst + dst_index));
uchar4 tmp_data = src1_data & src2_data;
data.xy = (dst_index + 0 >= dst_start) ? tmp_data.xy : data.xy;
data.zw = (dst_index + 2 < dst_end ) ? tmp_data.zw : data.zw;
@@ -303,7 +303,7 @@ __kernel void arithm_s_bitwise_and_C2_D1 (__global char *src1, int src1_step,
x = x << 1;
#define dst_align ((dst_offset >> 1) & 1)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
@@ -311,10 +311,10 @@ __kernel void arithm_s_bitwise_and_C2_D1 (__global char *src1, int src1_step,
char4 src1_data = vload4(0, src1 + src1_index);
char4 src2_data = (char4)(src2.x, src2.y, src2.x, src2.y);
char4 data = *((__global char4 *)(dst + dst_index));
char4 tmp_data = src1_data & src2_data;
data.xy = (dst_index + 0 >= dst_start) ? tmp_data.xy : data.xy;
data.zw = (dst_index + 2 < dst_end ) ? tmp_data.zw : data.zw;
@@ -339,7 +339,7 @@ __kernel void arithm_s_bitwise_and_C2_D2 (__global ushort *src1, int src1_step
ushort2 src_data2 = (ushort2)(src2.x, src2.y);
ushort2 data = src_data1 & src_data2;
*((__global ushort2 *)((__global char *)dst + dst_index)) = data;
}
}
@@ -360,7 +360,7 @@ __kernel void arithm_s_bitwise_and_C2_D3 (__global short *src1, int src1_step,
short2 src_data2 = (short2)(src2.x, src2.y);
short2 data = src_data1 & src_data2;
*((__global short2 *)((__global char *)dst + dst_index)) = data;
}
}
@@ -401,7 +401,7 @@ __kernel void arithm_s_bitwise_and_C2_D5 (__global char *src1, int src1_step,
char8 src2_data = (char8)(src2.s0, src2.s1, src2.s2, src2.s3, src2.s4, src2.s5, src2.s6, src2.s7);
char8 tmp_data = src1_data & src2_data;
*((__global char8 *)((__global char *)dst + dst_index)) = tmp_data;
}
}
@@ -423,7 +423,7 @@ __kernel void arithm_s_bitwise_and_C2_D6 (__global short *src1, int src1_step, i
short8 src2_data = (short8)(src2.s0, src2.s1, src2.s2, src2.s3, src2.s4, src2.s5, src2.s6, src2.s7);
short8 tmp_data = src1_data & src2_data;
*((__global short8 *)((__global char *)dst + dst_index)) = tmp_data;
}
}
@@ -441,7 +441,7 @@ __kernel void arithm_s_bitwise_and_C3_D0 (__global uchar *src1, int src1_step,
x = x << 2;
#define dst_align (((dst_offset % dst_step) / 3 ) & 3)
int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));
int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
@@ -451,9 +451,9 @@ __kernel void arithm_s_bitwise_and_C3_D0 (__global uchar *src1, int src1_step,
uchar4 src1_data_1 = vload4(0, src1 + src1_index + 4);
uchar4 src1_data_2 = vload4(0, src1 + src1_index + 8);
uchar4 src2_data_0 = (uchar4)(src2.x, src2.y, src2.z, src2.x);
uchar4 src2_data_0 = (uchar4)(src2.x, src2.y, src2.z, src2.x);
uchar4 src2_data_1 = (uchar4)(src2.y, src2.z, src2.x, src2.y);
uchar4 src2_data_2 = (uchar4)(src2.z, src2.x, src2.y, src2.z);
uchar4 src2_data_2 = (uchar4)(src2.z, src2.x, src2.y, src2.z);
uchar4 data_0 = *((__global uchar4 *)(dst + dst_index + 0));
uchar4 data_1 = *((__global uchar4 *)(dst + dst_index + 4));
@@ -462,19 +462,19 @@ __kernel void arithm_s_bitwise_and_C3_D0 (__global uchar *src1, int src1_step,
uchar4 tmp_data_0 = src1_data_0 & src2_data_0;
uchar4 tmp_data_1 = src1_data_1 & src2_data_1;
uchar4 tmp_data_2 = src1_data_2 & src2_data_2;
data_0.xyz = ((dst_index + 0 >= dst_start)) ? tmp_data_0.xyz : data_0.xyz;
data_0.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
data_0.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
? tmp_data_0.w : data_0.w;
data_1.xy = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
data_1.xy = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
? tmp_data_1.xy : data_1.xy;
data_1.zw = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
data_1.zw = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_1.zw : data_1.zw;
data_2.x = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
data_2.x = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_2.x : data_2.x;
data_2.yzw = ((dst_index + 9 >= dst_start) && (dst_index + 9 < dst_end))
data_2.yzw = ((dst_index + 9 >= dst_start) && (dst_index + 9 < dst_end))
? tmp_data_2.yzw : data_2.yzw;
*((__global uchar4 *)(dst + dst_index + 0)) = data_0;
@@ -497,7 +497,7 @@ __kernel void arithm_s_bitwise_and_C3_D1 (__global char *src1, int src1_step,
x = x << 2;
#define dst_align (((dst_offset % dst_step) / 3 ) & 3)
int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));
int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
@@ -507,9 +507,9 @@ __kernel void arithm_s_bitwise_and_C3_D1 (__global char *src1, int src1_step,
char4 src1_data_1 = vload4(0, src1 + src1_index + 4);
char4 src1_data_2 = vload4(0, src1 + src1_index + 8);
char4 src2_data_0 = (char4)(src2.x, src2.y, src2.z, src2.x);
char4 src2_data_0 = (char4)(src2.x, src2.y, src2.z, src2.x);
char4 src2_data_1 = (char4)(src2.y, src2.z, src2.x, src2.y);
char4 src2_data_2 = (char4)(src2.z, src2.x, src2.y, src2.z);
char4 src2_data_2 = (char4)(src2.z, src2.x, src2.y, src2.z);
char4 data_0 = *((__global char4 *)(dst + dst_index + 0));
char4 data_1 = *((__global char4 *)(dst + dst_index + 4));
@@ -520,17 +520,17 @@ __kernel void arithm_s_bitwise_and_C3_D1 (__global char *src1, int src1_step,
char4 tmp_data_2 = convert_char4_sat(convert_uchar4_sat(src1_data_2) & convert_uchar4_sat(src2_data_2));
data_0.xyz = ((dst_index + 0 >= dst_start)) ? tmp_data_0.xyz : data_0.xyz;
data_0.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
data_0.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
? tmp_data_0.w : data_0.w;
data_1.xy = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
data_1.xy = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
? tmp_data_1.xy : data_1.xy;
data_1.zw = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
data_1.zw = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_1.zw : data_1.zw;
data_2.x = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
data_2.x = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_2.x : data_2.x;
data_2.yzw = ((dst_index + 9 >= dst_start) && (dst_index + 9 < dst_end))
data_2.yzw = ((dst_index + 9 >= dst_start) && (dst_index + 9 < dst_end))
? tmp_data_2.yzw : data_2.yzw;
*((__global char4 *)(dst + dst_index + 0)) = data_0;
@@ -552,7 +552,7 @@ __kernel void arithm_s_bitwise_and_C3_D2 (__global ushort *src1, int src1_step
x = x << 1;
#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
@@ -576,12 +576,12 @@ __kernel void arithm_s_bitwise_and_C3_D2 (__global ushort *src1, int src1_step
data_0.xy = ((dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;
data_1.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
data_1.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
? tmp_data_1.x : data_1.x;
data_1.y = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
data_1.y = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_1.y : data_1.y;
data_2.xy = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
data_2.xy = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_2.xy : data_2.xy;
*((__global ushort2 *)((__global char *)dst + dst_index + 0))= data_0;
@@ -602,7 +602,7 @@ __kernel void arithm_s_bitwise_and_C3_D3 (__global short *src1, int src1_step,
x = x << 1;
#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
@@ -626,12 +626,12 @@ __kernel void arithm_s_bitwise_and_C3_D3 (__global short *src1, int src1_step,
data_0.xy = ((dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;
data_1.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
data_1.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
? tmp_data_1.x : data_1.x;
data_1.y = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
data_1.y = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_1.y : data_1.y;
data_2.xy = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
data_2.xy = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_2.xy : data_2.xy;
*((__global short2 *)((__global char *)dst + dst_index + 0))= data_0;
@@ -649,7 +649,7 @@ __kernel void arithm_s_bitwise_and_C3_D4 (__global int *src1, int src1_step, i
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x * 12) + src1_offset);
int src1_index = mad24(y, src1_step, (x * 12) + src1_offset);
int dst_index = mad24(y, dst_step, dst_offset + (x * 12));
int src1_data_0 = *((__global int *)((__global char *)src1 + src1_index + 0));
@@ -683,16 +683,16 @@ __kernel void arithm_s_bitwise_and_C3_D5 (__global char *src1, int src1_step,
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x * 12) + src1_offset);
int src1_index = mad24(y, src1_step, (x * 12) + src1_offset);
int dst_index = mad24(y, dst_step, dst_offset + (x * 12));
char4 src1_data_0 = *((__global char4 *)((__global char *)src1 + src1_index + 0));
char4 src1_data_1 = *((__global char4 *)((__global char *)src1 + src1_index + 4));
char4 src1_data_2 = *((__global char4 *)((__global char *)src1 + src1_index + 8));
char4 src2_data_0 = (char4)(src2.s0, src2.s1, src2.s2, src2.s3);
char4 src2_data_0 = (char4)(src2.s0, src2.s1, src2.s2, src2.s3);
char4 src2_data_1 = (char4)(src2.s4, src2.s5, src2.s6, src2.s7);
char4 src2_data_2 = (char4)(src2.s8, src2.s9, src2.sA, src2.sB);
char4 src2_data_2 = (char4)(src2.s8, src2.s9, src2.sA, src2.sB);
char4 data_0 = *((__global char4 *)((__global char *)dst + dst_index + 0));
char4 data_1 = *((__global char4 *)((__global char *)dst + dst_index + 4));
@@ -718,13 +718,13 @@ __kernel void arithm_s_bitwise_and_C3_D6 (__global short *src1, int src1_step, i
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x * 24) + src1_offset);
int src1_index = mad24(y, src1_step, (x * 24) + src1_offset);
int dst_index = mad24(y, dst_step, dst_offset + (x * 24));
short4 src1_data_0 = *((__global short4 *)((__global char *)src1 + src1_index + 0 ));
short4 src1_data_1 = *((__global short4 *)((__global char *)src1 + src1_index + 8 ));
short4 src1_data_2 = *((__global short4 *)((__global char *)src1 + src1_index + 16));
short4 src2_data_0 = (short4)(src2.s0, src2.s1, src2.s2, src2.s3);
short4 src2_data_1 = (short4)(src2.s4, src2.s5, src2.s6, src2.s7);
short4 src2_data_2 = (short4)(src2.s8, src2.s9, src2.sa, src2.sb);
@@ -736,7 +736,7 @@ __kernel void arithm_s_bitwise_and_C3_D6 (__global short *src1, int src1_step, i
short4 tmp_data_0 = src1_data_0 & src2_data_0;
short4 tmp_data_1 = src1_data_1 & src2_data_1;
short4 tmp_data_2 = src1_data_2 & src2_data_2;
*((__global short4 *)((__global char *)dst + dst_index + 0 ))= tmp_data_0;
*((__global short4 *)((__global char *)dst + dst_index + 8 ))= tmp_data_1;
*((__global short4 *)((__global char *)dst + dst_index + 16))= tmp_data_2;
@@ -864,7 +864,7 @@ __kernel void arithm_s_bitwise_and_C4_D5 (__global char *src1, int src1_step,
src2.s8, src2.s9, src2.sa, src2.sb, src2.sc, src2.sd, src2.se, src2.sf);
char16 tmp_data = src1_data & src2_data;
*((__global char16 *)((__global char *)dst + dst_index)) = tmp_data;
}
}
@@ -891,17 +891,17 @@ __kernel void arithm_s_bitwise_and_C4_D6 (__global short *src1, int src1_step, i
short4 src2_data_1 = (short4)(src2.s4, src2.s5, src2.s6, src2.s7);
short4 src2_data_2 = (short4)(src2.s8, src2.s9, src2.sa, src2.sb);
short4 src2_data_3 = (short4)(src2.sc, src2.sd, src2.se, src2.sf);
short4 tmp_data_0 = src1_data_0 & src2_data_0;
short4 tmp_data_1 = src1_data_1 & src2_data_1;
short4 tmp_data_2 = src1_data_2 & src2_data_2;
short4 tmp_data_3 = src1_data_3 & src2_data_3;
*((__global short4 *)((__global char *)dst + dst_index + 0 ))= tmp_data_0;
*((__global short4 *)((__global char *)dst + dst_index + 8 ))= tmp_data_1;
*((__global short4 *)((__global char *)dst + dst_index + 16))= tmp_data_2;
*((__global short4 *)((__global char *)dst + dst_index + 24))= tmp_data_3;
}
}
#endif

View File

@@ -66,7 +66,7 @@ __kernel void arithm_s_bitwise_and_with_mask_C1_D0 (__global uchar *src1, int
x = x << 2;
#define dst_align (dst_offset & 3)
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
@@ -104,7 +104,7 @@ __kernel void arithm_s_bitwise_and_with_mask_C1_D1 (__global char *src1, int s
x = x << 2;
#define dst_align (dst_offset & 3)
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
@@ -141,7 +141,7 @@ __kernel void arithm_s_bitwise_and_with_mask_C1_D2 (__global ushort *src1, int
x = x << 1;
#define dst_align ((dst_offset >> 1) & 1)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
@@ -154,7 +154,7 @@ __kernel void arithm_s_bitwise_and_with_mask_C1_D2 (__global ushort *src1, int
ushort2 data = *((__global ushort2 *)((__global uchar *)dst + dst_index));
ushort2 tmp_data = src1_data & src2_data;
data.x = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data.x : data.x;
data.y = ((mask_data.y) && (dst_index + 2 < dst_end )) ? tmp_data.y : data.y;
@@ -175,7 +175,7 @@ __kernel void arithm_s_bitwise_and_with_mask_C1_D3 (__global short *src1, int
x = x << 1;
#define dst_align ((dst_offset >> 1) & 1)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
@@ -217,7 +217,7 @@ __kernel void arithm_s_bitwise_and_with_mask_C1_D4 (__global int *src1, int
int dst_data = *((__global int *)((__global char *)dst + dst_index));
int data = src_data1 & src_data2;
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;
*((__global int *)((__global char *)dst + dst_index)) = data;
}
@@ -245,7 +245,7 @@ __kernel void arithm_s_bitwise_and_with_mask_C1_D5 (__global char *src1, int src
char4 dst_data = *((__global char4 *)((__global char *)dst + dst_index));
char4 data = src1_data & src2_data;
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;
*((__global char4 *)((__global char *)dst + dst_index)) = data;
}
@@ -274,7 +274,7 @@ __kernel void arithm_s_bitwise_and_with_mask_C1_D6 (__global short *src1, int sr
short4 dst_data = *((__global short4 *)((__global char *)dst + dst_index));
short4 data = src1_data & src2_data;
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;
*((__global short4 *)((__global char *)dst + dst_index)) = data;
}
@@ -294,7 +294,7 @@ __kernel void arithm_s_bitwise_and_with_mask_C2_D0 (__global uchar *src1, int
x = x << 1;
#define dst_align ((dst_offset >> 1) & 1)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
@@ -330,7 +330,7 @@ __kernel void arithm_s_bitwise_and_with_mask_C2_D1 (__global char *src1, int s
x = x << 1;
#define dst_align ((dst_offset >> 1) & 1)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
@@ -373,7 +373,7 @@ __kernel void arithm_s_bitwise_and_with_mask_C2_D2 (__global ushort *src1, int
ushort2 dst_data = *((__global ushort2 *)((__global char *)dst + dst_index));
ushort2 data = src_data1 & src_data2;
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;
*((__global ushort2 *)((__global char *)dst + dst_index)) = data;
}
@@ -400,7 +400,7 @@ __kernel void arithm_s_bitwise_and_with_mask_C2_D3 (__global short *src1, int
short2 dst_data = *((__global short2 *)((__global char *)dst + dst_index));
short2 data = src_data1 & src_data2;
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;
*((__global short2 *)((__global char *)dst + dst_index)) = data;
}
@@ -427,7 +427,7 @@ __kernel void arithm_s_bitwise_and_with_mask_C2_D4 (__global int *src1, int sr
int2 dst_data = *((__global int2 *)((__global char *)dst + dst_index));
int2 data = src_data1 & src_data2;
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;
*((__global int2 *)((__global char *)dst + dst_index)) = data;
}
@@ -454,7 +454,7 @@ __kernel void arithm_s_bitwise_and_with_mask_C2_D5 (__global char *src1, int s
char8 dst_data = *((__global char8 *)((__global char *)dst + dst_index));
char8 data = src1_data & src2_data;
data = mask_data ? data : dst_data;
*((__global char8 *)((__global char *)dst + dst_index)) = data;
@@ -483,7 +483,7 @@ __kernel void arithm_s_bitwise_and_with_mask_C2_D6 (__global short *src1, int sr
short8 dst_data = *((__global short8 *)((__global char *)dst + dst_index));
short8 data = src1_data & src2_data;
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;
*((__global short8 *)((__global char *)dst + dst_index)) = data;
}
@@ -503,7 +503,7 @@ __kernel void arithm_s_bitwise_and_with_mask_C3_D0 (__global uchar *src1, int
x = x << 2;
#define dst_align (((dst_offset % dst_step) / 3 ) & 3)
int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));
int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
@@ -514,9 +514,9 @@ __kernel void arithm_s_bitwise_and_with_mask_C3_D0 (__global uchar *src1, int
uchar4 src1_data_1 = vload4(0, src1 + src1_index + 4);
uchar4 src1_data_2 = vload4(0, src1 + src1_index + 8);
uchar4 src2_data_0 = (uchar4)(src2.x, src2.y, src2.z, src2.x);
uchar4 src2_data_0 = (uchar4)(src2.x, src2.y, src2.z, src2.x);
uchar4 src2_data_1 = (uchar4)(src2.y, src2.z, src2.x, src2.y);
uchar4 src2_data_2 = (uchar4)(src2.z, src2.x, src2.y, src2.z);
uchar4 src2_data_2 = (uchar4)(src2.z, src2.x, src2.y, src2.z);
uchar4 mask_data = vload4(0, mask + mask_index);
@@ -529,17 +529,17 @@ __kernel void arithm_s_bitwise_and_with_mask_C3_D0 (__global uchar *src1, int
uchar4 tmp_data_2 = src1_data_2 & src2_data_2;
data_0.xyz = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xyz : data_0.xyz;
data_0.w = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
data_0.w = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
? tmp_data_0.w : data_0.w;
data_1.xy = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
data_1.xy = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
? tmp_data_1.xy : data_1.xy;
data_1.zw = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
data_1.zw = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_1.zw : data_1.zw;
data_2.x = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
data_2.x = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_2.x : data_2.x;
data_2.yzw = ((mask_data.w) && (dst_index + 9 >= dst_start) && (dst_index + 9 < dst_end))
data_2.yzw = ((mask_data.w) && (dst_index + 9 >= dst_start) && (dst_index + 9 < dst_end))
? tmp_data_2.yzw : data_2.yzw;
*((__global uchar4 *)(dst + dst_index + 0)) = data_0;
@@ -563,7 +563,7 @@ __kernel void arithm_s_bitwise_and_with_mask_C3_D1 (__global char *src1, int s
x = x << 2;
#define dst_align (((dst_offset % dst_step) / 3 ) & 3)
int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));
int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
@@ -574,9 +574,9 @@ __kernel void arithm_s_bitwise_and_with_mask_C3_D1 (__global char *src1, int s
char4 src1_data_1 = vload4(0, src1 + src1_index + 4);
char4 src1_data_2 = vload4(0, src1 + src1_index + 8);
char4 src2_data_0 = (char4)(src2.x, src2.y, src2.z, src2.x);
char4 src2_data_0 = (char4)(src2.x, src2.y, src2.z, src2.x);
char4 src2_data_1 = (char4)(src2.y, src2.z, src2.x, src2.y);
char4 src2_data_2 = (char4)(src2.z, src2.x, src2.y, src2.z);
char4 src2_data_2 = (char4)(src2.z, src2.x, src2.y, src2.z);
uchar4 mask_data = vload4(0, mask + mask_index);
@@ -587,19 +587,19 @@ __kernel void arithm_s_bitwise_and_with_mask_C3_D1 (__global char *src1, int s
char4 tmp_data_0 = src1_data_0 & src2_data_0;
char4 tmp_data_1 = src1_data_1 & src2_data_1;
char4 tmp_data_2 = src1_data_2 & src2_data_2;
data_0.xyz = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xyz : data_0.xyz;
data_0.w = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
data_0.w = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
? tmp_data_0.w : data_0.w;
data_1.xy = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
data_1.xy = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
? tmp_data_1.xy : data_1.xy;
data_1.zw = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
data_1.zw = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_1.zw : data_1.zw;
data_2.x = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
data_2.x = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_2.x : data_2.x;
data_2.yzw = ((mask_data.w) && (dst_index + 9 >= dst_start) && (dst_index + 9 < dst_end))
data_2.yzw = ((mask_data.w) && (dst_index + 9 >= dst_start) && (dst_index + 9 < dst_end))
? tmp_data_2.yzw : data_2.yzw;
*((__global char4 *)(dst + dst_index + 0)) = data_0;
@@ -622,7 +622,7 @@ __kernel void arithm_s_bitwise_and_with_mask_C3_D2 (__global ushort *src1, int
x = x << 1;
#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
@@ -646,15 +646,15 @@ __kernel void arithm_s_bitwise_and_with_mask_C3_D2 (__global ushort *src1, int
ushort2 tmp_data_0 = src1_data_0 & src2_data_0;
ushort2 tmp_data_1 = src1_data_1 & src2_data_1;
ushort2 tmp_data_2 = src1_data_2 & src2_data_2;
data_0.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;
data_1.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
data_1.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
? tmp_data_1.x : data_1.x;
data_1.y = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
data_1.y = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_1.y : data_1.y;
data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_2.xy : data_2.xy;
*((__global ushort2 *)((__global char *)dst + dst_index + 0))= data_0;
@@ -676,7 +676,7 @@ __kernel void arithm_s_bitwise_and_with_mask_C3_D3 (__global short *src1, int
x = x << 1;
#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
@@ -703,12 +703,12 @@ __kernel void arithm_s_bitwise_and_with_mask_C3_D3 (__global short *src1, int
data_0.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;
data_1.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
data_1.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
? tmp_data_1.x : data_1.x;
data_1.y = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
data_1.y = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_1.y : data_1.y;
data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_2.xy : data_2.xy;
*((__global short2 *)((__global char *)dst + dst_index + 0))= data_0;
@@ -727,7 +727,7 @@ __kernel void arithm_s_bitwise_and_with_mask_C3_D4 (__global int *src1, int sr
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x * 12) + src1_offset);
int src1_index = mad24(y, src1_step, (x * 12) + src1_offset);
int mask_index = mad24(y, mask_step, x + mask_offset);
int dst_index = mad24(y, dst_step, dst_offset + (x * 12));
@@ -769,18 +769,18 @@ __kernel void arithm_s_bitwise_and_with_mask_C3_D5 (__global char *src1, int s
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x * 12) + src1_offset);
int src1_index = mad24(y, src1_step, (x * 12) + src1_offset);
int mask_index = mad24(y, mask_step, x + mask_offset);
int dst_index = mad24(y, dst_step, dst_offset + (x * 12));
char4 src1_data_0 = *((__global char4 *)((__global char *)src1 + src1_index + 0));
char4 src1_data_1 = *((__global char4 *)((__global char *)src1 + src1_index + 4));
char4 src1_data_2 = *((__global char4 *)((__global char *)src1 + src1_index + 8));
char4 src2_data_0 = (char4)(src2.s0, src2.s1, src2.s2, src2.s3);
char4 src2_data_0 = (char4)(src2.s0, src2.s1, src2.s2, src2.s3);
char4 src2_data_1 = (char4)(src2.s4, src2.s5, src2.s6, src2.s7);
char4 src2_data_2 = (char4)(src2.s8, src2.s9, src2.sA, src2.sB);
char4 src2_data_2 = (char4)(src2.s8, src2.s9, src2.sA, src2.sB);
uchar mask_data = * (mask + mask_index);
char4 data_0 = *((__global char4 *)((__global char *)dst + dst_index + 0));
@@ -812,18 +812,18 @@ __kernel void arithm_s_bitwise_and_with_mask_C3_D6 (__global short *src1, int sr
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x * 24) + src1_offset);
int src1_index = mad24(y, src1_step, (x * 24) + src1_offset);
int mask_index = mad24(y, mask_step, x + mask_offset);
int dst_index = mad24(y, dst_step, dst_offset + (x * 24));
short4 src1_data_0 = *((__global short4 *)((__global char *)src1 + src1_index + 0 ));
short4 src1_data_1 = *((__global short4 *)((__global char *)src1 + src1_index + 8 ));
short4 src1_data_2 = *((__global short4 *)((__global char *)src1 + src1_index + 16));
short4 src2_data_0 = (short4)(src2.s0, src2.s1, src2.s2, src2.s3);
short4 src2_data_1 = (short4)(src2.s4, src2.s5, src2.s6, src2.s7);
short4 src2_data_2 = (short4)(src2.s8, src2.s9, src2.sa, src2.sb);
uchar mask_data = * (mask + mask_index);
short4 data_0 = *((__global short4 *)((__global char *)dst + dst_index + 0 ));
@@ -833,7 +833,7 @@ __kernel void arithm_s_bitwise_and_with_mask_C3_D6 (__global short *src1, int sr
short4 tmp_data_0 = src1_data_0 & src2_data_0;
short4 tmp_data_1 = src1_data_1 & src2_data_1;
short4 tmp_data_2 = src1_data_2 & src2_data_2;
data_0 = mask_data ? tmp_data_0 : data_0;
data_1 = mask_data ? tmp_data_1 : data_1;
data_2 = mask_data ? tmp_data_2 : data_2;
@@ -865,7 +865,7 @@ __kernel void arithm_s_bitwise_and_with_mask_C4_D0 (__global uchar *src1, int
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
uchar4 data = src_data1 & src2;
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;
*((__global uchar4 *)(dst + dst_index)) = data;
}
@@ -893,7 +893,7 @@ __kernel void arithm_s_bitwise_and_with_mask_C4_D1 (__global char *src1, int s
char4 dst_data = *((__global char4 *)(dst + dst_index));
char4 data = src_data1 & src2;
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;
*((__global char4 *)(dst + dst_index)) = data;
}
@@ -920,7 +920,7 @@ __kernel void arithm_s_bitwise_and_with_mask_C4_D2 (__global ushort *src1, int
ushort4 dst_data = *((__global ushort4 *)((__global char *)dst + dst_index));
ushort4 data = src_data1 & src2;
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;
*((__global ushort4 *)((__global char *)dst + dst_index)) = data;
}
@@ -946,7 +946,7 @@ __kernel void arithm_s_bitwise_and_with_mask_C4_D3 (__global short *src1, int
short4 dst_data = *((__global short4 *)((__global char *)dst + dst_index));
short4 data = src_data1 & src2;
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;
*((__global short4 *)((__global char *)dst + dst_index)) = data;
}
@@ -972,7 +972,7 @@ __kernel void arithm_s_bitwise_and_with_mask_C4_D4 (__global int *src1, int sr
int4 dst_data = *((__global int4 *)((__global char *)dst + dst_index));
int4 data = src_data1 & src2;
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;
*((__global int4 *)((__global char *)dst + dst_index)) = data;
}
@@ -1000,7 +1000,7 @@ __kernel void arithm_s_bitwise_and_with_mask_C4_D5 (__global char *src1, int s
char16 dst_data = *((__global char16 *)((__global char *)dst + dst_index));
char16 data = src1_data & src2_data;
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;
*((__global char16 *)((__global char *)dst + dst_index)) = data;
}
@@ -1032,7 +1032,7 @@ __kernel void arithm_s_bitwise_and_with_mask_C4_D6 (__global short *src1, int sr
short4 src2_data_1 = (short4)(src2.s4, src2.s5, src2.s6, src2.s7);
short4 src2_data_2 = (short4)(src2.s8, src2.s9, src2.sa, src2.sb);
short4 src2_data_3 = (short4)(src2.sc, src2.sd, src2.se, src2.sf);
short4 dst_data_0 = *((__global short4 *)((__global char *)dst + dst_index + 0));
short4 dst_data_1 = *((__global short4 *)((__global char *)dst + dst_index + 8));
short4 dst_data_2 = *((__global short4 *)((__global char *)dst + dst_index + 16));
@@ -1042,10 +1042,10 @@ __kernel void arithm_s_bitwise_and_with_mask_C4_D6 (__global short *src1, int sr
short4 data_1 = src1_data_1 & src2_data_1;
short4 data_2 = src1_data_2 & src2_data_2;
short4 data_3 = src1_data_3 & src2_data_3;
data_0 = mask_data ? data_0 : dst_data_0;
data_1 = mask_data ? data_1 : dst_data_1;
data_2 = mask_data ? data_2 : dst_data_2;
data_0 = mask_data ? data_0 : dst_data_0;
data_1 = mask_data ? data_1 : dst_data_1;
data_2 = mask_data ? data_2 : dst_data_2;
data_3 = mask_data ? data_3 : dst_data_3;
*((__global short4 *)((__global char *)dst + dst_index + 0)) = data_0;

View File

@@ -62,7 +62,7 @@ __kernel void arithm_bitwise_not_D0 (__global uchar *src1, int src1_step, int sr
x = x << 2;
#define dst_align (dst_offset & 3)
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
@@ -95,7 +95,7 @@ __kernel void arithm_bitwise_not_D1 (__global char *src1, int src1_step, int src
x = x << 2;
#define dst_align (dst_offset & 3)
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
@@ -129,7 +129,7 @@ __kernel void arithm_bitwise_not_D2 (__global ushort *src1, int src1_step, int s
x = x << 2;
#define dst_align ((dst_offset >> 1) & 3)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
@@ -164,7 +164,7 @@ __kernel void arithm_bitwise_not_D3 (__global short *src1, int src1_step, int sr
x = x << 2;
#define dst_align ((dst_offset >> 1) & 3)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
@@ -238,12 +238,12 @@ __kernel void arithm_bitwise_not_D6 (__global char *src, int src_step, int src_o
{
int src_index = mad24(y, src_step, (x << 3) + src_offset);
int dst_index = mad24(y, dst_step, (x << 3) + dst_offset);
char8 data;
data = *((__global char8 *)((__global char *)src + src_index));
data = ~ data;
*((__global char8 *)((__global char *)dst + dst_index)) = data;
}
}

View File

@@ -63,8 +63,8 @@ __kernel void arithm_bitwise_or_D0 (__global uchar *src1, int src1_step, int src
x = x << 2;
#define dst_align (dst_offset & 3)
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
@@ -99,8 +99,8 @@ __kernel void arithm_bitwise_or_D1 (__global char *src1, int src1_step, int src1
x = x << 2;
#define dst_align (dst_offset & 3)
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
@@ -136,8 +136,8 @@ __kernel void arithm_bitwise_or_D2 (__global ushort *src1, int src1_step, int sr
x = x << 2;
#define dst_align ((dst_offset >> 1) & 3)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
@@ -174,8 +174,8 @@ __kernel void arithm_bitwise_or_D3 (__global short *src1, int src1_step, int src
x = x << 2;
#define dst_align ((dst_offset >> 1) & 3)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);

View File

@@ -65,8 +65,8 @@ __kernel void arithm_bitwise_or_with_mask_C1_D0 (__global uchar *src1, int src1_
x = x << 2;
#define dst_align (dst_offset & 3)
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
@@ -106,8 +106,8 @@ __kernel void arithm_bitwise_or_with_mask_C1_D1 (__global char *src1, int src1_s
x = x << 2;
#define dst_align (dst_offset & 3)
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
@@ -147,8 +147,8 @@ __kernel void arithm_bitwise_or_with_mask_C1_D2 (__global ushort *src1, int src1
x = x << 1;
#define dst_align ((dst_offset >> 1) & 1)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
@@ -186,8 +186,8 @@ __kernel void arithm_bitwise_or_with_mask_C1_D3 (__global short *src1, int src1_
x = x << 1;
#define dst_align ((dst_offset >> 1) & 1)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
@@ -198,8 +198,8 @@ __kernel void arithm_bitwise_or_with_mask_C1_D3 (__global short *src1, int src1_
short2 src2_data = vload2(0, (__global short *)((__global char *)src2 + src2_index));
uchar2 mask_data = vload2(0, mask + mask_index);
short2 data = *((__global short2 *)((__global uchar *)dst + dst_index));
short2 tmp_data = src1_data | src2_data;
short2 data = *((__global short2 *)((__global uchar *)dst + dst_index));
short2 tmp_data = src1_data | src2_data;
data.x = convert_short((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : data.x;
data.y = convert_short((mask_data.y) && (dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.y : data.y;
@@ -234,7 +234,7 @@ __kernel void arithm_bitwise_or_with_mask_C1_D4 (__global int *src1, int src1_
int dst_data = *((__global int *)((__global char *)dst + dst_index));
int data = src_data1 | src_data2;
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;
*((__global int *)((__global char *)dst + dst_index)) = data;
}
@@ -266,7 +266,7 @@ __kernel void arithm_bitwise_or_with_mask_C1_D5 (__global char *src1, int src1_s
char4 dst_data = *((__global char4 *)((__global char *)dst + dst_index));
char4 data = src_data1 | src_data2;
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;
*((__global char4 *)((__global char *)dst + dst_index)) = data;
}
@@ -299,7 +299,7 @@ __kernel void arithm_bitwise_or_with_mask_C1_D6 (__global char *src1, int src1_s
char8 dst_data = *((__global char8 *)((__global char *)dst + dst_index));
char8 data = src_data1 | src_data2;
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;
*((__global char8 *)((__global char *)dst + dst_index)) = data;
}
@@ -324,8 +324,8 @@ __kernel void arithm_bitwise_or_with_mask_C2_D0 (__global uchar *src1, int src1_
x = x << 1;
#define dst_align ((dst_offset >> 1) & 1)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
@@ -362,8 +362,8 @@ __kernel void arithm_bitwise_or_with_mask_C2_D1 (__global char *src1, int src1_s
x = x << 1;
#define dst_align ((dst_offset >> 1) & 1)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
@@ -408,7 +408,7 @@ __kernel void arithm_bitwise_or_with_mask_C2_D2 (__global ushort *src1, int src1
ushort2 dst_data = *((__global ushort2 *)((__global char *)dst + dst_index));
ushort2 data = src_data1 | src_data2;
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;
*((__global ushort2 *)((__global char *)dst + dst_index)) = data;
}
@@ -437,7 +437,7 @@ __kernel void arithm_bitwise_or_with_mask_C2_D3 (__global short *src1, int src1_
short2 dst_data = *((__global short2 *)((__global char *)dst + dst_index));
short2 data = src_data1 | src_data2;
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;
*((__global short2 *)((__global char *)dst + dst_index)) = data;
}
@@ -466,7 +466,7 @@ __kernel void arithm_bitwise_or_with_mask_C2_D4 (__global int *src1, int src1_
int2 dst_data = *((__global int2 *)((__global char *)dst + dst_index));
int2 data = src_data1 | src_data2;
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;
*((__global int2 *)((__global char *)dst + dst_index)) = data;
}
@@ -495,7 +495,7 @@ __kernel void arithm_bitwise_or_with_mask_C2_D5 (__global char *src1, int src1_s
char8 dst_data = *((__global char8 *)((__global char *)dst + dst_index));
char8 data = src_data1 | src_data2;
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;
*((__global char8 *)((__global char *)dst + dst_index)) = data;
}
@@ -525,7 +525,7 @@ __kernel void arithm_bitwise_or_with_mask_C2_D6 (__global char *src1, int src1_s
char16 dst_data = *((__global char16 *)((__global char *)dst + dst_index));
char16 data = src_data1 | src_data2;
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;
*((__global char16 *)((__global char *)dst + dst_index)) = data;
}
@@ -549,8 +549,8 @@ __kernel void arithm_bitwise_or_with_mask_C3_D0 (__global uchar *src1, int src1_
x = x << 2;
#define dst_align (((dst_offset % dst_step) / 3 ) & 3)
int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));
int src2_index = mad24(y, src2_step, (x * 3) + src2_offset - (dst_align * 3));
int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));
int src2_index = mad24(y, src2_step, (x * 3) + src2_offset - (dst_align * 3));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
@@ -576,17 +576,17 @@ __kernel void arithm_bitwise_or_with_mask_C3_D0 (__global uchar *src1, int src1_
uchar4 tmp_data_2 = src1_data_2 | src2_data_2;
data_0.xyz = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xyz : data_0.xyz;
data_0.w = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
data_0.w = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
? tmp_data_0.w : data_0.w;
data_1.xy = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
data_1.xy = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
? tmp_data_1.xy : data_1.xy;
data_1.zw = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
data_1.zw = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_1.zw : data_1.zw;
data_2.x = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
data_2.x = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_2.x : data_2.x;
data_2.yzw = ((mask_data.w) && (dst_index + 9 >= dst_start) && (dst_index + 9 < dst_end))
data_2.yzw = ((mask_data.w) && (dst_index + 9 >= dst_start) && (dst_index + 9 < dst_end))
? tmp_data_2.yzw : data_2.yzw;
*((__global uchar4 *)(dst + dst_index + 0)) = data_0;
@@ -611,8 +611,8 @@ __kernel void arithm_bitwise_or_with_mask_C3_D1 (__global char *src1, int src1_s
x = x << 2;
#define dst_align (((dst_offset % dst_step) / 3 ) & 3)
int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));
int src2_index = mad24(y, src2_step, (x * 3) + src2_offset - (dst_align * 3));
int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));
int src2_index = mad24(y, src2_step, (x * 3) + src2_offset - (dst_align * 3));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
@@ -638,17 +638,17 @@ __kernel void arithm_bitwise_or_with_mask_C3_D1 (__global char *src1, int src1_s
char4 tmp_data_2 = src1_data_2 | src2_data_2;
data_0.xyz = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xyz : data_0.xyz;
data_0.w = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
data_0.w = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
? tmp_data_0.w : data_0.w;
data_1.xy = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
data_1.xy = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
? tmp_data_1.xy : data_1.xy;
data_1.zw = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
data_1.zw = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_1.zw : data_1.zw;
data_2.x = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
data_2.x = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_2.x : data_2.x;
data_2.yzw = ((mask_data.w) && (dst_index + 9 >= dst_start) && (dst_index + 9 < dst_end))
data_2.yzw = ((mask_data.w) && (dst_index + 9 >= dst_start) && (dst_index + 9 < dst_end))
? tmp_data_2.yzw : data_2.yzw;
*((__global char4 *)(dst + dst_index + 0)) = data_0;
@@ -672,8 +672,8 @@ __kernel void arithm_bitwise_or_with_mask_C3_D2 (__global ushort *src1, int src1
x = x << 1;
#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
int src2_index = mad24(y, src2_step, (x * 6) + src2_offset - (dst_align * 6));
int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
int src2_index = mad24(y, src2_step, (x * 6) + src2_offset - (dst_align * 6));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
@@ -700,12 +700,12 @@ __kernel void arithm_bitwise_or_with_mask_C3_D2 (__global ushort *src1, int src1
data_0.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;
data_1.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
data_1.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
? tmp_data_1.x : data_1.x;
data_1.y = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
data_1.y = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_1.y : data_1.y;
data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_2.xy : data_2.xy;
*((__global ushort2 *)((__global char *)dst + dst_index + 0))= data_0;
@@ -728,8 +728,8 @@ __kernel void arithm_bitwise_or_with_mask_C3_D3 (__global short *src1, int src1_
x = x << 1;
#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
int src2_index = mad24(y, src2_step, (x * 6) + src2_offset - (dst_align * 6));
int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
int src2_index = mad24(y, src2_step, (x * 6) + src2_offset - (dst_align * 6));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
@@ -756,12 +756,12 @@ __kernel void arithm_bitwise_or_with_mask_C3_D3 (__global short *src1, int src1_
data_0.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;
data_1.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
data_1.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
? tmp_data_1.x : data_1.x;
data_1.y = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
data_1.y = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_1.y : data_1.y;
data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_2.xy : data_2.xy;
*((__global short2 *)((__global char *)dst + dst_index + 0))= data_0;
@@ -781,8 +781,8 @@ __kernel void arithm_bitwise_or_with_mask_C3_D4 (__global int *src1, int src1_
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x * 12) + src1_offset);
int src2_index = mad24(y, src2_step, (x * 12) + src2_offset);
int src1_index = mad24(y, src1_step, (x * 12) + src1_offset);
int src2_index = mad24(y, src2_step, (x * 12) + src2_offset);
int mask_index = mad24(y, mask_step, x + mask_offset);
int dst_index = mad24(y, dst_step, dst_offset + (x * 12));
@@ -825,15 +825,15 @@ __kernel void arithm_bitwise_or_with_mask_C3_D5 (__global char *src1, int src1_s
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x * 12) + src1_offset);
int src2_index = mad24(y, src2_step, (x * 12) + src2_offset);
int src1_index = mad24(y, src1_step, (x * 12) + src1_offset);
int src2_index = mad24(y, src2_step, (x * 12) + src2_offset);
int mask_index = mad24(y, mask_step, x + mask_offset);
int dst_index = mad24(y, dst_step, dst_offset + (x * 12));
char4 src1_data_0 = *((__global char4 *)((__global char *)src1 + src1_index + 0));
char4 src1_data_1 = *((__global char4 *)((__global char *)src1 + src1_index + 4));
char4 src1_data_2 = *((__global char4 *)((__global char *)src1 + src1_index + 8));
char4 src2_data_0 = *((__global char4 *)((__global char *)src2 + src2_index + 0));
char4 src2_data_1 = *((__global char4 *)((__global char *)src2 + src2_index + 4));
char4 src2_data_2 = *((__global char4 *)((__global char *)src2 + src2_index + 8));
@@ -870,15 +870,15 @@ __kernel void arithm_bitwise_or_with_mask_C3_D6 (__global char *src1, int src1_s
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x * 24) + src1_offset);
int src2_index = mad24(y, src2_step, (x * 24) + src2_offset);
int src1_index = mad24(y, src1_step, (x * 24) + src1_offset);
int src2_index = mad24(y, src2_step, (x * 24) + src2_offset);
int mask_index = mad24(y, mask_step, x + mask_offset);
int dst_index = mad24(y, dst_step, dst_offset + (x * 24));
char8 src1_data_0 = *((__global char8 *)((__global char *)src1 + src1_index + 0 ));
char8 src1_data_1 = *((__global char8 *)((__global char *)src1 + src1_index + 8 ));
char8 src1_data_2 = *((__global char8 *)((__global char *)src1 + src1_index + 16));
char8 src2_data_0 = *((__global char8 *)((__global char *)src2 + src2_index + 0 ));
char8 src2_data_1 = *((__global char8 *)((__global char *)src2 + src2_index + 8 ));
char8 src2_data_2 = *((__global char8 *)((__global char *)src2 + src2_index + 16));
@@ -930,7 +930,7 @@ __kernel void arithm_bitwise_or_with_mask_C4_D0 (__global uchar *src1, int src1_
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
uchar4 data = src_data1 | src_data2;
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;
*((__global uchar4 *)(dst + dst_index)) = data;
}
@@ -961,7 +961,7 @@ __kernel void arithm_bitwise_or_with_mask_C4_D1 (__global char *src1, int src1_s
char4 dst_data = *((__global char4 *)(dst + dst_index));
char4 data = src_data1 | src_data2;
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;
*((__global char4 *)(dst + dst_index)) = data;
}
@@ -991,7 +991,7 @@ __kernel void arithm_bitwise_or_with_mask_C4_D2 (__global ushort *src1, int src1
ushort4 dst_data = *((__global ushort4 *)((__global char *)dst + dst_index));
ushort4 data = src_data1 | src_data2;
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;
*((__global ushort4 *)((__global char *)dst + dst_index)) = data;
}
@@ -1020,7 +1020,7 @@ __kernel void arithm_bitwise_or_with_mask_C4_D3 (__global short *src1, int src1_
short4 dst_data = *((__global short4 *)((__global char *)dst + dst_index));
short4 data = src_data1 | src_data2;
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;
*((__global short4 *)((__global char *)dst + dst_index)) = data;
}
@@ -1049,7 +1049,7 @@ __kernel void arithm_bitwise_or_with_mask_C4_D4 (__global int *src1, int src1_
int4 dst_data = *((__global int4 *)((__global char *)dst + dst_index));
int4 data = src_data1 | src_data2;
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;
*((__global int4 *)((__global char *)dst + dst_index)) = data;
}
@@ -1078,7 +1078,7 @@ __kernel void arithm_bitwise_or_with_mask_C4_D5 (__global char *src1, int src1_s
char16 dst_data = *((__global char16 *)((__global char *)dst + dst_index));
char16 data = src_data1 | src_data2;
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;
*((__global char16 *)((__global char *)dst + dst_index)) = data;
}
@@ -1123,10 +1123,10 @@ __kernel void arithm_bitwise_or_with_mask_C4_D6 (__global char *src1, int src1_s
char8 data_2 = src_data1_2 | src_data2_2;
char8 data_3 = src_data1_3 | src_data2_3;
data_0 = mask_data ? data_0 : dst_data_0;
data_1 = mask_data ? data_1 : dst_data_1;
data_2 = mask_data ? data_2 : dst_data_2;
data_3 = mask_data ? data_3 : dst_data_3;
data_0 = mask_data ? data_0 : dst_data_0;
data_1 = mask_data ? data_1 : dst_data_1;
data_2 = mask_data ? data_2 : dst_data_2;
data_3 = mask_data ? data_3 : dst_data_3;
*((__global char8 *)((__global char *)dst + dst_index + 0)) = data_0;
*((__global char8 *)((__global char *)dst + dst_index + 8)) = data_1;

View File

@@ -62,7 +62,7 @@ __kernel void arithm_s_bitwise_or_C1_D0 (__global uchar *src1, int src1_step,
x = x << 2;
#define dst_align (dst_offset & 3)
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
@@ -96,7 +96,7 @@ __kernel void arithm_s_bitwise_or_C1_D1 (__global char *src1, int src1_step, i
x = x << 2;
#define dst_align (dst_offset & 3)
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
@@ -130,7 +130,7 @@ __kernel void arithm_s_bitwise_or_C1_D2 (__global ushort *src1, int src1_step,
x = x << 1;
#define dst_align ((dst_offset >> 1) & 1)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
@@ -161,7 +161,7 @@ __kernel void arithm_s_bitwise_or_C1_D3 (__global short *src1, int src1_step,
x = x << 1;
#define dst_align ((dst_offset >> 1) & 1)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
@@ -225,7 +225,7 @@ __kernel void arithm_s_bitwise_or_C1_D5 (__global char *src1, int src1_step, i
__kernel void arithm_s_bitwise_or_C1_D6 (__global short *src1, int src1_step, int src1_offset,
__global short *dst, int dst_step, int dst_offset,
short16 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@@ -249,7 +249,7 @@ __kernel void arithm_s_bitwise_or_C1_D6 (__global short *src1, int src1_step, in
__kernel void arithm_s_bitwise_or_C2_D0 (__global uchar *src1, int src1_step, int src1_offset,
__global uchar *dst, int dst_step, int dst_offset,
uchar4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@@ -260,7 +260,7 @@ __kernel void arithm_s_bitwise_or_C2_D0 (__global uchar *src1, int src1_step,
x = x << 1;
#define dst_align ((dst_offset >> 1) & 1)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
@@ -283,7 +283,7 @@ __kernel void arithm_s_bitwise_or_C2_D0 (__global uchar *src1, int src1_step,
__kernel void arithm_s_bitwise_or_C2_D1 (__global char *src1, int src1_step, int src1_offset,
__global char *dst, int dst_step, int dst_offset,
char4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@@ -294,7 +294,7 @@ __kernel void arithm_s_bitwise_or_C2_D1 (__global char *src1, int src1_step, i
x = x << 1;
#define dst_align ((dst_offset >> 1) & 1)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
@@ -316,7 +316,7 @@ __kernel void arithm_s_bitwise_or_C2_D1 (__global char *src1, int src1_step, i
__kernel void arithm_s_bitwise_or_C2_D2 (__global ushort *src1, int src1_step, int src1_offset,
__global ushort *dst, int dst_step, int dst_offset,
ushort4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@@ -338,7 +338,7 @@ __kernel void arithm_s_bitwise_or_C2_D2 (__global ushort *src1, int src1_step,
__kernel void arithm_s_bitwise_or_C2_D3 (__global short *src1, int src1_step, int src1_offset,
__global short *dst, int dst_step, int dst_offset,
short4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@@ -360,7 +360,7 @@ __kernel void arithm_s_bitwise_or_C2_D3 (__global short *src1, int src1_step,
__kernel void arithm_s_bitwise_or_C2_D4 (__global int *src1, int src1_step, int src1_offset,
__global int *dst, int dst_step, int dst_offset,
int4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@@ -381,7 +381,7 @@ __kernel void arithm_s_bitwise_or_C2_D4 (__global int *src1, int src1_step, in
__kernel void arithm_s_bitwise_or_C2_D5 (__global char *src1, int src1_step, int src1_offset,
__global char *dst, int dst_step, int dst_offset,
char16 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@@ -403,7 +403,7 @@ __kernel void arithm_s_bitwise_or_C2_D5 (__global char *src1, int src1_step, i
__kernel void arithm_s_bitwise_or_C2_D6 (__global short *src1, int src1_step, int src1_offset,
__global short *dst, int dst_step, int dst_offset,
short16 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@@ -426,7 +426,7 @@ __kernel void arithm_s_bitwise_or_C2_D6 (__global short *src1, int src1_step, in
__kernel void arithm_s_bitwise_or_C3_D0 (__global uchar *src1, int src1_step, int src1_offset,
__global uchar *dst, int dst_step, int dst_offset,
uchar4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@@ -437,7 +437,7 @@ __kernel void arithm_s_bitwise_or_C3_D0 (__global uchar *src1, int src1_step,
x = x << 2;
#define dst_align (((dst_offset % dst_step) / 3 ) & 3)
int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));
int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
@@ -447,9 +447,9 @@ __kernel void arithm_s_bitwise_or_C3_D0 (__global uchar *src1, int src1_step,
uchar4 src1_data_1 = vload4(0, src1 + src1_index + 4);
uchar4 src1_data_2 = vload4(0, src1 + src1_index + 8);
uchar4 src2_data_0 = (uchar4)(src2.x, src2.y, src2.z, src2.x);
uchar4 src2_data_0 = (uchar4)(src2.x, src2.y, src2.z, src2.x);
uchar4 src2_data_1 = (uchar4)(src2.y, src2.z, src2.x, src2.y);
uchar4 src2_data_2 = (uchar4)(src2.z, src2.x, src2.y, src2.z);
uchar4 src2_data_2 = (uchar4)(src2.z, src2.x, src2.y, src2.z);
uchar4 data_0 = *((__global uchar4 *)(dst + dst_index + 0));
uchar4 data_1 = *((__global uchar4 *)(dst + dst_index + 4));
@@ -460,17 +460,17 @@ __kernel void arithm_s_bitwise_or_C3_D0 (__global uchar *src1, int src1_step,
uchar4 tmp_data_2 = src1_data_2 | src2_data_2 ;
data_0.xyz = ((dst_index + 0 >= dst_start)) ? tmp_data_0.xyz : data_0.xyz;
data_0.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
data_0.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
? tmp_data_0.w : data_0.w;
data_1.xy = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
data_1.xy = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
? tmp_data_1.xy : data_1.xy;
data_1.zw = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
data_1.zw = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_1.zw : data_1.zw;
data_2.x = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
data_2.x = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_2.x : data_2.x;
data_2.yzw = ((dst_index + 9 >= dst_start) && (dst_index + 9 < dst_end))
data_2.yzw = ((dst_index + 9 >= dst_start) && (dst_index + 9 < dst_end))
? tmp_data_2.yzw : data_2.yzw;
*((__global uchar4 *)(dst + dst_index + 0)) = data_0;
@@ -483,7 +483,7 @@ __kernel void arithm_s_bitwise_or_C3_D0 (__global uchar *src1, int src1_step,
__kernel void arithm_s_bitwise_or_C3_D1 (__global char *src1, int src1_step, int src1_offset,
__global char *dst, int dst_step, int dst_offset,
char4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@@ -494,7 +494,7 @@ __kernel void arithm_s_bitwise_or_C3_D1 (__global char *src1, int src1_step, i
x = x << 2;
#define dst_align (((dst_offset % dst_step) / 3 ) & 3)
int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));
int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
@@ -504,9 +504,9 @@ __kernel void arithm_s_bitwise_or_C3_D1 (__global char *src1, int src1_step, i
char4 src1_data_1 = vload4(0, src1 + src1_index + 4);
char4 src1_data_2 = vload4(0, src1 + src1_index + 8);
char4 src2_data_0 = (char4)(src2.x, src2.y, src2.z, src2.x);
char4 src2_data_0 = (char4)(src2.x, src2.y, src2.z, src2.x);
char4 src2_data_1 = (char4)(src2.y, src2.z, src2.x, src2.y);
char4 src2_data_2 = (char4)(src2.z, src2.x, src2.y, src2.z);
char4 src2_data_2 = (char4)(src2.z, src2.x, src2.y, src2.z);
char4 data_0 = *((__global char4 *)(dst + dst_index + 0));
char4 data_1 = *((__global char4 *)(dst + dst_index + 4));
@@ -517,17 +517,17 @@ __kernel void arithm_s_bitwise_or_C3_D1 (__global char *src1, int src1_step, i
char4 tmp_data_2 = src1_data_2 | src2_data_2;
data_0.xyz = ((dst_index + 0 >= dst_start)) ? tmp_data_0.xyz : data_0.xyz;
data_0.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
data_0.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
? tmp_data_0.w : data_0.w;
data_1.xy = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
data_1.xy = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
? tmp_data_1.xy : data_1.xy;
data_1.zw = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
data_1.zw = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_1.zw : data_1.zw;
data_2.x = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
data_2.x = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_2.x : data_2.x;
data_2.yzw = ((dst_index + 9 >= dst_start) && (dst_index + 9 < dst_end))
data_2.yzw = ((dst_index + 9 >= dst_start) && (dst_index + 9 < dst_end))
? tmp_data_2.yzw : data_2.yzw;
*((__global char4 *)(dst + dst_index + 0)) = data_0;
@@ -539,7 +539,7 @@ __kernel void arithm_s_bitwise_or_C3_D1 (__global char *src1, int src1_step, i
__kernel void arithm_s_bitwise_or_C3_D2 (__global ushort *src1, int src1_step, int src1_offset,
__global ushort *dst, int dst_step, int dst_offset,
ushort4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@@ -550,7 +550,7 @@ __kernel void arithm_s_bitwise_or_C3_D2 (__global ushort *src1, int src1_step,
x = x << 1;
#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
@@ -574,12 +574,12 @@ __kernel void arithm_s_bitwise_or_C3_D2 (__global ushort *src1, int src1_step,
data_0.xy = ((dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;
data_1.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
data_1.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
? tmp_data_1.x : data_1.x;
data_1.y = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
data_1.y = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_1.y : data_1.y;
data_2.xy = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
data_2.xy = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_2.xy : data_2.xy;
*((__global ushort2 *)((__global char *)dst + dst_index + 0))= data_0;
@@ -590,7 +590,7 @@ __kernel void arithm_s_bitwise_or_C3_D2 (__global ushort *src1, int src1_step,
__kernel void arithm_s_bitwise_or_C3_D3 (__global short *src1, int src1_step, int src1_offset,
__global short *dst, int dst_step, int dst_offset,
short4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@@ -601,7 +601,7 @@ __kernel void arithm_s_bitwise_or_C3_D3 (__global short *src1, int src1_step,
x = x << 1;
#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
@@ -625,12 +625,12 @@ __kernel void arithm_s_bitwise_or_C3_D3 (__global short *src1, int src1_step,
data_0.xy = ((dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;
data_1.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
data_1.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
? tmp_data_1.x : data_1.x;
data_1.y = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
data_1.y = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_1.y : data_1.y;
data_2.xy = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
data_2.xy = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_2.xy : data_2.xy;
*((__global short2 *)((__global char *)dst + dst_index + 0))= data_0;
@@ -641,7 +641,7 @@ __kernel void arithm_s_bitwise_or_C3_D3 (__global short *src1, int src1_step,
__kernel void arithm_s_bitwise_or_C3_D4 (__global int *src1, int src1_step, int src1_offset,
__global int *dst, int dst_step, int dst_offset,
int4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@@ -649,7 +649,7 @@ __kernel void arithm_s_bitwise_or_C3_D4 (__global int *src1, int src1_step, in
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x * 12) + src1_offset);
int src1_index = mad24(y, src1_step, (x * 12) + src1_offset);
int dst_index = mad24(y, dst_step, dst_offset + (x * 12));
int src1_data_0 = *((__global int *)((__global char *)src1 + src1_index + 0));
@@ -676,7 +676,7 @@ __kernel void arithm_s_bitwise_or_C3_D4 (__global int *src1, int src1_step, in
__kernel void arithm_s_bitwise_or_C3_D5 (__global char *src1, int src1_step, int src1_offset,
__global char *dst, int dst_step, int dst_offset,
char16 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@@ -685,16 +685,16 @@ __kernel void arithm_s_bitwise_or_C3_D5 (__global char *src1, int src1_step, i
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x * 12) + src1_offset);
int src1_index = mad24(y, src1_step, (x * 12) + src1_offset);
int dst_index = mad24(y, dst_step, dst_offset + (x * 12));
char4 src1_data_0 = *((__global char4 *)((__global char *)src1 + src1_index + 0));
char4 src1_data_1 = *((__global char4 *)((__global char *)src1 + src1_index + 4));
char4 src1_data_2 = *((__global char4 *)((__global char *)src1 + src1_index + 8));
char4 src2_data_0 = (char4)(src2.s0, src2.s1, src2.s2, src2.s3);
char4 src2_data_0 = (char4)(src2.s0, src2.s1, src2.s2, src2.s3);
char4 src2_data_1 = (char4)(src2.s4, src2.s5, src2.s6, src2.s7);
char4 src2_data_2 = (char4)(src2.s8, src2.s9, src2.sA, src2.sB);
char4 src2_data_2 = (char4)(src2.s8, src2.s9, src2.sA, src2.sB);
char4 tmp_data_0 = src1_data_0 | src2_data_0;
char4 tmp_data_1 = src1_data_1 | src2_data_1;
@@ -709,7 +709,7 @@ __kernel void arithm_s_bitwise_or_C3_D5 (__global char *src1, int src1_step, i
__kernel void arithm_s_bitwise_or_C3_D6 (__global short *src1, int src1_step, int src1_offset,
__global short *dst, int dst_step, int dst_offset,
short16 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@@ -717,13 +717,13 @@ __kernel void arithm_s_bitwise_or_C3_D6 (__global short *src1, int src1_step, in
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x * 24) + src1_offset);
int src1_index = mad24(y, src1_step, (x * 24) + src1_offset);
int dst_index = mad24(y, dst_step, dst_offset + (x * 24));
short4 src1_data_0 = *((__global short4 *)((__global char *)src1 + src1_index + 0 ));
short4 src1_data_1 = *((__global short4 *)((__global char *)src1 + src1_index + 8 ));
short4 src1_data_2 = *((__global short4 *)((__global char *)src1 + src1_index + 16));
short4 src2_data_0 = (short4)(src2.s0, src2.s1, src2.s2, src2.s3);
short4 src2_data_1 = (short4)(src2.s4, src2.s5, src2.s6, src2.s7);
short4 src2_data_2 = (short4)(src2.s8, src2.s9, src2.sa, src2.sb);
@@ -735,7 +735,7 @@ __kernel void arithm_s_bitwise_or_C3_D6 (__global short *src1, int src1_step, in
short4 tmp_data_0 = src1_data_0 | src2_data_0;
short4 tmp_data_1 = src1_data_1 | src2_data_1;
short4 tmp_data_2 = src1_data_2 | src2_data_2;
*((__global short4 *)((__global char *)dst + dst_index + 0 ))= tmp_data_0;
*((__global short4 *)((__global char *)dst + dst_index + 8 ))= tmp_data_1;
*((__global short4 *)((__global char *)dst + dst_index + 16))= tmp_data_2;
@@ -745,7 +745,7 @@ __kernel void arithm_s_bitwise_or_C3_D6 (__global short *src1, int src1_step, in
__kernel void arithm_s_bitwise_or_C4_D0 (__global uchar *src1, int src1_step, int src1_offset,
__global uchar *dst, int dst_step, int dst_offset,
uchar4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@@ -768,7 +768,7 @@ __kernel void arithm_s_bitwise_or_C4_D0 (__global uchar *src1, int src1_step,
__kernel void arithm_s_bitwise_or_C4_D1 (__global char *src1, int src1_step, int src1_offset,
__global char *dst, int dst_step, int dst_offset,
char4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@@ -790,7 +790,7 @@ __kernel void arithm_s_bitwise_or_C4_D1 (__global char *src1, int src1_step, i
__kernel void arithm_s_bitwise_or_C4_D2 (__global ushort *src1, int src1_step, int src1_offset,
__global ushort *dst, int dst_step, int dst_offset,
ushort4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@@ -811,7 +811,7 @@ __kernel void arithm_s_bitwise_or_C4_D2 (__global ushort *src1, int src1_step,
__kernel void arithm_s_bitwise_or_C4_D3 (__global short *src1, int src1_step, int src1_offset,
__global short *dst, int dst_step, int dst_offset,
short4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@@ -832,7 +832,7 @@ __kernel void arithm_s_bitwise_or_C4_D3 (__global short *src1, int src1_step,
__kernel void arithm_s_bitwise_or_C4_D4 (__global int *src1, int src1_step, int src1_offset,
__global int *dst, int dst_step, int dst_offset,
int4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@@ -853,7 +853,7 @@ __kernel void arithm_s_bitwise_or_C4_D4 (__global int *src1, int src1_step, in
__kernel void arithm_s_bitwise_or_C4_D5 (__global char *src1, int src1_step, int src1_offset,
__global char *dst, int dst_step, int dst_offset,
char16 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@@ -877,7 +877,7 @@ __kernel void arithm_s_bitwise_or_C4_D5 (__global char *src1, int src1_step, i
__kernel void arithm_s_bitwise_or_C4_D6 (__global short *src1, int src1_step, int src1_offset,
__global short *dst, int dst_step, int dst_offset,
short16 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@@ -897,17 +897,17 @@ __kernel void arithm_s_bitwise_or_C4_D6 (__global short *src1, int src1_step, in
short4 src2_data_1 = (short4)(src2.s4, src2.s5, src2.s6, src2.s7);
short4 src2_data_2 = (short4)(src2.s8, src2.s9, src2.sa, src2.sb);
short4 src2_data_3 = (short4)(src2.sc, src2.sd, src2.se, src2.sf);
short4 tmp_data_0 = src1_data_0 | src2_data_0;
short4 tmp_data_1 = src1_data_1 | src2_data_1;
short4 tmp_data_2 = src1_data_2 | src2_data_2;
short4 tmp_data_3 = src1_data_3 | src2_data_3;
*((__global short4 *)((__global char *)dst + dst_index + 0 ))= tmp_data_0;
*((__global short4 *)((__global char *)dst + dst_index + 8 ))= tmp_data_1;
*((__global short4 *)((__global char *)dst + dst_index + 16))= tmp_data_2;
*((__global short4 *)((__global char *)dst + dst_index + 24))= tmp_data_3;
}
}
#endif

View File

@@ -54,7 +54,7 @@ __kernel void arithm_s_bitwise_or_with_mask_C1_D0 (__global uchar *src1, int s
__global uchar *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
uchar4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@@ -65,7 +65,7 @@ __kernel void arithm_s_bitwise_or_with_mask_C1_D0 (__global uchar *src1, int s
x = x << 2;
#define dst_align (dst_offset & 3)
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
@@ -93,7 +93,7 @@ __kernel void arithm_s_bitwise_or_with_mask_C1_D1 (__global char *src1, int sr
__global char *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
char4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@@ -104,7 +104,7 @@ __kernel void arithm_s_bitwise_or_with_mask_C1_D1 (__global char *src1, int sr
x = x << 2;
#define dst_align (dst_offset & 3)
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
@@ -131,7 +131,7 @@ __kernel void arithm_s_bitwise_or_with_mask_C1_D2 (__global ushort *src1, int
__global ushort *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
ushort4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@@ -142,7 +142,7 @@ __kernel void arithm_s_bitwise_or_with_mask_C1_D2 (__global ushort *src1, int
x = x << 1;
#define dst_align ((dst_offset >> 1) & 1)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
@@ -166,7 +166,7 @@ __kernel void arithm_s_bitwise_or_with_mask_C1_D3 (__global short *src1, int s
__global short *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
short4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@@ -177,7 +177,7 @@ __kernel void arithm_s_bitwise_or_with_mask_C1_D3 (__global short *src1, int s
x = x << 1;
#define dst_align ((dst_offset >> 1) & 1)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
@@ -201,7 +201,7 @@ __kernel void arithm_s_bitwise_or_with_mask_C1_D4 (__global int *src1, int s
__global int *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
int4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@@ -220,7 +220,7 @@ __kernel void arithm_s_bitwise_or_with_mask_C1_D4 (__global int *src1, int s
int dst_data = *((__global int *)((__global char *)dst + dst_index));
int data = src_data1 | src_data2;
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;
*((__global int *)((__global char *)dst + dst_index)) = data;
}
@@ -230,7 +230,7 @@ __kernel void arithm_s_bitwise_or_with_mask_C1_D5 (__global char *src1, int
__global char *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
char16 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@@ -249,7 +249,7 @@ __kernel void arithm_s_bitwise_or_with_mask_C1_D5 (__global char *src1, int
char4 dst_data = *((__global char4 *)((__global char *)dst + dst_index));
char4 data = src_data1 | src_data2;
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;
*((__global char4 *)((__global char *)dst + dst_index)) = data;
}
@@ -260,7 +260,7 @@ __kernel void arithm_s_bitwise_or_with_mask_C1_D6 (__global short *src1, int src
__global short *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
short16 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@@ -279,7 +279,7 @@ __kernel void arithm_s_bitwise_or_with_mask_C1_D6 (__global short *src1, int src
short4 dst_data = *((__global short4 *)((__global char *)dst + dst_index));
short4 data = src1_data | src2_data;
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;
*((__global short4 *)((__global char *)dst + dst_index)) = data;
}
@@ -289,7 +289,7 @@ __kernel void arithm_s_bitwise_or_with_mask_C2_D0 (__global uchar *src1, int s
__global uchar *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
uchar4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@@ -300,7 +300,7 @@ __kernel void arithm_s_bitwise_or_with_mask_C2_D0 (__global uchar *src1, int s
x = x << 1;
#define dst_align ((dst_offset >> 1) & 1)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
@@ -326,7 +326,7 @@ __kernel void arithm_s_bitwise_or_with_mask_C2_D1 (__global char *src1, int sr
__global char *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
char4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@@ -337,7 +337,7 @@ __kernel void arithm_s_bitwise_or_with_mask_C2_D1 (__global char *src1, int sr
x = x << 1;
#define dst_align ((dst_offset >> 1) & 1)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
@@ -362,7 +362,7 @@ __kernel void arithm_s_bitwise_or_with_mask_C2_D2 (__global ushort *src1, int
__global ushort *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
ushort4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@@ -381,7 +381,7 @@ __kernel void arithm_s_bitwise_or_with_mask_C2_D2 (__global ushort *src1, int
ushort2 dst_data = *((__global ushort2 *)((__global char *)dst + dst_index));
ushort2 data = src_data1 | src_data2;
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;
*((__global ushort2 *)((__global char *)dst + dst_index)) = data;
}
@@ -390,7 +390,7 @@ __kernel void arithm_s_bitwise_or_with_mask_C2_D3 (__global short *src1, int s
__global short *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
short4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@@ -409,7 +409,7 @@ __kernel void arithm_s_bitwise_or_with_mask_C2_D3 (__global short *src1, int s
short2 dst_data = *((__global short2 *)((__global char *)dst + dst_index));
short2 data = src_data1 | src_data2;
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;
*((__global short2 *)((__global char *)dst + dst_index)) = data;
}
@@ -418,7 +418,7 @@ __kernel void arithm_s_bitwise_or_with_mask_C2_D4 (__global int *src1, int src
__global int *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
int4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@@ -437,7 +437,7 @@ __kernel void arithm_s_bitwise_or_with_mask_C2_D4 (__global int *src1, int src
int2 dst_data = *((__global int2 *)((__global char *)dst + dst_index));
int2 data = src_data1 | src_data2;
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;
*((__global int2 *)((__global char *)dst + dst_index)) = data;
}
@@ -446,7 +446,7 @@ __kernel void arithm_s_bitwise_or_with_mask_C2_D5 (__global char *src1, int sr
__global char *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
char16 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@@ -463,8 +463,8 @@ __kernel void arithm_s_bitwise_or_with_mask_C2_D5 (__global char *src1, int sr
char8 src_data1 = *((__global char8 *)((__global char *)src1 + src1_index));
char8 src_data2 = (char8)(src2.s0, src2.s1, src2.s2, src2.s3, src2.s4, src2.s5, src2.s6, src2.s7);
char8 dst_data = *((__global char8 *)((__global char *)dst + dst_index));
char8 data = src_data1 | src_data2;
data = mask_data ? data : dst_data;
char8 data = src_data1 | src_data2;
data = mask_data ? data : dst_data;
*((__global char8 *)((__global char *)dst + dst_index)) = data;
}
@@ -474,7 +474,7 @@ __kernel void arithm_s_bitwise_or_with_mask_C2_D6 (__global char *src1, int sr
__global char *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
short16 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@@ -493,7 +493,7 @@ __kernel void arithm_s_bitwise_or_with_mask_C2_D6 (__global char *src1, int sr
short8 dst_data = *((__global short8 *)((__global char *)dst + dst_index));
short8 data = src1_data | src2_data;
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;
*((__global short8 *)((__global char *)dst + dst_index)) = data;
}
@@ -503,7 +503,7 @@ __kernel void arithm_s_bitwise_or_with_mask_C3_D0 (__global uchar *src1, int s
__global uchar *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
uchar4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@@ -514,7 +514,7 @@ __kernel void arithm_s_bitwise_or_with_mask_C3_D0 (__global uchar *src1, int s
x = x << 2;
#define dst_align (((dst_offset % dst_step) / 3 ) & 3)
int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));
int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
@@ -525,9 +525,9 @@ __kernel void arithm_s_bitwise_or_with_mask_C3_D0 (__global uchar *src1, int s
uchar4 src1_data_1 = vload4(0, src1 + src1_index + 4);
uchar4 src1_data_2 = vload4(0, src1 + src1_index + 8);
uchar4 src2_data_0 = (uchar4)(src2.x, src2.y, src2.z, src2.x);
uchar4 src2_data_0 = (uchar4)(src2.x, src2.y, src2.z, src2.x);
uchar4 src2_data_1 = (uchar4)(src2.y, src2.z, src2.x, src2.y);
uchar4 src2_data_2 = (uchar4)(src2.z, src2.x, src2.y, src2.z);
uchar4 src2_data_2 = (uchar4)(src2.z, src2.x, src2.y, src2.z);
uchar4 mask_data = vload4(0, mask + mask_index);
@@ -540,17 +540,17 @@ __kernel void arithm_s_bitwise_or_with_mask_C3_D0 (__global uchar *src1, int s
uchar4 tmp_data_2 = src1_data_2 | src2_data_2;
data_0.xyz = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xyz : data_0.xyz;
data_0.w = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
data_0.w = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
? tmp_data_0.w : data_0.w;
data_1.xy = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
data_1.xy = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
? tmp_data_1.xy : data_1.xy;
data_1.zw = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
data_1.zw = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_1.zw : data_1.zw;
data_2.x = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
data_2.x = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_2.x : data_2.x;
data_2.yzw = ((mask_data.w) && (dst_index + 9 >= dst_start) && (dst_index + 9 < dst_end))
data_2.yzw = ((mask_data.w) && (dst_index + 9 >= dst_start) && (dst_index + 9 < dst_end))
? tmp_data_2.yzw : data_2.yzw;
*((__global uchar4 *)(dst + dst_index + 0)) = data_0;
@@ -564,7 +564,7 @@ __kernel void arithm_s_bitwise_or_with_mask_C3_D1 (__global char *src1, int sr
__global char *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
char4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@@ -575,7 +575,7 @@ __kernel void arithm_s_bitwise_or_with_mask_C3_D1 (__global char *src1, int sr
x = x << 2;
#define dst_align (((dst_offset % dst_step) / 3 ) & 3)
int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));
int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
@@ -586,9 +586,9 @@ __kernel void arithm_s_bitwise_or_with_mask_C3_D1 (__global char *src1, int sr
char4 src1_data_1 = vload4(0, src1 + src1_index + 4);
char4 src1_data_2 = vload4(0, src1 + src1_index + 8);
char4 src2_data_0 = (char4)(src2.x, src2.y, src2.z, src2.x);
char4 src2_data_0 = (char4)(src2.x, src2.y, src2.z, src2.x);
char4 src2_data_1 = (char4)(src2.y, src2.z, src2.x, src2.y);
char4 src2_data_2 = (char4)(src2.z, src2.x, src2.y, src2.z);
char4 src2_data_2 = (char4)(src2.z, src2.x, src2.y, src2.z);
uchar4 mask_data = vload4(0, mask + mask_index);
@@ -601,17 +601,17 @@ __kernel void arithm_s_bitwise_or_with_mask_C3_D1 (__global char *src1, int sr
char4 tmp_data_2 = src1_data_2 | src2_data_2;
data_0.xyz = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xyz : data_0.xyz;
data_0.w = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
data_0.w = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
? tmp_data_0.w : data_0.w;
data_1.xy = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
data_1.xy = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
? tmp_data_1.xy : data_1.xy;
data_1.zw = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
data_1.zw = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_1.zw : data_1.zw;
data_2.x = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
data_2.x = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_2.x : data_2.x;
data_2.yzw = ((mask_data.w) && (dst_index + 9 >= dst_start) && (dst_index + 9 < dst_end))
data_2.yzw = ((mask_data.w) && (dst_index + 9 >= dst_start) && (dst_index + 9 < dst_end))
? tmp_data_2.yzw : data_2.yzw;
*((__global char4 *)(dst + dst_index + 0)) = data_0;
@@ -624,7 +624,7 @@ __kernel void arithm_s_bitwise_or_with_mask_C3_D2 (__global ushort *src1, int
__global ushort *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
ushort4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@@ -635,7 +635,7 @@ __kernel void arithm_s_bitwise_or_with_mask_C3_D2 (__global ushort *src1, int
x = x << 1;
#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
@@ -662,12 +662,12 @@ __kernel void arithm_s_bitwise_or_with_mask_C3_D2 (__global ushort *src1, int
data_0.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;
data_1.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
data_1.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
? tmp_data_1.x : data_1.x;
data_1.y = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
data_1.y = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_1.y : data_1.y;
data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_2.xy : data_2.xy;
*((__global ushort2 *)((__global char *)dst + dst_index + 0))= data_0;
@@ -679,7 +679,7 @@ __kernel void arithm_s_bitwise_or_with_mask_C3_D3 (__global short *src1, int s
__global short *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
short4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@@ -690,7 +690,7 @@ __kernel void arithm_s_bitwise_or_with_mask_C3_D3 (__global short *src1, int s
x = x << 1;
#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
@@ -717,12 +717,12 @@ __kernel void arithm_s_bitwise_or_with_mask_C3_D3 (__global short *src1, int s
data_0.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;
data_1.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
data_1.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
? tmp_data_1.x : data_1.x;
data_1.y = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
data_1.y = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_1.y : data_1.y;
data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_2.xy : data_2.xy;
*((__global short2 *)((__global char *)dst + dst_index + 0))= data_0;
@@ -734,7 +734,7 @@ __kernel void arithm_s_bitwise_or_with_mask_C3_D4 (__global int *src1, int src
__global int *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
int4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@@ -742,7 +742,7 @@ __kernel void arithm_s_bitwise_or_with_mask_C3_D4 (__global int *src1, int src
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x * 12) + src1_offset);
int src1_index = mad24(y, src1_step, (x * 12) + src1_offset);
int mask_index = mad24(y, mask_step, x + mask_offset);
int dst_index = mad24(y, dst_step, dst_offset + (x * 12));
@@ -777,7 +777,7 @@ __kernel void arithm_s_bitwise_or_with_mask_C3_D5 (__global char *src1, int sr
__global char *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
char16 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@@ -785,18 +785,18 @@ __kernel void arithm_s_bitwise_or_with_mask_C3_D5 (__global char *src1, int sr
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x * 12) + src1_offset);
int src1_index = mad24(y, src1_step, (x * 12) + src1_offset);
int mask_index = mad24(y, mask_step, x + mask_offset);
int dst_index = mad24(y, dst_step, dst_offset + (x * 12));
char4 src1_data_0 = *((__global char4 *)((__global char *)src1 + src1_index + 0));
char4 src1_data_1 = *((__global char4 *)((__global char *)src1 + src1_index + 4));
char4 src1_data_2 = *((__global char4 *)((__global char *)src1 + src1_index + 8));
char4 src2_data_0 = (char4)(src2.s0, src2.s1, src2.s2, src2.s3);
char4 src2_data_0 = (char4)(src2.s0, src2.s1, src2.s2, src2.s3);
char4 src2_data_1 = (char4)(src2.s4, src2.s5, src2.s6, src2.s7);
char4 src2_data_2 = (char4)(src2.s8, src2.s9, src2.sA, src2.sB);
char4 src2_data_2 = (char4)(src2.s8, src2.s9, src2.sA, src2.sB);
uchar mask_data = * (mask + mask_index);
char4 data_0 = *((__global char4 *)((__global char *)dst + dst_index + 0));
@@ -829,18 +829,18 @@ __kernel void arithm_s_bitwise_or_with_mask_C3_D6 (__global short *src1, int src
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x * 24) + src1_offset);
int src1_index = mad24(y, src1_step, (x * 24) + src1_offset);
int mask_index = mad24(y, mask_step, x + mask_offset);
int dst_index = mad24(y, dst_step, dst_offset + (x * 24));
short4 src1_data_0 = *((__global short4 *)((__global char *)src1 + src1_index + 0 ));
short4 src1_data_1 = *((__global short4 *)((__global char *)src1 + src1_index + 8 ));
short4 src1_data_2 = *((__global short4 *)((__global char *)src1 + src1_index + 16));
short4 src2_data_0 = (short4)(src2.s0, src2.s1, src2.s2, src2.s3);
short4 src2_data_1 = (short4)(src2.s4, src2.s5, src2.s6, src2.s7);
short4 src2_data_2 = (short4)(src2.s8, src2.s9, src2.sa, src2.sb);
uchar mask_data = * (mask + mask_index);
short4 data_0 = *((__global short4 *)((__global char *)dst + dst_index + 0 ));
@@ -850,7 +850,7 @@ __kernel void arithm_s_bitwise_or_with_mask_C3_D6 (__global short *src1, int src
short4 tmp_data_0 = src1_data_0 | src2_data_0;
short4 tmp_data_1 = src1_data_1 | src2_data_1;
short4 tmp_data_2 = src1_data_2 | src2_data_2;
data_0 = mask_data ? tmp_data_0 : data_0;
data_1 = mask_data ? tmp_data_1 : data_1;
data_2 = mask_data ? tmp_data_2 : data_2;
@@ -865,7 +865,7 @@ __kernel void arithm_s_bitwise_or_with_mask_C4_D0 (__global uchar *src1, int s
__global uchar *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
uchar4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@@ -883,7 +883,7 @@ __kernel void arithm_s_bitwise_or_with_mask_C4_D0 (__global uchar *src1, int s
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
uchar4 data = src_data1 | src2;
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;
*((__global uchar4 *)(dst + dst_index)) = data;
}
@@ -894,7 +894,7 @@ __kernel void arithm_s_bitwise_or_with_mask_C4_D1 (__global char *src1, int sr
__global char *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
char4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@@ -912,7 +912,7 @@ __kernel void arithm_s_bitwise_or_with_mask_C4_D1 (__global char *src1, int sr
char4 dst_data = *((__global char4 *)(dst + dst_index));
char4 data = src_data1 | src2;
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;
*((__global char4 *)(dst + dst_index)) = data;
}
@@ -922,7 +922,7 @@ __kernel void arithm_s_bitwise_or_with_mask_C4_D2 (__global ushort *src1, int
__global ushort *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
ushort4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@@ -940,7 +940,7 @@ __kernel void arithm_s_bitwise_or_with_mask_C4_D2 (__global ushort *src1, int
ushort4 dst_data = *((__global ushort4 *)((__global char *)dst + dst_index));
ushort4 data = src_data1 | src2;
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;
*((__global ushort4 *)((__global char *)dst + dst_index)) = data;
}
@@ -949,7 +949,7 @@ __kernel void arithm_s_bitwise_or_with_mask_C4_D3 (__global short *src1, int s
__global short *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
short4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@@ -967,7 +967,7 @@ __kernel void arithm_s_bitwise_or_with_mask_C4_D3 (__global short *src1, int s
short4 dst_data = *((__global short4 *)((__global char *)dst + dst_index));
short4 data = src_data1 | src2;
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;
*((__global short4 *)((__global char *)dst + dst_index)) = data;
}
@@ -976,7 +976,7 @@ __kernel void arithm_s_bitwise_or_with_mask_C4_D4 (__global int *src1, int src
__global int *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
int4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@@ -994,7 +994,7 @@ __kernel void arithm_s_bitwise_or_with_mask_C4_D4 (__global int *src1, int src
int4 dst_data = *((__global int4 *)((__global char *)dst + dst_index));
int4 data = src_data1 | src2;
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;
*((__global int4 *)((__global char *)dst + dst_index)) = data;
}
@@ -1003,7 +1003,7 @@ __kernel void arithm_s_bitwise_or_with_mask_C4_D5 (__global char *src1, int sr
__global char *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
char16 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@@ -1023,7 +1023,7 @@ __kernel void arithm_s_bitwise_or_with_mask_C4_D5 (__global char *src1, int sr
char16 dst_data = *((__global char16 *)((__global char *)dst + dst_index));
char16 data = src_data1 | src_data2;
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;
*((__global char16 *)((__global char *)dst + dst_index)) = data;
}
@@ -1055,7 +1055,7 @@ __kernel void arithm_s_bitwise_or_with_mask_C4_D6 (__global short *src1, int src
short4 src2_data_1 = (short4)(src2.s4, src2.s5, src2.s6, src2.s7);
short4 src2_data_2 = (short4)(src2.s8, src2.s9, src2.sa, src2.sb);
short4 src2_data_3 = (short4)(src2.sc, src2.sd, src2.se, src2.sf);
short4 dst_data_0 = *((__global short4 *)((__global char *)dst + dst_index + 0));
short4 dst_data_1 = *((__global short4 *)((__global char *)dst + dst_index + 8));
short4 dst_data_2 = *((__global short4 *)((__global char *)dst + dst_index + 16));
@@ -1065,10 +1065,10 @@ __kernel void arithm_s_bitwise_or_with_mask_C4_D6 (__global short *src1, int src
short4 data_1 = src1_data_1 | src2_data_1;
short4 data_2 = src1_data_2 | src2_data_2;
short4 data_3 = src1_data_3 | src2_data_3;
data_0 = mask_data ? data_0 : dst_data_0;
data_1 = mask_data ? data_1 : dst_data_1;
data_2 = mask_data ? data_2 : dst_data_2;
data_0 = mask_data ? data_0 : dst_data_0;
data_1 = mask_data ? data_1 : dst_data_1;
data_2 = mask_data ? data_2 : dst_data_2;
data_3 = mask_data ? data_3 : dst_data_3;
*((__global short4 *)((__global char *)dst + dst_index + 0)) = data_0;

View File

@@ -63,8 +63,8 @@ __kernel void arithm_bitwise_xor_D0 (__global uchar *src1, int src1_step, int sr
x = x << 2;
#define dst_align (dst_offset & 3)
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
@@ -99,8 +99,8 @@ __kernel void arithm_bitwise_xor_D1 (__global char *src1, int src1_step, int src
x = x << 2;
#define dst_align (dst_offset & 3)
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
@@ -136,8 +136,8 @@ __kernel void arithm_bitwise_xor_D2 (__global ushort *src1, int src1_step, int s
x = x << 2;
#define dst_align ((dst_offset >> 1) & 3)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
@@ -174,8 +174,8 @@ __kernel void arithm_bitwise_xor_D3 (__global short *src1, int src1_step, int sr
x = x << 2;
#define dst_align ((dst_offset >> 1) & 3)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);

View File

@@ -65,8 +65,8 @@ __kernel void arithm_bitwise_xor_with_mask_C1_D0 (__global uchar *src1, int src1
x = x << 2;
#define dst_align (dst_offset & 3)
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
@@ -106,8 +106,8 @@ __kernel void arithm_bitwise_xor_with_mask_C1_D1 (__global char *src1, int src1_
x = x << 2;
#define dst_align (dst_offset & 3)
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
@@ -147,8 +147,8 @@ __kernel void arithm_bitwise_xor_with_mask_C1_D2 (__global ushort *src1, int src
x = x << 1;
#define dst_align ((dst_offset >> 1) & 1)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
@@ -186,8 +186,8 @@ __kernel void arithm_bitwise_xor_with_mask_C1_D3 (__global short *src1, int src1
x = x << 1;
#define dst_align ((dst_offset >> 1) & 1)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
@@ -198,8 +198,8 @@ __kernel void arithm_bitwise_xor_with_mask_C1_D3 (__global short *src1, int src1
short2 src2_data = vload2(0, (__global short *)((__global char *)src2 + src2_index));
uchar2 mask_data = vload2(0, mask + mask_index);
short2 data = *((__global short2 *)((__global uchar *)dst + dst_index));
short2 tmp_data = src1_data ^ src2_data;
short2 data = *((__global short2 *)((__global uchar *)dst + dst_index));
short2 tmp_data = src1_data ^ src2_data;
data.x = convert_short((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : data.x;
data.y = convert_short((mask_data.y) && (dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.y : data.y;
@@ -234,7 +234,7 @@ __kernel void arithm_bitwise_xor_with_mask_C1_D4 (__global int *src1, int src1
int dst_data = *((__global int *)((__global char *)dst + dst_index));
int data = src_data1 ^ src_data2;
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;
*((__global int *)((__global char *)dst + dst_index)) = data;
}
@@ -266,7 +266,7 @@ __kernel void arithm_bitwise_xor_with_mask_C1_D5 (__global char *src1, int src1_
char4 dst_data = *((__global char4 *)((__global char *)dst + dst_index));
char4 data = src_data1 ^ src_data2;
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;
*((__global char4 *)((__global char *)dst + dst_index)) = data;
}
@@ -299,7 +299,7 @@ __kernel void arithm_bitwise_xor_with_mask_C1_D6 (__global char *src1, int src1_
char8 dst_data = *((__global char8 *)((__global char *)dst + dst_index));
char8 data = src_data1 ^ src_data2;
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;
*((__global char8 *)((__global char *)dst + dst_index)) = data;
}
@@ -324,8 +324,8 @@ __kernel void arithm_bitwise_xor_with_mask_C2_D0 (__global uchar *src1, int src1
x = x << 1;
#define dst_align ((dst_offset >> 1) & 1)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
@@ -362,8 +362,8 @@ __kernel void arithm_bitwise_xor_with_mask_C2_D1 (__global char *src1, int src1_
x = x << 1;
#define dst_align ((dst_offset >> 1) & 1)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
@@ -408,7 +408,7 @@ __kernel void arithm_bitwise_xor_with_mask_C2_D2 (__global ushort *src1, int src
ushort2 dst_data = *((__global ushort2 *)((__global char *)dst + dst_index));
ushort2 data = src_data1 ^ src_data2;
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;
*((__global ushort2 *)((__global char *)dst + dst_index)) = data;
}
@@ -437,7 +437,7 @@ __kernel void arithm_bitwise_xor_with_mask_C2_D3 (__global short *src1, int src1
short2 dst_data = *((__global short2 *)((__global char *)dst + dst_index));
short2 data = src_data1 ^ src_data2;
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;
*((__global short2 *)((__global char *)dst + dst_index)) = data;
}
@@ -466,7 +466,7 @@ __kernel void arithm_bitwise_xor_with_mask_C2_D4 (__global int *src1, int src1
int2 dst_data = *((__global int2 *)((__global char *)dst + dst_index));
int2 data = src_data1 ^ src_data2;
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;
*((__global int2 *)((__global char *)dst + dst_index)) = data;
}
@@ -495,7 +495,7 @@ __kernel void arithm_bitwise_xor_with_mask_C2_D5 (__global char *src1, int src1_
char8 dst_data = *((__global char8 *)((__global char *)dst + dst_index));
char8 data = src_data1 ^ src_data2;
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;
*((__global char8 *)((__global char *)dst + dst_index)) = data;
}
@@ -525,7 +525,7 @@ __kernel void arithm_bitwise_xor_with_mask_C2_D6 (__global char *src1, int src1_
char16 dst_data = *((__global char16 *)((__global char *)dst + dst_index));
char16 data = src_data1 ^ src_data2;
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;
*((__global char16 *)((__global char *)dst + dst_index)) = data;
}
@@ -549,8 +549,8 @@ __kernel void arithm_bitwise_xor_with_mask_C3_D0 (__global uchar *src1, int src1
x = x << 2;
#define dst_align (((dst_offset % dst_step) / 3 ) & 3)
int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));
int src2_index = mad24(y, src2_step, (x * 3) + src2_offset - (dst_align * 3));
int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));
int src2_index = mad24(y, src2_step, (x * 3) + src2_offset - (dst_align * 3));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
@@ -576,17 +576,17 @@ __kernel void arithm_bitwise_xor_with_mask_C3_D0 (__global uchar *src1, int src1
uchar4 tmp_data_2 = src1_data_2 ^ src2_data_2;
data_0.xyz = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xyz : data_0.xyz;
data_0.w = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
data_0.w = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
? tmp_data_0.w : data_0.w;
data_1.xy = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
data_1.xy = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
? tmp_data_1.xy : data_1.xy;
data_1.zw = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
data_1.zw = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_1.zw : data_1.zw;
data_2.x = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
data_2.x = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_2.x : data_2.x;
data_2.yzw = ((mask_data.w) && (dst_index + 9 >= dst_start) && (dst_index + 9 < dst_end))
data_2.yzw = ((mask_data.w) && (dst_index + 9 >= dst_start) && (dst_index + 9 < dst_end))
? tmp_data_2.yzw : data_2.yzw;
*((__global uchar4 *)(dst + dst_index + 0)) = data_0;
@@ -611,8 +611,8 @@ __kernel void arithm_bitwise_xor_with_mask_C3_D1 (__global char *src1, int src1_
x = x << 2;
#define dst_align (((dst_offset % dst_step) / 3 ) & 3)
int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));
int src2_index = mad24(y, src2_step, (x * 3) + src2_offset - (dst_align * 3));
int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));
int src2_index = mad24(y, src2_step, (x * 3) + src2_offset - (dst_align * 3));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
@@ -638,17 +638,17 @@ __kernel void arithm_bitwise_xor_with_mask_C3_D1 (__global char *src1, int src1_
char4 tmp_data_2 = src1_data_2 ^ src2_data_2;
data_0.xyz = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xyz : data_0.xyz;
data_0.w = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
data_0.w = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
? tmp_data_0.w : data_0.w;
data_1.xy = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
data_1.xy = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
? tmp_data_1.xy : data_1.xy;
data_1.zw = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
data_1.zw = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_1.zw : data_1.zw;
data_2.x = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
data_2.x = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_2.x : data_2.x;
data_2.yzw = ((mask_data.w) && (dst_index + 9 >= dst_start) && (dst_index + 9 < dst_end))
data_2.yzw = ((mask_data.w) && (dst_index + 9 >= dst_start) && (dst_index + 9 < dst_end))
? tmp_data_2.yzw : data_2.yzw;
*((__global char4 *)(dst + dst_index + 0)) = data_0;
@@ -672,8 +672,8 @@ __kernel void arithm_bitwise_xor_with_mask_C3_D2 (__global ushort *src1, int src
x = x << 1;
#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
int src2_index = mad24(y, src2_step, (x * 6) + src2_offset - (dst_align * 6));
int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
int src2_index = mad24(y, src2_step, (x * 6) + src2_offset - (dst_align * 6));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
@@ -700,12 +700,12 @@ __kernel void arithm_bitwise_xor_with_mask_C3_D2 (__global ushort *src1, int src
data_0.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;
data_1.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
data_1.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
? tmp_data_1.x : data_1.x;
data_1.y = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
data_1.y = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_1.y : data_1.y;
data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_2.xy : data_2.xy;
*((__global ushort2 *)((__global char *)dst + dst_index + 0))= data_0;
@@ -728,8 +728,8 @@ __kernel void arithm_bitwise_xor_with_mask_C3_D3 (__global short *src1, int src1
x = x << 1;
#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
int src2_index = mad24(y, src2_step, (x * 6) + src2_offset - (dst_align * 6));
int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
int src2_index = mad24(y, src2_step, (x * 6) + src2_offset - (dst_align * 6));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
@@ -756,12 +756,12 @@ __kernel void arithm_bitwise_xor_with_mask_C3_D3 (__global short *src1, int src1
data_0.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;
data_1.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
data_1.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
? tmp_data_1.x : data_1.x;
data_1.y = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
data_1.y = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_1.y : data_1.y;
data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_2.xy : data_2.xy;
*((__global short2 *)((__global char *)dst + dst_index + 0))= data_0;
@@ -781,8 +781,8 @@ __kernel void arithm_bitwise_xor_with_mask_C3_D4 (__global int *src1, int src1
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x * 12) + src1_offset);
int src2_index = mad24(y, src2_step, (x * 12) + src2_offset);
int src1_index = mad24(y, src1_step, (x * 12) + src1_offset);
int src2_index = mad24(y, src2_step, (x * 12) + src2_offset);
int mask_index = mad24(y, mask_step, x + mask_offset);
int dst_index = mad24(y, dst_step, dst_offset + (x * 12));
@@ -825,15 +825,15 @@ __kernel void arithm_bitwise_xor_with_mask_C3_D5 (__global char *src1, int src1_
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x * 12) + src1_offset);
int src2_index = mad24(y, src2_step, (x * 12) + src2_offset);
int src1_index = mad24(y, src1_step, (x * 12) + src1_offset);
int src2_index = mad24(y, src2_step, (x * 12) + src2_offset);
int mask_index = mad24(y, mask_step, x + mask_offset);
int dst_index = mad24(y, dst_step, dst_offset + (x * 12));
char4 src1_data_0 = *((__global char4 *)((__global char *)src1 + src1_index + 0));
char4 src1_data_1 = *((__global char4 *)((__global char *)src1 + src1_index + 4));
char4 src1_data_2 = *((__global char4 *)((__global char *)src1 + src1_index + 8));
char4 src2_data_0 = *((__global char4 *)((__global char *)src2 + src2_index + 0));
char4 src2_data_1 = *((__global char4 *)((__global char *)src2 + src2_index + 4));
char4 src2_data_2 = *((__global char4 *)((__global char *)src2 + src2_index + 8));
@@ -870,15 +870,15 @@ __kernel void arithm_bitwise_xor_with_mask_C3_D6 (__global char *src1, int src1_
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x * 24) + src1_offset);
int src2_index = mad24(y, src2_step, (x * 24) + src2_offset);
int src1_index = mad24(y, src1_step, (x * 24) + src1_offset);
int src2_index = mad24(y, src2_step, (x * 24) + src2_offset);
int mask_index = mad24(y, mask_step, x + mask_offset);
int dst_index = mad24(y, dst_step, dst_offset + (x * 24));
char8 src1_data_0 = *((__global char8 *)((__global char *)src1 + src1_index + 0 ));
char8 src1_data_1 = *((__global char8 *)((__global char *)src1 + src1_index + 8 ));
char8 src1_data_2 = *((__global char8 *)((__global char *)src1 + src1_index + 16));
char8 src2_data_0 = *((__global char8 *)((__global char *)src2 + src2_index + 0 ));
char8 src2_data_1 = *((__global char8 *)((__global char *)src2 + src2_index + 8 ));
char8 src2_data_2 = *((__global char8 *)((__global char *)src2 + src2_index + 16));
@@ -930,7 +930,7 @@ __kernel void arithm_bitwise_xor_with_mask_C4_D0 (__global uchar *src1, int src1
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
uchar4 data = src_data1 ^ src_data2;
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;
*((__global uchar4 *)(dst + dst_index)) = data;
}
@@ -961,7 +961,7 @@ __kernel void arithm_bitwise_xor_with_mask_C4_D1 (__global char *src1, int src1_
char4 dst_data = *((__global char4 *)(dst + dst_index));
char4 data = src_data1 ^ src_data2;
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;
*((__global char4 *)(dst + dst_index)) = data;
}
@@ -991,7 +991,7 @@ __kernel void arithm_bitwise_xor_with_mask_C4_D2 (__global ushort *src1, int src
ushort4 dst_data = *((__global ushort4 *)((__global char *)dst + dst_index));
ushort4 data = src_data1 ^ src_data2;
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;
*((__global ushort4 *)((__global char *)dst + dst_index)) = data;
}
@@ -1020,7 +1020,7 @@ __kernel void arithm_bitwise_xor_with_mask_C4_D3 (__global short *src1, int src1
short4 dst_data = *((__global short4 *)((__global char *)dst + dst_index));
short4 data = src_data1 ^ src_data2;
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;
*((__global short4 *)((__global char *)dst + dst_index)) = data;
}
@@ -1049,7 +1049,7 @@ __kernel void arithm_bitwise_xor_with_mask_C4_D4 (__global int *src1, int src1
int4 dst_data = *((__global int4 *)((__global char *)dst + dst_index));
int4 data = src_data1 ^ src_data2;
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;
*((__global int4 *)((__global char *)dst + dst_index)) = data;
}
@@ -1078,7 +1078,7 @@ __kernel void arithm_bitwise_xor_with_mask_C4_D5 (__global char *src1, int src1_
char16 dst_data = *((__global char16 *)((__global char *)dst + dst_index));
char16 data = src_data1 ^ src_data2;
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;
*((__global char16 *)((__global char *)dst + dst_index)) = data;
}
@@ -1123,10 +1123,10 @@ __kernel void arithm_bitwise_xor_with_mask_C4_D6 (__global char *src1, int src1_
char8 data_2 = src_data1_2 ^ src_data2_2;
char8 data_3 = src_data1_3 ^ src_data2_3;
data_0 = mask_data ? data_0 : dst_data_0;
data_1 = mask_data ? data_1 : dst_data_1;
data_2 = mask_data ? data_2 : dst_data_2;
data_3 = mask_data ? data_3 : dst_data_3;
data_0 = mask_data ? data_0 : dst_data_0;
data_1 = mask_data ? data_1 : dst_data_1;
data_2 = mask_data ? data_2 : dst_data_2;
data_3 = mask_data ? data_3 : dst_data_3;
*((__global char8 *)((__global char *)dst + dst_index + 0)) = data_0;
*((__global char8 *)((__global char *)dst + dst_index + 8)) = data_1;

View File

@@ -64,7 +64,7 @@ __kernel void arithm_s_bitwise_xor_C1_D0 (__global uchar *src1, int src1_step,
x = x << 2;
#define dst_align (dst_offset & 3)
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
@@ -98,7 +98,7 @@ __kernel void arithm_s_bitwise_xor_C1_D1 (__global char *src1, int src1_step,
x = x << 2;
#define dst_align (dst_offset & 3)
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
@@ -132,7 +132,7 @@ __kernel void arithm_s_bitwise_xor_C1_D2 (__global ushort *src1, int src1_step
x = x << 1;
#define dst_align ((dst_offset >> 1) & 1)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
@@ -163,7 +163,7 @@ __kernel void arithm_s_bitwise_xor_C1_D3 (__global short *src1, int src1_step,
x = x << 1;
#define dst_align ((dst_offset >> 1) & 1)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
@@ -249,7 +249,7 @@ __kernel void arithm_s_bitwise_xor_C1_D6 (__global short *src1, int src1_step, i
short4 src1_data = *((__global short4 *)((__global char *)src1 + src1_index));
short4 src2_data = (short4)(src2.s0, src2.s1, src2.s2, src2.s3);
short4 tmp_data = src1_data ^ src2_data;
*((__global short4 *)((__global char *)dst + dst_index)) = tmp_data;
@@ -269,7 +269,7 @@ __kernel void arithm_s_bitwise_xor_C2_D0 (__global uchar *src1, int src1_step,
x = x << 1;
#define dst_align ((dst_offset >> 1) & 1)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
@@ -280,7 +280,7 @@ __kernel void arithm_s_bitwise_xor_C2_D0 (__global uchar *src1, int src1_step,
uchar4 data = *((__global uchar4 *)(dst + dst_index));
uchar4 tmp_data = src1_data ^ src2_data;
data.xy = (dst_index + 0 >= dst_start) ? tmp_data.xy : data.xy;
data.zw = (dst_index + 2 < dst_end ) ? tmp_data.zw : data.zw;
@@ -303,7 +303,7 @@ __kernel void arithm_s_bitwise_xor_C2_D1 (__global char *src1, int src1_step,
x = x << 1;
#define dst_align ((dst_offset >> 1) & 1)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
@@ -311,10 +311,10 @@ __kernel void arithm_s_bitwise_xor_C2_D1 (__global char *src1, int src1_step,
char4 src1_data = vload4(0, src1 + src1_index);
char4 src2_data = (char4)(src2.x, src2.y, src2.x, src2.y);
char4 data = *((__global char4 *)(dst + dst_index));
char4 tmp_data = src1_data ^ src2_data;
data.xy = (dst_index + 0 >= dst_start) ? tmp_data.xy : data.xy;
data.zw = (dst_index + 2 < dst_end ) ? tmp_data.zw : data.zw;
@@ -339,7 +339,7 @@ __kernel void arithm_s_bitwise_xor_C2_D2 (__global ushort *src1, int src1_step
ushort2 src_data2 = (ushort2)(src2.x, src2.y);
ushort2 data = src_data1 ^ src_data2;
*((__global ushort2 *)((__global char *)dst + dst_index)) = data;
}
}
@@ -360,7 +360,7 @@ __kernel void arithm_s_bitwise_xor_C2_D3 (__global short *src1, int src1_step,
short2 src_data2 = (short2)(src2.x, src2.y);
short2 data = src_data1 ^ src_data2;
*((__global short2 *)((__global char *)dst + dst_index)) = data;
}
}
@@ -401,7 +401,7 @@ __kernel void arithm_s_bitwise_xor_C2_D5 (__global char *src1, int src1_step,
char8 src2_data = (char8)(src2.s0, src2.s1, src2.s2, src2.s3, src2.s4, src2.s5, src2.s6, src2.s7);
char8 tmp_data = src1_data ^ src2_data;
*((__global char8 *)((__global char *)dst + dst_index)) = tmp_data;
}
}
@@ -423,7 +423,7 @@ __kernel void arithm_s_bitwise_xor_C2_D6 (__global short *src1, int src1_step, i
short8 src2_data = (short8)(src2.s0, src2.s1, src2.s2, src2.s3, src2.s4, src2.s5, src2.s6, src2.s7);
short8 tmp_data = src1_data ^ src2_data;
*((__global short8 *)((__global char *)dst + dst_index)) = tmp_data;
}
}
@@ -441,7 +441,7 @@ __kernel void arithm_s_bitwise_xor_C3_D0 (__global uchar *src1, int src1_step,
x = x << 2;
#define dst_align (((dst_offset % dst_step) / 3 ) & 3)
int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));
int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
@@ -451,9 +451,9 @@ __kernel void arithm_s_bitwise_xor_C3_D0 (__global uchar *src1, int src1_step,
uchar4 src1_data_1 = vload4(0, src1 + src1_index + 4);
uchar4 src1_data_2 = vload4(0, src1 + src1_index + 8);
uchar4 src2_data_0 = (uchar4)(src2.x, src2.y, src2.z, src2.x);
uchar4 src2_data_0 = (uchar4)(src2.x, src2.y, src2.z, src2.x);
uchar4 src2_data_1 = (uchar4)(src2.y, src2.z, src2.x, src2.y);
uchar4 src2_data_2 = (uchar4)(src2.z, src2.x, src2.y, src2.z);
uchar4 src2_data_2 = (uchar4)(src2.z, src2.x, src2.y, src2.z);
uchar4 data_0 = *((__global uchar4 *)(dst + dst_index + 0));
uchar4 data_1 = *((__global uchar4 *)(dst + dst_index + 4));
@@ -462,19 +462,19 @@ __kernel void arithm_s_bitwise_xor_C3_D0 (__global uchar *src1, int src1_step,
uchar4 tmp_data_0 = src1_data_0 ^ src2_data_0;
uchar4 tmp_data_1 = src1_data_1 ^ src2_data_1;
uchar4 tmp_data_2 = src1_data_2 ^ src2_data_2;
data_0.xyz = ((dst_index + 0 >= dst_start)) ? tmp_data_0.xyz : data_0.xyz;
data_0.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
data_0.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
? tmp_data_0.w : data_0.w;
data_1.xy = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
data_1.xy = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
? tmp_data_1.xy : data_1.xy;
data_1.zw = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
data_1.zw = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_1.zw : data_1.zw;
data_2.x = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
data_2.x = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_2.x : data_2.x;
data_2.yzw = ((dst_index + 9 >= dst_start) && (dst_index + 9 < dst_end))
data_2.yzw = ((dst_index + 9 >= dst_start) && (dst_index + 9 < dst_end))
? tmp_data_2.yzw : data_2.yzw;
*((__global uchar4 *)(dst + dst_index + 0)) = data_0;
@@ -497,7 +497,7 @@ __kernel void arithm_s_bitwise_xor_C3_D1 (__global char *src1, int src1_step,
x = x << 2;
#define dst_align (((dst_offset % dst_step) / 3 ) & 3)
int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));
int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
@@ -507,9 +507,9 @@ __kernel void arithm_s_bitwise_xor_C3_D1 (__global char *src1, int src1_step,
char4 src1_data_1 = vload4(0, src1 + src1_index + 4);
char4 src1_data_2 = vload4(0, src1 + src1_index + 8);
char4 src2_data_0 = (char4)(src2.x, src2.y, src2.z, src2.x);
char4 src2_data_0 = (char4)(src2.x, src2.y, src2.z, src2.x);
char4 src2_data_1 = (char4)(src2.y, src2.z, src2.x, src2.y);
char4 src2_data_2 = (char4)(src2.z, src2.x, src2.y, src2.z);
char4 src2_data_2 = (char4)(src2.z, src2.x, src2.y, src2.z);
char4 data_0 = *((__global char4 *)(dst + dst_index + 0));
char4 data_1 = *((__global char4 *)(dst + dst_index + 4));
@@ -520,17 +520,17 @@ __kernel void arithm_s_bitwise_xor_C3_D1 (__global char *src1, int src1_step,
char4 tmp_data_2 = src1_data_2 ^ src2_data_2;
data_0.xyz = ((dst_index + 0 >= dst_start)) ? tmp_data_0.xyz : data_0.xyz;
data_0.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
data_0.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
? tmp_data_0.w : data_0.w;
data_1.xy = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
data_1.xy = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
? tmp_data_1.xy : data_1.xy;
data_1.zw = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
data_1.zw = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_1.zw : data_1.zw;
data_2.x = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
data_2.x = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_2.x : data_2.x;
data_2.yzw = ((dst_index + 9 >= dst_start) && (dst_index + 9 < dst_end))
data_2.yzw = ((dst_index + 9 >= dst_start) && (dst_index + 9 < dst_end))
? tmp_data_2.yzw : data_2.yzw;
*((__global char4 *)(dst + dst_index + 0)) = data_0;
@@ -552,7 +552,7 @@ __kernel void arithm_s_bitwise_xor_C3_D2 (__global ushort *src1, int src1_step
x = x << 1;
#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
@@ -576,12 +576,12 @@ __kernel void arithm_s_bitwise_xor_C3_D2 (__global ushort *src1, int src1_step
data_0.xy = ((dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;
data_1.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
data_1.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
? tmp_data_1.x : data_1.x;
data_1.y = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
data_1.y = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_1.y : data_1.y;
data_2.xy = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
data_2.xy = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_2.xy : data_2.xy;
*((__global ushort2 *)((__global char *)dst + dst_index + 0))= data_0;
@@ -602,7 +602,7 @@ __kernel void arithm_s_bitwise_xor_C3_D3 (__global short *src1, int src1_step,
x = x << 1;
#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
@@ -626,12 +626,12 @@ __kernel void arithm_s_bitwise_xor_C3_D3 (__global short *src1, int src1_step,
data_0.xy = ((dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;
data_1.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
data_1.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
? tmp_data_1.x : data_1.x;
data_1.y = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
data_1.y = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_1.y : data_1.y;
data_2.xy = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
data_2.xy = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_2.xy : data_2.xy;
*((__global short2 *)((__global char *)dst + dst_index + 0))= data_0;
@@ -649,7 +649,7 @@ __kernel void arithm_s_bitwise_xor_C3_D4 (__global int *src1, int src1_step, i
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x * 12) + src1_offset);
int src1_index = mad24(y, src1_step, (x * 12) + src1_offset);
int dst_index = mad24(y, dst_step, dst_offset + (x * 12));
int src1_data_0 = *((__global int *)((__global char *)src1 + src1_index + 0));
@@ -683,16 +683,16 @@ __kernel void arithm_s_bitwise_xor_C3_D5 (__global char *src1, int src1_step,
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x * 12) + src1_offset);
int src1_index = mad24(y, src1_step, (x * 12) + src1_offset);
int dst_index = mad24(y, dst_step, dst_offset + (x * 12));
char4 src1_data_0 = *((__global char4 *)((__global char *)src1 + src1_index + 0));
char4 src1_data_1 = *((__global char4 *)((__global char *)src1 + src1_index + 4));
char4 src1_data_2 = *((__global char4 *)((__global char *)src1 + src1_index + 8));
char4 src2_data_0 = (char4)(src2.s0, src2.s1, src2.s2, src2.s3);
char4 src2_data_0 = (char4)(src2.s0, src2.s1, src2.s2, src2.s3);
char4 src2_data_1 = (char4)(src2.s4, src2.s5, src2.s6, src2.s7);
char4 src2_data_2 = (char4)(src2.s8, src2.s9, src2.sA, src2.sB);
char4 src2_data_2 = (char4)(src2.s8, src2.s9, src2.sA, src2.sB);
char4 data_0 = *((__global char4 *)((__global char *)dst + dst_index + 0));
char4 data_1 = *((__global char4 *)((__global char *)dst + dst_index + 4));
@@ -718,13 +718,13 @@ __kernel void arithm_s_bitwise_xor_C3_D6 (__global short *src1, int src1_step, i
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x * 24) + src1_offset);
int src1_index = mad24(y, src1_step, (x * 24) + src1_offset);
int dst_index = mad24(y, dst_step, dst_offset + (x * 24));
short4 src1_data_0 = *((__global short4 *)((__global char *)src1 + src1_index + 0 ));
short4 src1_data_1 = *((__global short4 *)((__global char *)src1 + src1_index + 8 ));
short4 src1_data_2 = *((__global short4 *)((__global char *)src1 + src1_index + 16));
short4 src2_data_0 = (short4)(src2.s0, src2.s1, src2.s2, src2.s3);
short4 src2_data_1 = (short4)(src2.s4, src2.s5, src2.s6, src2.s7);
short4 src2_data_2 = (short4)(src2.s8, src2.s9, src2.sa, src2.sb);
@@ -736,7 +736,7 @@ __kernel void arithm_s_bitwise_xor_C3_D6 (__global short *src1, int src1_step, i
short4 tmp_data_0 = src1_data_0 ^ src2_data_0;
short4 tmp_data_1 = src1_data_1 ^ src2_data_1;
short4 tmp_data_2 = src1_data_2 ^ src2_data_2;
*((__global short4 *)((__global char *)dst + dst_index + 0 ))= tmp_data_0;
*((__global short4 *)((__global char *)dst + dst_index + 8 ))= tmp_data_1;
*((__global short4 *)((__global char *)dst + dst_index + 16))= tmp_data_2;
@@ -864,7 +864,7 @@ __kernel void arithm_s_bitwise_xor_C4_D5 (__global char *src1, int src1_step,
src2.s8, src2.s9, src2.sa, src2.sb, src2.sc, src2.sd, src2.se, src2.sf);
char16 tmp_data = src1_data ^ src2_data;
*((__global char16 *)((__global char *)dst + dst_index)) = tmp_data;
}
}
@@ -891,17 +891,17 @@ __kernel void arithm_s_bitwise_xor_C4_D6 (__global short *src1, int src1_step, i
short4 src2_data_1 = (short4)(src2.s4, src2.s5, src2.s6, src2.s7);
short4 src2_data_2 = (short4)(src2.s8, src2.s9, src2.sa, src2.sb);
short4 src2_data_3 = (short4)(src2.sc, src2.sd, src2.se, src2.sf);
short4 tmp_data_0 = src1_data_0 ^ src2_data_0;
short4 tmp_data_1 = src1_data_1 ^ src2_data_1;
short4 tmp_data_2 = src1_data_2 ^ src2_data_2;
short4 tmp_data_3 = src1_data_3 ^ src2_data_3;
*((__global short4 *)((__global char *)dst + dst_index + 0 ))= tmp_data_0;
*((__global short4 *)((__global char *)dst + dst_index + 8 ))= tmp_data_1;
*((__global short4 *)((__global char *)dst + dst_index + 16))= tmp_data_2;
*((__global short4 *)((__global char *)dst + dst_index + 24))= tmp_data_3;
}
}
#endif

View File

@@ -66,7 +66,7 @@ __kernel void arithm_s_bitwise_xor_with_mask_C1_D0 (__global uchar *src1, int
x = x << 2;
#define dst_align (dst_offset & 3)
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
@@ -104,7 +104,7 @@ __kernel void arithm_s_bitwise_xor_with_mask_C1_D1 (__global char *src1, int s
x = x << 2;
#define dst_align (dst_offset & 3)
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
@@ -141,7 +141,7 @@ __kernel void arithm_s_bitwise_xor_with_mask_C1_D2 (__global ushort *src1, int
x = x << 1;
#define dst_align ((dst_offset >> 1) & 1)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
@@ -154,7 +154,7 @@ __kernel void arithm_s_bitwise_xor_with_mask_C1_D2 (__global ushort *src1, int
ushort2 data = *((__global ushort2 *)((__global uchar *)dst + dst_index));
ushort2 tmp_data = src1_data ^ src2_data;
data.x = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data.x : data.x;
data.y = ((mask_data.y) && (dst_index + 2 < dst_end )) ? tmp_data.y : data.y;
@@ -175,7 +175,7 @@ __kernel void arithm_s_bitwise_xor_with_mask_C1_D3 (__global short *src1, int
x = x << 1;
#define dst_align ((dst_offset >> 1) & 1)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
@@ -217,7 +217,7 @@ __kernel void arithm_s_bitwise_xor_with_mask_C1_D4 (__global int *src1, int
int dst_data = *((__global int *)((__global char *)dst + dst_index));
int data = src_data1 ^ src_data2;
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;
*((__global int *)((__global char *)dst + dst_index)) = data;
}
@@ -245,7 +245,7 @@ __kernel void arithm_s_bitwise_xor_with_mask_C1_D5 (__global char *src1, int src
char4 dst_data = *((__global char4 *)((__global char *)dst + dst_index));
char4 data = src1_data ^ src2_data;
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;
*((__global char4 *)((__global char *)dst + dst_index)) = data;
}
@@ -274,7 +274,7 @@ __kernel void arithm_s_bitwise_xor_with_mask_C1_D6 (__global short *src1, int sr
short4 dst_data = *((__global short4 *)((__global char *)dst + dst_index));
short4 data = src1_data ^ src2_data;
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;
*((__global short4 *)((__global char *)dst + dst_index)) = data;
}
@@ -294,7 +294,7 @@ __kernel void arithm_s_bitwise_xor_with_mask_C2_D0 (__global uchar *src1, int
x = x << 1;
#define dst_align ((dst_offset >> 1) & 1)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
@@ -330,7 +330,7 @@ __kernel void arithm_s_bitwise_xor_with_mask_C2_D1 (__global char *src1, int s
x = x << 1;
#define dst_align ((dst_offset >> 1) & 1)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
@@ -373,7 +373,7 @@ __kernel void arithm_s_bitwise_xor_with_mask_C2_D2 (__global ushort *src1, int
ushort2 dst_data = *((__global ushort2 *)((__global char *)dst + dst_index));
ushort2 data = src_data1 ^ src_data2;
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;
*((__global ushort2 *)((__global char *)dst + dst_index)) = data;
}
@@ -400,7 +400,7 @@ __kernel void arithm_s_bitwise_xor_with_mask_C2_D3 (__global short *src1, int
short2 dst_data = *((__global short2 *)((__global char *)dst + dst_index));
short2 data = src_data1 ^ src_data2;
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;
*((__global short2 *)((__global char *)dst + dst_index)) = data;
}
@@ -427,7 +427,7 @@ __kernel void arithm_s_bitwise_xor_with_mask_C2_D4 (__global int *src1, int sr
int2 dst_data = *((__global int2 *)((__global char *)dst + dst_index));
int2 data = src_data1 ^ src_data2;
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;
*((__global int2 *)((__global char *)dst + dst_index)) = data;
}
@@ -454,7 +454,7 @@ __kernel void arithm_s_bitwise_xor_with_mask_C2_D5 (__global char *src1, int s
char8 dst_data = *((__global char8 *)((__global char *)dst + dst_index));
char8 data = src1_data ^ src2_data;
data = mask_data ? data : dst_data;
*((__global char8 *)((__global char *)dst + dst_index)) = data;
@@ -483,7 +483,7 @@ __kernel void arithm_s_bitwise_xor_with_mask_C2_D6 (__global short *src1, int sr
short8 dst_data = *((__global short8 *)((__global char *)dst + dst_index));
short8 data = src1_data ^ src2_data;
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;
*((__global short8 *)((__global char *)dst + dst_index)) = data;
}
@@ -503,7 +503,7 @@ __kernel void arithm_s_bitwise_xor_with_mask_C3_D0 (__global uchar *src1, int
x = x << 2;
#define dst_align (((dst_offset % dst_step) / 3 ) & 3)
int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));
int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
@@ -514,9 +514,9 @@ __kernel void arithm_s_bitwise_xor_with_mask_C3_D0 (__global uchar *src1, int
uchar4 src1_data_1 = vload4(0, src1 + src1_index + 4);
uchar4 src1_data_2 = vload4(0, src1 + src1_index + 8);
uchar4 src2_data_0 = (uchar4)(src2.x, src2.y, src2.z, src2.x);
uchar4 src2_data_0 = (uchar4)(src2.x, src2.y, src2.z, src2.x);
uchar4 src2_data_1 = (uchar4)(src2.y, src2.z, src2.x, src2.y);
uchar4 src2_data_2 = (uchar4)(src2.z, src2.x, src2.y, src2.z);
uchar4 src2_data_2 = (uchar4)(src2.z, src2.x, src2.y, src2.z);
uchar4 mask_data = vload4(0, mask + mask_index);
@@ -529,17 +529,17 @@ __kernel void arithm_s_bitwise_xor_with_mask_C3_D0 (__global uchar *src1, int
uchar4 tmp_data_2 = src1_data_2 ^ src2_data_2;
data_0.xyz = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xyz : data_0.xyz;
data_0.w = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
data_0.w = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
? tmp_data_0.w : data_0.w;
data_1.xy = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
data_1.xy = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
? tmp_data_1.xy : data_1.xy;
data_1.zw = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
data_1.zw = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_1.zw : data_1.zw;
data_2.x = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
data_2.x = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_2.x : data_2.x;
data_2.yzw = ((mask_data.w) && (dst_index + 9 >= dst_start) && (dst_index + 9 < dst_end))
data_2.yzw = ((mask_data.w) && (dst_index + 9 >= dst_start) && (dst_index + 9 < dst_end))
? tmp_data_2.yzw : data_2.yzw;
*((__global uchar4 *)(dst + dst_index + 0)) = data_0;
@@ -563,7 +563,7 @@ __kernel void arithm_s_bitwise_xor_with_mask_C3_D1 (__global char *src1, int s
x = x << 2;
#define dst_align (((dst_offset % dst_step) / 3 ) & 3)
int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));
int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
@@ -574,9 +574,9 @@ __kernel void arithm_s_bitwise_xor_with_mask_C3_D1 (__global char *src1, int s
char4 src1_data_1 = vload4(0, src1 + src1_index + 4);
char4 src1_data_2 = vload4(0, src1 + src1_index + 8);
char4 src2_data_0 = (char4)(src2.x, src2.y, src2.z, src2.x);
char4 src2_data_0 = (char4)(src2.x, src2.y, src2.z, src2.x);
char4 src2_data_1 = (char4)(src2.y, src2.z, src2.x, src2.y);
char4 src2_data_2 = (char4)(src2.z, src2.x, src2.y, src2.z);
char4 src2_data_2 = (char4)(src2.z, src2.x, src2.y, src2.z);
uchar4 mask_data = vload4(0, mask + mask_index);
@@ -587,19 +587,19 @@ __kernel void arithm_s_bitwise_xor_with_mask_C3_D1 (__global char *src1, int s
char4 tmp_data_0 = src1_data_0 ^ src2_data_0;
char4 tmp_data_1 = src1_data_1 ^ src2_data_1;
char4 tmp_data_2 = src1_data_2 ^ src2_data_2;
data_0.xyz = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xyz : data_0.xyz;
data_0.w = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
data_0.w = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
? tmp_data_0.w : data_0.w;
data_1.xy = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
data_1.xy = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
? tmp_data_1.xy : data_1.xy;
data_1.zw = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
data_1.zw = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_1.zw : data_1.zw;
data_2.x = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
data_2.x = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_2.x : data_2.x;
data_2.yzw = ((mask_data.w) && (dst_index + 9 >= dst_start) && (dst_index + 9 < dst_end))
data_2.yzw = ((mask_data.w) && (dst_index + 9 >= dst_start) && (dst_index + 9 < dst_end))
? tmp_data_2.yzw : data_2.yzw;
*((__global char4 *)(dst + dst_index + 0)) = data_0;
@@ -622,7 +622,7 @@ __kernel void arithm_s_bitwise_xor_with_mask_C3_D2 (__global ushort *src1, int
x = x << 1;
#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
@@ -646,15 +646,15 @@ __kernel void arithm_s_bitwise_xor_with_mask_C3_D2 (__global ushort *src1, int
ushort2 tmp_data_0 = src1_data_0 ^ src2_data_0;
ushort2 tmp_data_1 = src1_data_1 ^ src2_data_1;
ushort2 tmp_data_2 = src1_data_2 ^ src2_data_2;
data_0.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;
data_1.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
data_1.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
? tmp_data_1.x : data_1.x;
data_1.y = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
data_1.y = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_1.y : data_1.y;
data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_2.xy : data_2.xy;
*((__global ushort2 *)((__global char *)dst + dst_index + 0))= data_0;
@@ -676,7 +676,7 @@ __kernel void arithm_s_bitwise_xor_with_mask_C3_D3 (__global short *src1, int
x = x << 1;
#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
@@ -703,12 +703,12 @@ __kernel void arithm_s_bitwise_xor_with_mask_C3_D3 (__global short *src1, int
data_0.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;
data_1.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
data_1.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
? tmp_data_1.x : data_1.x;
data_1.y = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
data_1.y = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_1.y : data_1.y;
data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_2.xy : data_2.xy;
*((__global short2 *)((__global char *)dst + dst_index + 0))= data_0;
@@ -727,7 +727,7 @@ __kernel void arithm_s_bitwise_xor_with_mask_C3_D4 (__global int *src1, int sr
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x * 12) + src1_offset);
int src1_index = mad24(y, src1_step, (x * 12) + src1_offset);
int mask_index = mad24(y, mask_step, x + mask_offset);
int dst_index = mad24(y, dst_step, dst_offset + (x * 12));
@@ -769,18 +769,18 @@ __kernel void arithm_s_bitwise_xor_with_mask_C3_D5 (__global char *src1, int s
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x * 12) + src1_offset);
int src1_index = mad24(y, src1_step, (x * 12) + src1_offset);
int mask_index = mad24(y, mask_step, x + mask_offset);
int dst_index = mad24(y, dst_step, dst_offset + (x * 12));
char4 src1_data_0 = *((__global char4 *)((__global char *)src1 + src1_index + 0));
char4 src1_data_1 = *((__global char4 *)((__global char *)src1 + src1_index + 4));
char4 src1_data_2 = *((__global char4 *)((__global char *)src1 + src1_index + 8));
char4 src2_data_0 = (char4)(src2.s0, src2.s1, src2.s2, src2.s3);
char4 src2_data_0 = (char4)(src2.s0, src2.s1, src2.s2, src2.s3);
char4 src2_data_1 = (char4)(src2.s4, src2.s5, src2.s6, src2.s7);
char4 src2_data_2 = (char4)(src2.s8, src2.s9, src2.sA, src2.sB);
char4 src2_data_2 = (char4)(src2.s8, src2.s9, src2.sA, src2.sB);
uchar mask_data = * (mask + mask_index);
char4 data_0 = *((__global char4 *)((__global char *)dst + dst_index + 0));
@@ -812,18 +812,18 @@ __kernel void arithm_s_bitwise_xor_with_mask_C3_D6 (__global short *src1, int sr
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x * 24) + src1_offset);
int src1_index = mad24(y, src1_step, (x * 24) + src1_offset);
int mask_index = mad24(y, mask_step, x + mask_offset);
int dst_index = mad24(y, dst_step, dst_offset + (x * 24));
short4 src1_data_0 = *((__global short4 *)((__global char *)src1 + src1_index + 0 ));
short4 src1_data_1 = *((__global short4 *)((__global char *)src1 + src1_index + 8 ));
short4 src1_data_2 = *((__global short4 *)((__global char *)src1 + src1_index + 16));
short4 src2_data_0 = (short4)(src2.s0, src2.s1, src2.s2, src2.s3);
short4 src2_data_1 = (short4)(src2.s4, src2.s5, src2.s6, src2.s7);
short4 src2_data_2 = (short4)(src2.s8, src2.s9, src2.sa, src2.sb);
uchar mask_data = * (mask + mask_index);
short4 data_0 = *((__global short4 *)((__global char *)dst + dst_index + 0 ));
@@ -833,7 +833,7 @@ __kernel void arithm_s_bitwise_xor_with_mask_C3_D6 (__global short *src1, int sr
short4 tmp_data_0 = src1_data_0 ^ src2_data_0;
short4 tmp_data_1 = src1_data_1 ^ src2_data_1;
short4 tmp_data_2 = src1_data_2 ^ src2_data_2;
data_0 = mask_data ? tmp_data_0 : data_0;
data_1 = mask_data ? tmp_data_1 : data_1;
data_2 = mask_data ? tmp_data_2 : data_2;
@@ -865,7 +865,7 @@ __kernel void arithm_s_bitwise_xor_with_mask_C4_D0 (__global uchar *src1, int
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
uchar4 data = src_data1 ^ src2;
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;
*((__global uchar4 *)(dst + dst_index)) = data;
}
@@ -893,7 +893,7 @@ __kernel void arithm_s_bitwise_xor_with_mask_C4_D1 (__global char *src1, int s
char4 dst_data = *((__global char4 *)(dst + dst_index));
char4 data = src_data1 ^ src2;
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;
*((__global char4 *)(dst + dst_index)) = data;
}
@@ -920,7 +920,7 @@ __kernel void arithm_s_bitwise_xor_with_mask_C4_D2 (__global ushort *src1, int
ushort4 dst_data = *((__global ushort4 *)((__global char *)dst + dst_index));
ushort4 data = src_data1 ^ src2;
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;
*((__global ushort4 *)((__global char *)dst + dst_index)) = data;
}
@@ -946,7 +946,7 @@ __kernel void arithm_s_bitwise_xor_with_mask_C4_D3 (__global short *src1, int
short4 dst_data = *((__global short4 *)((__global char *)dst + dst_index));
short4 data = src_data1 ^ src2;
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;
*((__global short4 *)((__global char *)dst + dst_index)) = data;
}
@@ -972,7 +972,7 @@ __kernel void arithm_s_bitwise_xor_with_mask_C4_D4 (__global int *src1, int sr
int4 dst_data = *((__global int4 *)((__global char *)dst + dst_index));
int4 data = src_data1 ^ src2;
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;
*((__global int4 *)((__global char *)dst + dst_index)) = data;
}
@@ -1000,7 +1000,7 @@ __kernel void arithm_s_bitwise_xor_with_mask_C4_D5 (__global char *src1, int s
char16 dst_data = *((__global char16 *)((__global char *)dst + dst_index));
char16 data = src1_data ^ src2_data;
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;
*((__global char16 *)((__global char *)dst + dst_index)) = data;
}
@@ -1032,7 +1032,7 @@ __kernel void arithm_s_bitwise_xor_with_mask_C4_D6 (__global short *src1, int sr
short4 src2_data_1 = (short4)(src2.s4, src2.s5, src2.s6, src2.s7);
short4 src2_data_2 = (short4)(src2.s8, src2.s9, src2.sa, src2.sb);
short4 src2_data_3 = (short4)(src2.sc, src2.sd, src2.se, src2.sf);
short4 dst_data_0 = *((__global short4 *)((__global char *)dst + dst_index + 0));
short4 dst_data_1 = *((__global short4 *)((__global char *)dst + dst_index + 8));
short4 dst_data_2 = *((__global short4 *)((__global char *)dst + dst_index + 16));
@@ -1042,10 +1042,10 @@ __kernel void arithm_s_bitwise_xor_with_mask_C4_D6 (__global short *src1, int sr
short4 data_1 = src1_data_1 ^ src2_data_1;
short4 data_2 = src1_data_2 ^ src2_data_2;
short4 data_3 = src1_data_3 ^ src2_data_3;
data_0 = mask_data ? data_0 : dst_data_0;
data_1 = mask_data ? data_1 : dst_data_1;
data_2 = mask_data ? data_2 : dst_data_2;
data_0 = mask_data ? data_0 : dst_data_0;
data_1 = mask_data ? data_1 : dst_data_1;
data_2 = mask_data ? data_2 : dst_data_2;
data_3 = mask_data ? data_3 : dst_data_3;
*((__global short4 *)((__global char *)dst + dst_index + 0)) = data_0;

View File

@@ -63,8 +63,8 @@ __kernel void arithm_compare_eq_D0 (__global uchar *src1, int src1_step, int src
x = x << 2;
#define dst_align (dst_offset & 3)
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
@@ -99,8 +99,8 @@ __kernel void arithm_compare_eq_D2 (__global ushort *src1, int src1_step, int sr
x = x << 2;
#define dst_align ((dst_offset >> 1) & 3)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
@@ -137,8 +137,8 @@ __kernel void arithm_compare_eq_D3 (__global short *src1, int src1_step, int src
x = x << 2;
#define dst_align ((dst_offset >> 1) & 3)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
@@ -170,11 +170,11 @@ __kernel void arithm_compare_eq_D4 (__global int *src1, int src1_step, int src1_
int y = get_global_id(1);
if (x < cols && y < rows)
{
{
x = x << 2;
#define dst_align ((dst_offset >> 2) & 3)
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
@@ -206,8 +206,8 @@ __kernel void arithm_compare_eq_D5 (__global float *src1, int src1_step, int src
{
x = x << 2;
#define dst_align ((dst_offset >> 2) & 3)
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
@@ -240,8 +240,8 @@ __kernel void arithm_compare_eq_D6 (__global double *src1, int src1_step, int sr
{
x = x << 2;
#define dst_align ((dst_offset >> 3) & 3)
int src1_index = mad24(y, src1_step, (x << 3) + src1_offset - (dst_align << 3));
int src2_index = mad24(y, src2_step, (x << 3) + src2_offset - (dst_align << 3));
int src1_index = mad24(y, src1_step, (x << 3) + src1_offset - (dst_align << 3));
int src2_index = mad24(y, src2_step, (x << 3) + src2_offset - (dst_align << 3));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
@@ -276,8 +276,8 @@ __kernel void arithm_compare_gt_D0 (__global uchar *src1, int src1_step, int src
x = x << 2;
#define dst_align (dst_offset & 3)
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
@@ -312,8 +312,8 @@ __kernel void arithm_compare_gt_D2 (__global ushort *src1, int src1_step, int sr
x = x << 2;
#define dst_align ((dst_offset >> 1) & 3)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
@@ -350,8 +350,8 @@ __kernel void arithm_compare_gt_D3 (__global short *src1, int src1_step, int src
x = x << 2;
#define dst_align ((dst_offset >> 1) & 3)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
@@ -384,8 +384,8 @@ __kernel void arithm_compare_gt_D4 (__global int *src1, int src1_step, int src1_
{
x = x << 2;
#define dst_align ((dst_offset >> 2) & 3)
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
@@ -417,8 +417,8 @@ __kernel void arithm_compare_gt_D5 (__global float *src1, int src1_step, int src
{
x = x << 2;
#define dst_align ((dst_offset >> 2) & 3)
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
@@ -451,8 +451,8 @@ __kernel void arithm_compare_gt_D6 (__global double *src1, int src1_step, int sr
{
x = x << 2;
#define dst_align ((dst_offset >> 3) & 3)
int src1_index = mad24(y, src1_step, (x << 3) + src1_offset - (dst_align << 3));
int src2_index = mad24(y, src2_step, (x << 3) + src2_offset - (dst_align << 3));
int src1_index = mad24(y, src1_step, (x << 3) + src1_offset - (dst_align << 3));
int src2_index = mad24(y, src2_step, (x << 3) + src2_offset - (dst_align << 3));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
@@ -487,8 +487,8 @@ __kernel void arithm_compare_ge_D0 (__global uchar *src1, int src1_step, int src
x = x << 2;
#define dst_align (dst_offset & 3)
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
@@ -525,8 +525,8 @@ __kernel void arithm_compare_ge_D2 (__global ushort *src1, int src1_step, int sr
x = x << 2;
#define dst_align ((dst_offset >> 1) & 3)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
@@ -563,8 +563,8 @@ __kernel void arithm_compare_ge_D3 (__global short *src1, int src1_step, int src
x = x << 2;
#define dst_align ((dst_offset >> 1)& 3)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
@@ -598,8 +598,8 @@ __kernel void arithm_compare_ge_D4 (__global int *src1, int src1_step, int src1_
x = x << 2;
#define dst_align ((dst_offset >> 2)& 3)
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
@@ -632,8 +632,8 @@ __kernel void arithm_compare_ge_D5 (__global float *src1, int src1_step, int src
x = x << 2;
#define dst_align ((dst_offset >> 2)& 3)
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
@@ -667,8 +667,8 @@ __kernel void arithm_compare_ge_D6 (__global double *src1, int src1_step, int sr
x = x << 2;
#define dst_align ((dst_offset >> 3)& 3)
int src1_index = mad24(y, src1_step, (x << 3) + src1_offset - (dst_align << 3));
int src2_index = mad24(y, src2_step, (x << 3) + src2_offset - (dst_align << 3));
int src1_index = mad24(y, src1_step, (x << 3) + src1_offset - (dst_align << 3));
int src2_index = mad24(y, src2_step, (x << 3) + src2_offset - (dst_align << 3));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);

View File

@@ -59,8 +59,8 @@ __kernel void arithm_compare_ne_D0 (__global uchar *src1, int src1_step, int src
x = x << 2;
#define dst_align (dst_offset & 3)
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
@@ -97,8 +97,8 @@ __kernel void arithm_compare_ne_D2 (__global ushort *src1, int src1_step, int sr
x = x << 2;
#define dst_align ((dst_offset >> 1)& 3)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
@@ -135,8 +135,8 @@ __kernel void arithm_compare_ne_D3 (__global short *src1, int src1_step, int src
x = x << 2;
#define dst_align ((dst_offset >> 1)& 3)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
@@ -169,8 +169,8 @@ __kernel void arithm_compare_ne_D4 (__global int *src1, int src1_step, int src1_
{
x = x << 2;
#define dst_align ((dst_offset >> 2)& 3)
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
@@ -202,8 +202,8 @@ __kernel void arithm_compare_ne_D5 (__global float *src1, int src1_step, int src
{
x = x << 2;
#define dst_align ((dst_offset >> 2) & 3)
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
@@ -236,8 +236,8 @@ __kernel void arithm_compare_ne_D6 (__global double *src1, int src1_step, int sr
{
x = x << 2;
#define dst_align ((dst_offset >> 3) & 3)
int src1_index = mad24(y, src1_step, (x << 3) + src1_offset - (dst_align << 3));
int src2_index = mad24(y, src2_step, (x << 3) + src2_offset - (dst_align << 3));
int src1_index = mad24(y, src1_step, (x << 3) + src1_offset - (dst_align << 3));
int src2_index = mad24(y, src2_step, (x << 3) + src2_offset - (dst_align << 3));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
@@ -258,7 +258,7 @@ __kernel void arithm_compare_ne_D6 (__global double *src1, int src1_step, int sr
}
#endif
/***********************************Compare LT*******************************/
__kernel void arithm_compare_lt_D0 (__global uchar *src1, int src1_step, int src1_offset,
__global uchar *src2, int src2_step, int src2_offset,
@@ -273,8 +273,8 @@ __kernel void arithm_compare_lt_D0 (__global uchar *src1, int src1_step, int src
x = x << 2;
#define dst_align (dst_offset & 3)
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
@@ -311,8 +311,8 @@ __kernel void arithm_compare_lt_D2 (__global ushort *src1, int src1_step, int sr
x = x << 2;
#define dst_align ((dst_offset >> 1) & 3)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
@@ -349,8 +349,8 @@ __kernel void arithm_compare_lt_D3 (__global short *src1, int src1_step, int src
x = x << 2;
#define dst_align ((dst_offset >> 1) & 3)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
@@ -383,8 +383,8 @@ __kernel void arithm_compare_lt_D4 (__global int *src1, int src1_step, int src1_
{
x = x << 2;
#define dst_align ((dst_offset >> 2) & 3)
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
@@ -416,8 +416,8 @@ __kernel void arithm_compare_lt_D5 (__global float *src1, int src1_step, int src
{
x = x << 2;
#define dst_align ((dst_offset >> 2) & 3)
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
@@ -450,8 +450,8 @@ __kernel void arithm_compare_lt_D6 (__global double *src1, int src1_step, int sr
{
x = x << 2;
#define dst_align ((dst_offset >> 3) & 3)
int src1_index = mad24(y, src1_step, (x << 3) + src1_offset - (dst_align << 3));
int src2_index = mad24(y, src2_step, (x << 3) + src2_offset - (dst_align << 3));
int src1_index = mad24(y, src1_step, (x << 3) + src1_offset - (dst_align << 3));
int src2_index = mad24(y, src2_step, (x << 3) + src2_offset - (dst_align << 3));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
@@ -486,8 +486,8 @@ __kernel void arithm_compare_le_D0 (__global uchar *src1, int src1_step, int src
x = x << 2;
#define dst_align (dst_offset & 3)
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
@@ -524,8 +524,8 @@ __kernel void arithm_compare_le_D2 (__global ushort *src1, int src1_step, int sr
x = x << 2;
#define dst_align ((dst_offset >> 1) & 3)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
@@ -562,8 +562,8 @@ __kernel void arithm_compare_le_D3 (__global short *src1, int src1_step, int src
x = x << 2;
#define dst_align ((dst_offset >> 1) & 3)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
@@ -596,8 +596,8 @@ __kernel void arithm_compare_le_D4 (__global int *src1, int src1_step, int src1_
{
x = x << 2;
#define dst_align ((dst_offset >> 2)& 3)
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
@@ -629,8 +629,8 @@ __kernel void arithm_compare_le_D5 (__global float *src1, int src1_step, int src
{
x = x << 2;
#define dst_align ((dst_offset >> 2)& 3)
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
@@ -663,8 +663,8 @@ __kernel void arithm_compare_le_D6 (__global double *src1, int src1_step, int sr
{
x = x << 2;
#define dst_align ((dst_offset >> 3)& 3)
int src1_index = mad24(y, src1_step, (x << 3) + src1_offset - (dst_align << 3));
int src2_index = mad24(y, src2_step, (x << 3) + src2_offset - (dst_align << 3));
int src1_index = mad24(y, src1_step, (x << 3) + src1_offset - (dst_align << 3));
int src2_index = mad24(y, src2_step, (x << 3) + src2_offset - (dst_align << 3));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);

View File

@@ -49,7 +49,7 @@ typedef double F ;
typedef double4 F4;
#define convert_F4 convert_double4
#define convert_F double
#else
#else
typedef float F;
typedef float4 F4;
#define convert_F4 convert_float4
@@ -102,8 +102,8 @@ __kernel void arithm_div_D0 (__global uchar *src1, int src1_step, int src1_offse
x = x << 2;
#define dst_align (dst_offset & 3)
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
@@ -143,8 +143,8 @@ __kernel void arithm_div_D2 (__global ushort *src1, int src1_step, int src1_offs
x = x << 2;
#define dst_align ((dst_offset >> 1) & 3)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
@@ -183,8 +183,8 @@ __kernel void arithm_div_D3 (__global short *src1, int src1_step, int src1_offse
x = x << 2;
#define dst_align ((dst_offset >> 1) & 3)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
@@ -298,7 +298,7 @@ __kernel void arithm_s_div_D0 (__global uchar *src, int src_step, int src_offset
x = x << 2;
#define dst_align (dst_offset & 3)
int src_index = mad24(y, src_step, x + src_offset - dst_align);
int src_index = mad24(y, src_step, x + src_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
@@ -334,7 +334,7 @@ __kernel void arithm_s_div_D2 (__global ushort *src, int src_step, int src_offse
x = x << 2;
#define dst_align ((dst_offset >> 1) & 3)
int src_index = mad24(y, src_step, (x << 1) + src_offset - (dst_align << 1));
int src_index = mad24(y, src_step, (x << 1) + src_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
@@ -369,7 +369,7 @@ __kernel void arithm_s_div_D3 (__global short *src, int src_step, int src_offset
x = x << 2;
#define dst_align ((dst_offset >> 1) & 3)
int src_index = mad24(y, src_step, (x << 1) + src_offset - (dst_align << 1));
int src_index = mad24(y, src_step, (x << 1) + src_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);

View File

@@ -84,7 +84,7 @@ __kernel void arithm_exp_D6(int rows, int cols, int srcStep, int dstStep, int sr
double src_data = *((__global double *)((__global char *)src + srcIdx));
double dst_data = exp(src_data);
*((__global double *)((__global char *)dst + dstIdx )) = dst_data;
// dst[dstIdx] = exp(src[srcIdx]);
}

View File

@@ -48,7 +48,7 @@
#endif
//////////////////////////////////////////////////////////////////////////////////////////////////////
/////////////////////////////////////////////flip rows///////////////////////////////////////////////
/////////////////////////////////////////////flip rows///////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////////////////////////
__kernel void arithm_flip_rows_D0 (__global uchar *src, int src_step, int src_offset,
__global uchar *dst, int dst_step, int dst_offset,
@@ -62,8 +62,8 @@ __kernel void arithm_flip_rows_D0 (__global uchar *src, int src_step, int src_of
x = x << 2;
#define dst_align (dst_offset & 3)
int src_index_0 = mad24(y, src_step, x + src_offset - dst_align);
int src_index_1 = mad24(rows - y - 1, src_step, x + src_offset - dst_align);
int src_index_0 = mad24(y, src_step, x + src_offset - dst_align);
int src_index_1 = mad24(rows - y - 1, src_step, x + src_offset - dst_align);
int dst_start_0 = mad24(y, dst_step, dst_offset);
int dst_start_1 = mad24(rows - y - 1, dst_step, dst_offset);
@@ -71,22 +71,22 @@ __kernel void arithm_flip_rows_D0 (__global uchar *src, int src_step, int src_of
int dst_end_1 = mad24(rows - y - 1, dst_step, dst_offset + dst_step1);
int dst_index_0 = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
int dst_index_1 = mad24(rows - y - 1, dst_step, dst_offset + x & (int)0xfffffffc);
int src1_index_fix = src_index_0 < 0 ? 0 : src_index_0;
int src2_index_fix = src_index_1 < 0 ? 0 : src_index_1;
int src1_index_fix = src_index_0 < 0 ? 0 : src_index_0;
int src2_index_fix = src_index_1 < 0 ? 0 : src_index_1;
uchar4 src_data_0 = vload4(0, src + src1_index_fix);
uchar4 src_data_1 = vload4(0, src + src2_index_fix);
if(src_index_0 < 0)
{
uchar4 tmp;
tmp.xyzw = (src_index_0 == -2) ? src_data_0.zwxy:src_data_0.yzwx;
src_data_0.xyzw = (src_index_0 == -1) ? src_data_0.wxyz:tmp.xyzw;
}
if(src_index_1 < 0)
{
uchar4 tmp;
tmp.xyzw = (src_index_1 == -2) ? src_data_1.zwxy:src_data_1.yzwx;
src_data_1.xyzw = (src_index_1 == -1) ? src_data_1.wxyz:tmp.xyzw;
}
if(src_index_0 < 0)
{
uchar4 tmp;
tmp.xyzw = (src_index_0 == -2) ? src_data_0.zwxy:src_data_0.yzwx;
src_data_0.xyzw = (src_index_0 == -1) ? src_data_0.wxyz:tmp.xyzw;
}
if(src_index_1 < 0)
{
uchar4 tmp;
tmp.xyzw = (src_index_1 == -2) ? src_data_1.zwxy:src_data_1.yzwx;
src_data_1.xyzw = (src_index_1 == -1) ? src_data_1.wxyz:tmp.xyzw;
}
uchar4 dst_data_0 = *((__global uchar4 *)(dst + dst_index_0));
uchar4 dst_data_1 = *((__global uchar4 *)(dst + dst_index_1));
@@ -117,8 +117,8 @@ __kernel void arithm_flip_rows_D1 (__global char *src, int src_step, int src_off
x = x << 2;
#define dst_align (dst_offset & 3)
int src_index_0 = mad24(y, src_step, x + src_offset - dst_align);
int src_index_1 = mad24(rows - y - 1, src_step, x + src_offset - dst_align);
int src_index_0 = mad24(y, src_step, x + src_offset - dst_align);
int src_index_1 = mad24(rows - y - 1, src_step, x + src_offset - dst_align);
int dst_start_0 = mad24(y, dst_step, dst_offset);
int dst_start_1 = mad24(rows - y - 1, dst_step, dst_offset);
@@ -159,8 +159,8 @@ __kernel void arithm_flip_rows_D2 (__global ushort *src, int src_step, int src_o
x = x << 2;
#define dst_align (((dst_offset >> 1) & 3) << 1)
int src_index_0 = mad24(y, src_step, (x << 1) + src_offset - dst_align);
int src_index_1 = mad24(rows - y - 1, src_step, (x << 1) + src_offset - dst_align);
int src_index_0 = mad24(y, src_step, (x << 1) + src_offset - dst_align);
int src_index_1 = mad24(rows - y - 1, src_step, (x << 1) + src_offset - dst_align);
int dst_start_0 = mad24(y, dst_step, dst_offset);
int dst_start_1 = mad24(rows - y - 1, dst_step, dst_offset);
@@ -201,8 +201,8 @@ __kernel void arithm_flip_rows_D3 (__global short *src, int src_step, int src_of
x = x << 2;
#define dst_align (((dst_offset >> 1) & 3) << 1)
int src_index_0 = mad24(y, src_step, (x << 1) + src_offset - dst_align);
int src_index_1 = mad24(rows - y - 1, src_step, (x << 1) + src_offset - dst_align);
int src_index_0 = mad24(y, src_step, (x << 1) + src_offset - dst_align);
int src_index_1 = mad24(rows - y - 1, src_step, (x << 1) + src_offset - dst_align);
int dst_start_0 = mad24(y, dst_step, dst_offset);
int dst_start_1 = mad24(rows - y - 1, dst_step, dst_offset);
@@ -243,7 +243,7 @@ __kernel void arithm_flip_rows_D4 (__global int *src, int src_step, int src_offs
{
int src_index_0 = mad24(y, src_step, (x << 2) + src_offset);
int src_index_1 = mad24(rows - y - 1, src_step, (x << 2) + src_offset);
int dst_index_0 = mad24(y, dst_step, (x << 2) + dst_offset);
int dst_index_1 = mad24(rows - y - 1, dst_step, (x << 2) + dst_offset);
@@ -265,7 +265,7 @@ __kernel void arithm_flip_rows_D5 (__global float *src, int src_step, int src_of
{
int src_index_0 = mad24(y, src_step, (x << 2) + src_offset);
int src_index_1 = mad24(rows - y - 1, src_step, (x << 2) + src_offset);
int dst_index_0 = mad24(y, dst_step, (x << 2) + dst_offset);
int dst_index_1 = mad24(rows - y - 1, dst_step, (x << 2) + dst_offset);
@@ -289,7 +289,7 @@ __kernel void arithm_flip_rows_D6 (__global double *src, int src_step, int src_o
{
int src_index_0 = mad24(y, src_step, (x << 3) + src_offset);
int src_index_1 = mad24(rows - y - 1, src_step, (x << 3) + src_offset);
int dst_index_0 = mad24(y, dst_step, (x << 3) + dst_offset);
int dst_index_1 = mad24(rows - y - 1, dst_step, (x << 3) + dst_offset);
@@ -302,7 +302,7 @@ __kernel void arithm_flip_rows_D6 (__global double *src, int src_step, int src_o
}
#endif
//////////////////////////////////////////////////////////////////////////////////////////////////////
/////////////////////////////////////////////flip cols///////////////////////////////////////////////
/////////////////////////////////////////////flip cols///////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////////////////////////
__kernel void arithm_flip_cols_C1_D0 (__global uchar *src, int src_step, int src_offset,
__global uchar *dst, int dst_step, int dst_offset,
@@ -315,7 +315,7 @@ __kernel void arithm_flip_cols_C1_D0 (__global uchar *src, int src_step, int src
{
int src_index_0 = mad24(y, src_step, (x) + src_offset);
int src_index_1 = mad24(y, src_step, (cols - x -1) + src_offset);
int dst_index_0 = mad24(y, dst_step, (x) + dst_offset);
int dst_index_1 = mad24(y, dst_step, (cols - x -1) + dst_offset);
@@ -337,7 +337,7 @@ __kernel void arithm_flip_cols_C1_D1 (__global char *src, int src_step, int src_
{
int src_index_0 = mad24(y, src_step, (x) + src_offset);
int src_index_1 = mad24(y, src_step, (cols - x -1) + src_offset);
int dst_index_0 = mad24(y, dst_step, (x) + dst_offset);
int dst_index_1 = mad24(y, dst_step, (cols - x -1) + dst_offset);
@@ -359,7 +359,7 @@ __kernel void arithm_flip_cols_C1_D2 (__global ushort *src, int src_step, int sr
{
int src_index_0 = mad24(y, src_step, (x << 1) + src_offset);
int src_index_1 = mad24(y, src_step, ((cols - x -1) << 1) + src_offset);
int dst_index_0 = mad24(y, dst_step, (x << 1) + dst_offset);
int dst_index_1 = mad24(y, dst_step, ((cols - x -1) << 1) + dst_offset);
@@ -381,7 +381,7 @@ __kernel void arithm_flip_cols_C1_D3 (__global short *src, int src_step, int src
{
int src_index_0 = mad24(y, src_step, (x << 1) + src_offset);
int src_index_1 = mad24(y, src_step, ((cols - x -1) << 1) + src_offset);
int dst_index_0 = mad24(y, dst_step, (x << 1) + dst_offset);
int dst_index_1 = mad24(y, dst_step, ((cols - x -1) << 1) + dst_offset);
@@ -403,7 +403,7 @@ __kernel void arithm_flip_cols_C1_D4 (__global int *src, int src_step, int src_o
{
int src_index_0 = mad24(y, src_step, (x << 2) + src_offset);
int src_index_1 = mad24(y, src_step, ((cols - x -1) << 2) + src_offset);
int dst_index_0 = mad24(y, dst_step, (x << 2) + dst_offset);
int dst_index_1 = mad24(y, dst_step, ((cols - x -1) << 2) + dst_offset);
@@ -425,7 +425,7 @@ __kernel void arithm_flip_cols_C1_D5 (__global float *src, int src_step, int src
{
int src_index_0 = mad24(y, src_step, (x << 2) + src_offset);
int src_index_1 = mad24(y, src_step, ((cols - x -1) << 2) + src_offset);
int dst_index_0 = mad24(y, dst_step, (x << 2) + dst_offset);
int dst_index_1 = mad24(y, dst_step, ((cols - x -1) << 2) + dst_offset);
@@ -449,7 +449,7 @@ __kernel void arithm_flip_cols_C1_D6 (__global double *src, int src_step, int sr
{
int src_index_0 = mad24(y, src_step, (x << 3) + src_offset);
int src_index_1 = mad24(y, src_step, ((cols - x -1) << 3) + src_offset);
int dst_index_0 = mad24(y, dst_step, (x << 3) + dst_offset);
int dst_index_1 = mad24(y, dst_step, ((cols - x -1) << 3) + dst_offset);
@@ -472,7 +472,7 @@ __kernel void arithm_flip_cols_C2_D0 (__global uchar *src, int src_step, int src
{
int src_index_0 = mad24(y, src_step, (x << 1) + src_offset);
int src_index_1 = mad24(y, src_step, ((cols - x -1) << 1) + src_offset);
int dst_index_0 = mad24(y, dst_step, (x << 1) + dst_offset);
int dst_index_1 = mad24(y, dst_step, ((cols - x -1) << 1) + dst_offset);
@@ -494,7 +494,7 @@ __kernel void arithm_flip_cols_C2_D1 (__global char *src, int src_step, int src_
{
int src_index_0 = mad24(y, src_step, (x << 1) + src_offset);
int src_index_1 = mad24(y, src_step, ((cols - x -1) << 1) + src_offset);
int dst_index_0 = mad24(y, dst_step, (x << 1) + dst_offset);
int dst_index_1 = mad24(y, dst_step, ((cols - x -1) << 1) + dst_offset);
@@ -516,7 +516,7 @@ __kernel void arithm_flip_cols_C2_D2 (__global ushort *src, int src_step, int sr
{
int src_index_0 = mad24(y, src_step, (x << 2) + src_offset);
int src_index_1 = mad24(y, src_step, ((cols - x -1) << 2) + src_offset);
int dst_index_0 = mad24(y, dst_step, (x << 2) + dst_offset);
int dst_index_1 = mad24(y, dst_step, ((cols - x -1) << 2) + dst_offset);
@@ -538,7 +538,7 @@ __kernel void arithm_flip_cols_C2_D3 (__global short *src, int src_step, int src
{
int src_index_0 = mad24(y, src_step, (x << 2) + src_offset);
int src_index_1 = mad24(y, src_step, ((cols - x -1) << 2) + src_offset);
int dst_index_0 = mad24(y, dst_step, (x << 2) + dst_offset);
int dst_index_1 = mad24(y, dst_step, ((cols - x -1) << 2) + dst_offset);
@@ -560,7 +560,7 @@ __kernel void arithm_flip_cols_C2_D4 (__global int *src, int src_step, int src_o
{
int src_index_0 = mad24(y, src_step, (x << 3) + src_offset);
int src_index_1 = mad24(y, src_step, ((cols - x -1) << 3) + src_offset);
int dst_index_0 = mad24(y, dst_step, (x << 3) + dst_offset);
int dst_index_1 = mad24(y, dst_step, ((cols - x -1) << 3) + dst_offset);
@@ -582,7 +582,7 @@ __kernel void arithm_flip_cols_C2_D5 (__global float *src, int src_step, int src
{
int src_index_0 = mad24(y, src_step, (x << 3) + src_offset);
int src_index_1 = mad24(y, src_step, ((cols - x -1) << 3) + src_offset);
int dst_index_0 = mad24(y, dst_step, (x << 3) + dst_offset);
int dst_index_1 = mad24(y, dst_step, ((cols - x -1) << 3) + dst_offset);
@@ -606,7 +606,7 @@ __kernel void arithm_flip_cols_C2_D6 (__global double *src, int src_step, int sr
{
int src_index_0 = mad24(y, src_step, (x << 4) + src_offset);
int src_index_1 = mad24(y, src_step, ((cols - x -1) << 4) + src_offset);
int dst_index_0 = mad24(y, dst_step, (x << 4) + dst_offset);
int dst_index_1 = mad24(y, dst_step, ((cols - x -1) << 4) + dst_offset);
@@ -630,7 +630,7 @@ __kernel void arithm_flip_cols_C3_D0 (__global uchar *src, int src_step, int src
{
int src_index_0 = mad24(y, src_step, (x) * 3 + src_offset);
int src_index_1 = mad24(y, src_step, (cols - x -1) * 3 + src_offset);
int dst_index_0 = mad24(y, dst_step, (x) * 3 + dst_offset);
int dst_index_1 = mad24(y, dst_step, (cols - x -1) * 3 + dst_offset);
@@ -662,7 +662,7 @@ __kernel void arithm_flip_cols_C3_D1 (__global char *src, int src_step, int src_
{
int src_index_0 = mad24(y, src_step, (x) * 3 + src_offset);
int src_index_1 = mad24(y, src_step, (cols - x -1) * 3 + src_offset);
int dst_index_0 = mad24(y, dst_step, (x) * 3 + dst_offset);
int dst_index_1 = mad24(y, dst_step, (cols - x -1) * 3 + dst_offset);
@@ -694,7 +694,7 @@ __kernel void arithm_flip_cols_C3_D2 (__global ushort *src, int src_step, int sr
{
int src_index_0 = mad24(y, src_step, (x * 3 << 1) + src_offset);
int src_index_1 = mad24(y, src_step, ((cols - x -1) * 3 << 1) + src_offset);
int dst_index_0 = mad24(y, dst_step, (x * 3 << 1) + dst_offset);
int dst_index_1 = mad24(y, dst_step, ((cols - x -1) * 3 << 1) + dst_offset);
@@ -726,7 +726,7 @@ __kernel void arithm_flip_cols_C3_D3 (__global short *src, int src_step, int src
{
int src_index_0 = mad24(y, src_step, (x * 3 << 1) + src_offset);
int src_index_1 = mad24(y, src_step, ((cols - x -1) * 3 << 1) + src_offset);
int dst_index_0 = mad24(y, dst_step, (x * 3 << 1) + dst_offset);
int dst_index_1 = mad24(y, dst_step, ((cols - x -1) * 3 << 1) + dst_offset);
@@ -758,14 +758,14 @@ __kernel void arithm_flip_cols_C3_D4 (__global int *src, int src_step, int src_o
{
int src_index_0 = mad24(y, src_step, (x * 3 << 2) + src_offset);
int src_index_1 = mad24(y, src_step, ((cols - x -1) * 3 << 2) + src_offset);
int dst_index_0 = mad24(y, dst_step, (x * 3 << 2) + dst_offset);
int dst_index_1 = mad24(y, dst_step, ((cols - x -1) * 3 << 2) + dst_offset);
int data0_0 = *((__global int *)((__global char *)src + src_index_0 + 0));
int data0_1 = *((__global int *)((__global char *)src + src_index_0 + 4));
int data0_2 = *((__global int *)((__global char *)src + src_index_0 + 8));
int data1_0 = *((__global int *)((__global char *)src + src_index_1 + 0));
int data1_1 = *((__global int *)((__global char *)src + src_index_1 + 4));
int data1_2 = *((__global int *)((__global char *)src + src_index_1 + 8));
@@ -773,7 +773,7 @@ __kernel void arithm_flip_cols_C3_D4 (__global int *src, int src_step, int src_o
*((__global int *)((__global char *)dst + dst_index_0 + 0)) = data1_0;
*((__global int *)((__global char *)dst + dst_index_0 + 4)) = data1_1;
*((__global int *)((__global char *)dst + dst_index_0 + 8)) = data1_2;
*((__global int *)((__global char *)dst + dst_index_1 + 0)) = data0_0;
*((__global int *)((__global char *)dst + dst_index_1 + 4)) = data0_1;
*((__global int *)((__global char *)dst + dst_index_1 + 8)) = data0_2;
@@ -790,14 +790,14 @@ __kernel void arithm_flip_cols_C3_D5 (__global float *src, int src_step, int src
{
int src_index_0 = mad24(y, src_step, (x * 3 << 2) + src_offset);
int src_index_1 = mad24(y, src_step, ((cols - x -1) * 3 << 2) + src_offset);
int dst_index_0 = mad24(y, dst_step, (x * 3 << 2) + dst_offset);
int dst_index_1 = mad24(y, dst_step, ((cols - x -1) * 3 << 2) + dst_offset);
float data0_0 = *((__global float *)((__global char *)src + src_index_0 + 0));
float data0_1 = *((__global float *)((__global char *)src + src_index_0 + 4));
float data0_2 = *((__global float *)((__global char *)src + src_index_0 + 8));
float data1_0 = *((__global float *)((__global char *)src + src_index_1 + 0));
float data1_1 = *((__global float *)((__global char *)src + src_index_1 + 4));
float data1_2 = *((__global float *)((__global char *)src + src_index_1 + 8));
@@ -805,7 +805,7 @@ __kernel void arithm_flip_cols_C3_D5 (__global float *src, int src_step, int src
*((__global float *)((__global char *)dst + dst_index_0 + 0)) = data1_0;
*((__global float *)((__global char *)dst + dst_index_0 + 4)) = data1_1;
*((__global float *)((__global char *)dst + dst_index_0 + 8)) = data1_2;
*((__global float *)((__global char *)dst + dst_index_1 + 0)) = data0_0;
*((__global float *)((__global char *)dst + dst_index_1 + 4)) = data0_1;
*((__global float *)((__global char *)dst + dst_index_1 + 8)) = data0_2;
@@ -824,14 +824,14 @@ __kernel void arithm_flip_cols_C3_D6 (__global double *src, int src_step, int sr
{
int src_index_0 = mad24(y, src_step, (x * 3 << 3) + src_offset);
int src_index_1 = mad24(y, src_step, ((cols - x -1) * 3 << 3) + src_offset);
int dst_index_0 = mad24(y, dst_step, (x * 3 << 3) + dst_offset);
int dst_index_1 = mad24(y, dst_step, ((cols - x -1) * 3 << 3) + dst_offset);
double data0_0 = *((__global double *)((__global char *)src + src_index_0 + 0));
double data0_1 = *((__global double *)((__global char *)src + src_index_0 + 8));
double data0_2 = *((__global double *)((__global char *)src + src_index_0 + 16));
double data1_0 = *((__global double *)((__global char *)src + src_index_1 + 0));
double data1_1 = *((__global double *)((__global char *)src + src_index_1 + 8));
double data1_2 = *((__global double *)((__global char *)src + src_index_1 + 16));
@@ -839,7 +839,7 @@ __kernel void arithm_flip_cols_C3_D6 (__global double *src, int src_step, int sr
*((__global double *)((__global char *)dst + dst_index_0 + 0 )) = data1_0;
*((__global double *)((__global char *)dst + dst_index_0 + 8 )) = data1_1;
*((__global double *)((__global char *)dst + dst_index_0 + 16)) = data1_2;
*((__global double *)((__global char *)dst + dst_index_1 + 0 )) = data0_0;
*((__global double *)((__global char *)dst + dst_index_1 + 8 )) = data0_1;
*((__global double *)((__global char *)dst + dst_index_1 + 16)) = data0_2;
@@ -857,7 +857,7 @@ __kernel void arithm_flip_cols_C4_D0 (__global uchar *src, int src_step, int src
{
int src_index_0 = mad24(y, src_step, (x << 2) + src_offset);
int src_index_1 = mad24(y, src_step, ((cols - x -1) << 2) + src_offset);
int dst_index_0 = mad24(y, dst_step, (x << 2) + dst_offset);
int dst_index_1 = mad24(y, dst_step, ((cols - x -1) << 2) + dst_offset);
@@ -879,7 +879,7 @@ __kernel void arithm_flip_cols_C4_D1 (__global char *src, int src_step, int src_
{
int src_index_0 = mad24(y, src_step, (x << 2) + src_offset);
int src_index_1 = mad24(y, src_step, ((cols - x -1) << 2) + src_offset);
int dst_index_0 = mad24(y, dst_step, (x << 2) + dst_offset);
int dst_index_1 = mad24(y, dst_step, ((cols - x -1) << 2) + dst_offset);
@@ -901,7 +901,7 @@ __kernel void arithm_flip_cols_C4_D2 (__global ushort *src, int src_step, int sr
{
int src_index_0 = mad24(y, src_step, (x << 3) + src_offset);
int src_index_1 = mad24(y, src_step, ((cols - x -1) << 3) + src_offset);
int dst_index_0 = mad24(y, dst_step, (x << 3) + dst_offset);
int dst_index_1 = mad24(y, dst_step, ((cols - x -1) << 3) + dst_offset);
@@ -923,7 +923,7 @@ __kernel void arithm_flip_cols_C4_D3 (__global short *src, int src_step, int src
{
int src_index_0 = mad24(y, src_step, (x << 3) + src_offset);
int src_index_1 = mad24(y, src_step, ((cols - x -1) << 3) + src_offset);
int dst_index_0 = mad24(y, dst_step, (x << 3) + dst_offset);
int dst_index_1 = mad24(y, dst_step, ((cols - x -1) << 3) + dst_offset);
@@ -946,7 +946,7 @@ __kernel void arithm_flip_cols_C4_D4 (__global int *src, int src_step, int src_o
{
int src_index_0 = mad24(y, src_step, (x << 4) + src_offset);
int src_index_1 = mad24(y, src_step, ((cols - x -1) << 4) + src_offset);
int dst_index_0 = mad24(y, dst_step, (x << 4) + dst_offset);
int dst_index_1 = mad24(y, dst_step, ((cols - x -1) << 4) + dst_offset);
@@ -968,7 +968,7 @@ __kernel void arithm_flip_cols_C4_D5 (__global float *src, int src_step, int src
{
int src_index_0 = mad24(y, src_step, (x << 4) + src_offset);
int src_index_1 = mad24(y, src_step, ((cols - x -1) << 4) + src_offset);
int dst_index_0 = mad24(y, dst_step, (x << 4) + dst_offset);
int dst_index_1 = mad24(y, dst_step, ((cols - x -1) << 4) + dst_offset);
@@ -991,7 +991,7 @@ __kernel void arithm_flip_cols_C4_D6 (__global double *src, int src_step, int sr
{
int src_index_0 = mad24(y, src_step, (x << 5) + src_offset);
int src_index_1 = mad24(y, src_step, ((cols - x -1) << 5) + src_offset);
int dst_index_0 = mad24(y, dst_step, (x << 5) + dst_offset);
int dst_index_1 = mad24(y, dst_step, ((cols - x -1) << 5) + dst_offset);

View File

@@ -60,7 +60,7 @@ __kernel void arithm_flip_rc_C1_D0 (__global uchar *src, int src_step, int src_o
{
int src_index_0 = mad24(y, src_step, (x) + src_offset);
int src_index_1 = mad24(rows - y - 1, src_step, (cols - x -1) + src_offset);
int dst_index_0 = mad24(y, dst_step, (x) + dst_offset);
int dst_index_1 = mad24(rows - y - 1, dst_step, (cols - x -1) + dst_offset);
@@ -82,7 +82,7 @@ __kernel void arithm_flip_rc_C1_D1 (__global char *src, int src_step, int src_of
{
int src_index_0 = mad24(y, src_step, (x) + src_offset);
int src_index_1 = mad24(rows - y - 1, src_step, (cols - x -1) + src_offset);
int dst_index_0 = mad24(y, dst_step, (x) + dst_offset);
int dst_index_1 = mad24(rows - y - 1, dst_step, (cols - x -1) + dst_offset);
@@ -104,7 +104,7 @@ __kernel void arithm_flip_rc_C1_D2 (__global ushort *src, int src_step, int src_
{
int src_index_0 = mad24(y, src_step, (x << 1) + src_offset);
int src_index_1 = mad24(rows - y - 1, src_step, ((cols - x -1) << 1) + src_offset);
int dst_index_0 = mad24(y, dst_step, (x << 1) + dst_offset);
int dst_index_1 = mad24(rows - y - 1, dst_step, ((cols - x -1) << 1) + dst_offset);
@@ -126,7 +126,7 @@ __kernel void arithm_flip_rc_C1_D3 (__global short *src, int src_step, int src_o
{
int src_index_0 = mad24(y, src_step, (x << 1) + src_offset);
int src_index_1 = mad24(rows - y - 1, src_step, ((cols - x -1) << 1) + src_offset);
int dst_index_0 = mad24(y, dst_step, (x << 1) + dst_offset);
int dst_index_1 = mad24(rows - y - 1, dst_step, ((cols - x -1) << 1) + dst_offset);
@@ -148,7 +148,7 @@ __kernel void arithm_flip_rc_C1_D4 (__global int *src, int src_step, int src_off
{
int src_index_0 = mad24(y, src_step, (x << 2) + src_offset);
int src_index_1 = mad24(rows - y - 1, src_step, ((cols - x -1) << 2) + src_offset);
int dst_index_0 = mad24(y, dst_step, (x << 2) + dst_offset);
int dst_index_1 = mad24(rows - y - 1, dst_step, ((cols - x -1) << 2) + dst_offset);
@@ -170,7 +170,7 @@ __kernel void arithm_flip_rc_C1_D5 (__global float *src, int src_step, int src_o
{
int src_index_0 = mad24(y, src_step, (x << 2) + src_offset);
int src_index_1 = mad24(rows - y - 1, src_step, ((cols - x -1) << 2) + src_offset);
int dst_index_0 = mad24(y, dst_step, (x << 2) + dst_offset);
int dst_index_1 = mad24(rows - y - 1, dst_step, ((cols - x -1) << 2) + dst_offset);
@@ -194,7 +194,7 @@ __kernel void arithm_flip_rc_C1_D6 (__global double *src, int src_step, int src_
{
int src_index_0 = mad24(y, src_step, (x << 3) + src_offset);
int src_index_1 = mad24(rows - y - 1, src_step, ((cols - x -1) << 3) + src_offset);
int dst_index_0 = mad24(y, dst_step, (x << 3) + dst_offset);
int dst_index_1 = mad24(rows - y - 1, dst_step, ((cols - x -1) << 3) + dst_offset);
@@ -217,7 +217,7 @@ __kernel void arithm_flip_rc_C2_D0 (__global uchar *src, int src_step, int src_o
{
int src_index_0 = mad24(y, src_step, (x << 1) + src_offset);
int src_index_1 = mad24(rows - y - 1, src_step, ((cols - x -1) << 1) + src_offset);
int dst_index_0 = mad24(y, dst_step, (x << 1) + dst_offset);
int dst_index_1 = mad24(rows - y - 1, dst_step, ((cols - x -1) << 1) + dst_offset);
@@ -239,7 +239,7 @@ __kernel void arithm_flip_rc_C2_D1 (__global char *src, int src_step, int src_of
{
int src_index_0 = mad24(y, src_step, (x << 1) + src_offset);
int src_index_1 = mad24(rows - y - 1, src_step, ((cols - x -1) << 1) + src_offset);
int dst_index_0 = mad24(y, dst_step, (x << 1) + dst_offset);
int dst_index_1 = mad24(rows - y - 1, dst_step, ((cols - x -1) << 1) + dst_offset);
@@ -261,7 +261,7 @@ __kernel void arithm_flip_rc_C2_D2 (__global ushort *src, int src_step, int src_
{
int src_index_0 = mad24(y, src_step, (x << 2) + src_offset);
int src_index_1 = mad24(rows - y - 1, src_step, ((cols - x -1) << 2) + src_offset);
int dst_index_0 = mad24(y, dst_step, (x << 2) + dst_offset);
int dst_index_1 = mad24(rows - y - 1, dst_step, ((cols - x -1) << 2) + dst_offset);
@@ -283,7 +283,7 @@ __kernel void arithm_flip_rc_C2_D3 (__global short *src, int src_step, int src_o
{
int src_index_0 = mad24(y, src_step, (x << 2) + src_offset);
int src_index_1 = mad24(rows - y - 1, src_step, ((cols - x -1) << 2) + src_offset);
int dst_index_0 = mad24(y, dst_step, (x << 2) + dst_offset);
int dst_index_1 = mad24(rows - y - 1, dst_step, ((cols - x -1) << 2) + dst_offset);
@@ -305,7 +305,7 @@ __kernel void arithm_flip_rc_C2_D4 (__global int *src, int src_step, int src_off
{
int src_index_0 = mad24(y, src_step, (x << 3) + src_offset);
int src_index_1 = mad24(rows - y - 1, src_step, ((cols - x -1) << 3) + src_offset);
int dst_index_0 = mad24(y, dst_step, (x << 3) + dst_offset);
int dst_index_1 = mad24(rows - y - 1, dst_step, ((cols - x -1) << 3) + dst_offset);
@@ -327,7 +327,7 @@ __kernel void arithm_flip_rc_C2_D5 (__global float *src, int src_step, int src_o
{
int src_index_0 = mad24(y, src_step, (x << 3) + src_offset);
int src_index_1 = mad24(rows - y - 1, src_step, ((cols - x -1) << 3) + src_offset);
int dst_index_0 = mad24(y, dst_step, (x << 3) + dst_offset);
int dst_index_1 = mad24(rows - y - 1, dst_step, ((cols - x -1) << 3) + dst_offset);
@@ -351,7 +351,7 @@ __kernel void arithm_flip_rc_C2_D6 (__global double *src, int src_step, int src_
{
int src_index_0 = mad24(y, src_step, (x << 4) + src_offset);
int src_index_1 = mad24(rows - y - 1, src_step, ((cols - x -1) << 4) + src_offset);
int dst_index_0 = mad24(y, dst_step, (x << 4) + dst_offset);
int dst_index_1 = mad24(rows - y - 1, dst_step, ((cols - x -1) << 4) + dst_offset);
@@ -375,7 +375,7 @@ __kernel void arithm_flip_rc_C3_D0 (__global uchar *src, int src_step, int src_o
{
int src_index_0 = mad24(y, src_step, (x * 3) + src_offset);
int src_index_1 = mad24(rows - y - 1, src_step, (cols - x -1) * 3 + src_offset);
int dst_index_0 = mad24(y, dst_step, (x * 3) + dst_offset);
int dst_index_1 = mad24(rows - y - 1, dst_step, (cols - x -1) * 3 + dst_offset);
@@ -408,7 +408,7 @@ __kernel void arithm_flip_rc_C3_D1 (__global char *src, int src_step, int src_of
{
int src_index_0 = mad24(y, src_step, (x * 3) + src_offset);
int src_index_1 = mad24(rows - y - 1, src_step, (cols - x -1) * 3 + src_offset);
int dst_index_0 = mad24(y, dst_step, (x * 3) + dst_offset);
int dst_index_1 = mad24(rows - y - 1, dst_step, (cols - x -1) * 3 + dst_offset);
@@ -441,7 +441,7 @@ __kernel void arithm_flip_rc_C3_D2 (__global ushort *src, int src_step, int src_
{
int src_index_0 = mad24(y, src_step, (x * 3 << 1) + src_offset);
int src_index_1 = mad24(rows - y - 1, src_step, ((cols - x -1) * 3 << 1) + src_offset);
int dst_index_0 = mad24(y, dst_step, (x * 3 << 1) + dst_offset);
int dst_index_1 = mad24(rows - y - 1, dst_step, ((cols - x -1) * 3 << 1) + dst_offset);
@@ -473,7 +473,7 @@ __kernel void arithm_flip_rc_C3_D3 (__global short *src, int src_step, int src_o
{
int src_index_0 = mad24(y, src_step, (x * 3 << 1) + src_offset);
int src_index_1 = mad24(rows - y - 1, src_step, ((cols - x -1) * 3 << 1) + src_offset);
int dst_index_0 = mad24(y, dst_step, (x * 3 << 1) + dst_offset);
int dst_index_1 = mad24(rows - y - 1, dst_step, ((cols - x -1) * 3 << 1) + dst_offset);
@@ -506,14 +506,14 @@ __kernel void arithm_flip_rc_C3_D4 (__global int *src, int src_step, int src_off
{
int src_index_0 = mad24(y, src_step, (x * 3 << 2) + src_offset);
int src_index_1 = mad24(rows - y - 1, src_step, ((cols - x -1) * 3 << 2) + src_offset);
int dst_index_0 = mad24(y, dst_step, (x * 3 << 2) + dst_offset);
int dst_index_1 = mad24(rows - y - 1, dst_step, ((cols - x -1) * 3 << 2) + dst_offset);
int data0_0 = *((__global int *)((__global char *)src + src_index_0 + 0));
int data0_1 = *((__global int *)((__global char *)src + src_index_0 + 4));
int data0_2 = *((__global int *)((__global char *)src + src_index_0 + 8));
int data1_0 = *((__global int *)((__global char *)src + src_index_1 + 0));
int data1_1 = *((__global int *)((__global char *)src + src_index_1 + 4));
int data1_2 = *((__global int *)((__global char *)src + src_index_1 + 8));
@@ -521,7 +521,7 @@ __kernel void arithm_flip_rc_C3_D4 (__global int *src, int src_step, int src_off
*((__global int *)((__global char *)dst + dst_index_0 + 0)) = data1_0;
*((__global int *)((__global char *)dst + dst_index_0 + 4)) = data1_1;
*((__global int *)((__global char *)dst + dst_index_0 + 8)) = data1_2;
*((__global int *)((__global char *)dst + dst_index_1 + 0)) = data0_0;
*((__global int *)((__global char *)dst + dst_index_1 + 4)) = data0_1;
*((__global int *)((__global char *)dst + dst_index_1 + 8)) = data0_2;
@@ -538,14 +538,14 @@ __kernel void arithm_flip_rc_C3_D5 (__global float *src, int src_step, int src_o
{
int src_index_0 = mad24(y, src_step, (x * 3 << 2) + src_offset);
int src_index_1 = mad24(rows - y - 1, src_step, ((cols - x -1) * 3 << 2) + src_offset);
int dst_index_0 = mad24(y, dst_step, (x * 3 << 2) + dst_offset);
int dst_index_1 = mad24(rows - y - 1, dst_step, ((cols - x -1) * 3 << 2) + dst_offset);
float data0_0 = *((__global float *)((__global char *)src + src_index_0 + 0));
float data0_1 = *((__global float *)((__global char *)src + src_index_0 + 4));
float data0_2 = *((__global float *)((__global char *)src + src_index_0 + 8));
float data1_0 = *((__global float *)((__global char *)src + src_index_1 + 0));
float data1_1 = *((__global float *)((__global char *)src + src_index_1 + 4));
float data1_2 = *((__global float *)((__global char *)src + src_index_1 + 8));
@@ -553,7 +553,7 @@ __kernel void arithm_flip_rc_C3_D5 (__global float *src, int src_step, int src_o
*((__global float *)((__global char *)dst + dst_index_0 + 0)) = data1_0;
*((__global float *)((__global char *)dst + dst_index_0 + 4)) = data1_1;
*((__global float *)((__global char *)dst + dst_index_0 + 8)) = data1_2;
*((__global float *)((__global char *)dst + dst_index_1 + 0)) = data0_0;
*((__global float *)((__global char *)dst + dst_index_1 + 4)) = data0_1;
*((__global float *)((__global char *)dst + dst_index_1 + 8)) = data0_2;
@@ -572,14 +572,14 @@ __kernel void arithm_flip_rc_C3_D6 (__global double *src, int src_step, int src_
{
int src_index_0 = mad24(y, src_step, (x * 3 << 3) + src_offset);
int src_index_1 = mad24(rows - y - 1, src_step, ((cols - x -1) * 3 << 3) + src_offset);
int dst_index_0 = mad24(y, dst_step, (x * 3 << 3) + dst_offset);
int dst_index_1 = mad24(rows - y - 1, dst_step, ((cols - x -1) * 3 << 3) + dst_offset);
double data0_0 = *((__global double *)((__global char *)src + src_index_0 + 0 ));
double data0_1 = *((__global double *)((__global char *)src + src_index_0 + 8 ));
double data0_2 = *((__global double *)((__global char *)src + src_index_0 + 16));
double data1_0 = *((__global double *)((__global char *)src + src_index_1 + 0 ));
double data1_1 = *((__global double *)((__global char *)src + src_index_1 + 8 ));
double data1_2 = *((__global double *)((__global char *)src + src_index_1 + 16));
@@ -587,7 +587,7 @@ __kernel void arithm_flip_rc_C3_D6 (__global double *src, int src_step, int src_
*((__global double *)((__global char *)dst + dst_index_0 + 0 )) = data1_0;
*((__global double *)((__global char *)dst + dst_index_0 + 8 )) = data1_1;
*((__global double *)((__global char *)dst + dst_index_0 + 16)) = data1_2;
*((__global double *)((__global char *)dst + dst_index_1 + 0 )) = data0_0;
*((__global double *)((__global char *)dst + dst_index_1 + 8 )) = data0_1;
*((__global double *)((__global char *)dst + dst_index_1 + 16)) = data0_2;
@@ -605,7 +605,7 @@ __kernel void arithm_flip_rc_C4_D0 (__global uchar *src, int src_step, int src_o
{
int src_index_0 = mad24(y, src_step, (x << 2) + src_offset);
int src_index_1 = mad24(rows - y - 1, src_step, ((cols - x -1) << 2) + src_offset);
int dst_index_0 = mad24(y, dst_step, (x << 2) + dst_offset);
int dst_index_1 = mad24(rows - y - 1, dst_step, ((cols - x -1) << 2) + dst_offset);
@@ -627,7 +627,7 @@ __kernel void arithm_flip_rc_C4_D1 (__global char *src, int src_step, int src_of
{
int src_index_0 = mad24(y, src_step, (x << 2) + src_offset);
int src_index_1 = mad24(rows - y - 1, src_step, ((cols - x -1) << 2) + src_offset);
int dst_index_0 = mad24(y, dst_step, (x << 2) + dst_offset);
int dst_index_1 = mad24(rows - y - 1, dst_step, ((cols - x -1) << 2) + dst_offset);
@@ -649,7 +649,7 @@ __kernel void arithm_flip_rc_C4_D2 (__global ushort *src, int src_step, int src_
{
int src_index_0 = mad24(y, src_step, (x << 3) + src_offset);
int src_index_1 = mad24(rows - y - 1, src_step, ((cols - x -1) << 3) + src_offset);
int dst_index_0 = mad24(y, dst_step, (x << 3) + dst_offset);
int dst_index_1 = mad24(rows - y - 1, dst_step, ((cols - x -1) << 3) + dst_offset);
@@ -671,7 +671,7 @@ __kernel void arithm_flip_rc_C4_D3 (__global short *src, int src_step, int src_o
{
int src_index_0 = mad24(y, src_step, (x << 3) + src_offset);
int src_index_1 = mad24(rows - y - 1, src_step, ((cols - x -1) << 3) + src_offset);
int dst_index_0 = mad24(y, dst_step, (x << 3) + dst_offset);
int dst_index_1 = mad24(rows - y - 1, dst_step, ((cols - x -1) << 3) + dst_offset);
@@ -693,7 +693,7 @@ __kernel void arithm_flip_rc_C4_D4 (__global int *src, int src_step, int src_off
{
int src_index_0 = mad24(y, src_step, (x << 4) + src_offset);
int src_index_1 = mad24(rows - y - 1, src_step, ((cols - x -1) << 4) + src_offset);
int dst_index_0 = mad24(y, dst_step, (x << 4) + dst_offset);
int dst_index_1 = mad24(rows - y - 1, dst_step, ((cols - x -1) << 4) + dst_offset);
@@ -715,7 +715,7 @@ __kernel void arithm_flip_rc_C4_D5 (__global float *src, int src_step, int src_o
{
int src_index_0 = mad24(y, src_step, (x << 4) + src_offset);
int src_index_1 = mad24(rows - y - 1, src_step, ((cols - x -1) << 4) + src_offset);
int dst_index_0 = mad24(y, dst_step, (x << 4) + dst_offset);
int dst_index_1 = mad24(rows - y - 1, dst_step, ((cols - x -1) << 4) + dst_offset);
@@ -739,7 +739,7 @@ __kernel void arithm_flip_rc_C4_D6 (__global double *src, int src_step, int src_
{
int src_index_0 = mad24(y, src_step, (x << 5) + src_offset);
int src_index_1 = mad24(rows - y - 1, src_step, ((cols - x -1) << 5) + src_offset);
int dst_index_0 = mad24(y, dst_step, (x << 5) + dst_offset);
int dst_index_1 = mad24(rows - y - 1, dst_step, ((cols - x -1) << 5) + dst_offset);

View File

@@ -48,7 +48,7 @@
#endif
#define INF_FLOAT -88.029694
#define INF_DOUBLE -709.0895657128241
#define INF_DOUBLE -709.0895657128241
//////////////////////////////////////////////////////////////////////////////////////////////////////

View File

@@ -60,17 +60,17 @@ __kernel void magnitudeSqr_C1_D5 (__global float *src1,int src1_step,int src1_of
int y = get_global_id(1);
if (x < cols && y < rows)
{
x = x << 2;
#define dst_align ((dst_offset >> 2) & 3)
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + (x << 2) -(dst_align << 2));
@@ -112,16 +112,16 @@ __kernel void magnitudeSqr_C2_D5 (__global float *src1,int src1_step,int src1_of
int y = get_global_id(1);
if (x < cols && y < rows)
{
x = x << 2;
#define dst_align ((dst_offset >> 2) & 3)
int src1_index = mad24(y, src1_step, (x << 3) + src1_offset - (dst_align << 3));
int src1_index = mad24(y, src1_step, (x << 3) + src1_offset - (dst_align << 3));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + (x << 2) -(dst_align << 2));

View File

@@ -57,37 +57,37 @@
#if defined (DEPTH_1)
#define VEC_TYPE char8
#define CONVERT_TYPE convert_char8
#define MIN_VAL -128
#define MIN_VAL -128
#define MAX_VAL 127
#endif
#if defined (DEPTH_2)
#define VEC_TYPE ushort8
#define CONVERT_TYPE convert_ushort8
#define MIN_VAL 0
#define MIN_VAL 0
#define MAX_VAL 65535
#endif
#if defined (DEPTH_3)
#define VEC_TYPE short8
#define CONVERT_TYPE convert_short8
#define MIN_VAL -32768
#define MIN_VAL -32768
#define MAX_VAL 32767
#endif
#if defined (DEPTH_4)
#define VEC_TYPE int8
#define CONVERT_TYPE convert_int8
#define MIN_VAL INT_MIN
#define MIN_VAL INT_MIN
#define MAX_VAL INT_MAX
#endif
#if defined (DEPTH_5)
#define VEC_TYPE float8
#define CONVERT_TYPE convert_float8
#define MIN_VAL (-FLT_MAX)
#define MIN_VAL (-FLT_MAX)
#define MAX_VAL FLT_MAX
#endif
#if defined (DEPTH_6)
#define VEC_TYPE double8
#define CONVERT_TYPE convert_double8
#define MIN_VAL (-DBL_MAX)
#define MIN_VAL (-DBL_MAX)
#define MAX_VAL DBL_MAX
#endif
@@ -157,7 +157,7 @@ __kernel void arithm_op_minMax (int cols,int invalid_cols,int offset,int elemnum
if(id < elemnum)
{
temp = src[idx];
if(id % cols == 0 )
if(id % cols == 0 )
{
repeat_s(temp);
}
@@ -177,7 +177,7 @@ __kernel void arithm_op_minMax (int cols,int invalid_cols,int offset,int elemnum
{
idx = offset + id + (id / cols) * invalid_cols;
temp = src[idx];
if(id % cols == 0 )
if(id % cols == 0 )
{
repeat_s(temp);
}

View File

@@ -66,7 +66,7 @@
#define VEC_TYPE_LOC int4
#define CONVERT_TYPE convert_char4
#define CONDITION_FUNC(a,b,c) (convert_int4(a) ? b : c)
#define MIN_VAL -128
#define MIN_VAL -128
#define MAX_VAL 127
#endif
#if defined (DEPTH_2)
@@ -74,7 +74,7 @@
#define VEC_TYPE_LOC int4
#define CONVERT_TYPE convert_ushort4
#define CONDITION_FUNC(a,b,c) (convert_int4(a) ? b : c)
#define MIN_VAL 0
#define MIN_VAL 0
#define MAX_VAL 65535
#endif
#if defined (DEPTH_3)
@@ -82,7 +82,7 @@
#define VEC_TYPE_LOC int4
#define CONVERT_TYPE convert_short4
#define CONDITION_FUNC(a,b,c) (convert_int4(a) ? b : c)
#define MIN_VAL -32768
#define MIN_VAL -32768
#define MAX_VAL 32767
#endif
#if defined (DEPTH_4)
@@ -90,7 +90,7 @@
#define VEC_TYPE_LOC int4
#define CONVERT_TYPE convert_int4
#define CONDITION_FUNC(a,b,c) ((a) ? b : c)
#define MIN_VAL INT_MIN
#define MIN_VAL INT_MIN
#define MAX_VAL INT_MAX
#endif
#if defined (DEPTH_5)
@@ -98,7 +98,7 @@
#define VEC_TYPE_LOC float4
#define CONVERT_TYPE convert_float4
#define CONDITION_FUNC(a,b,c) ((a) ? b : c)
#define MIN_VAL (-FLT_MAX)
#define MIN_VAL (-FLT_MAX)
#define MAX_VAL FLT_MAX
#endif
#if defined (DEPTH_6)
@@ -106,12 +106,12 @@
#define VEC_TYPE_LOC double4
#define CONVERT_TYPE convert_double4
#define CONDITION_FUNC(a,b,c) ((a) ? b : c)
#define MIN_VAL (-DBL_MAX)
#define MIN_VAL (-DBL_MAX)
#define MAX_VAL DBL_MAX
#endif
#if defined (REPEAT_S0)
#define repeat_s(a) a=a;
#define repeat_s(a) a=a;
#endif
#if defined (REPEAT_S1)
#define repeat_s(a) a.s0 = a.s1;
@@ -125,7 +125,7 @@
#if defined (REPEAT_E0)
#define repeat_e(a) a=a;
#define repeat_e(a) a=a;
#endif
#if defined (REPEAT_E1)
#define repeat_e(a) a.s3 = a.s2;
@@ -159,7 +159,7 @@ __kernel void arithm_op_minMaxLoc (int cols,int invalid_cols,int offset,int elem
temp = src[idx];
idx_c = idx << 2;
temploc = (VEC_TYPE_LOC)(idx_c,idx_c+1,idx_c+2,idx_c+3);
if(id % cols == 0 )
if(id % cols == 0 )
{
repeat_s(temp);
repeat_s(temploc);
@@ -188,7 +188,7 @@ __kernel void arithm_op_minMaxLoc (int cols,int invalid_cols,int offset,int elem
temp = src[idx];
idx_c = idx << 2;
temploc = (VEC_TYPE_LOC)(idx_c,idx_c+1,idx_c+2,idx_c+3);
if(id % cols == 0 )
if(id % cols == 0 )
{
repeat_s(temp);
repeat_s(temploc);
@@ -228,9 +228,9 @@ __kernel void arithm_op_minMaxLoc (int cols,int invalid_cols,int offset,int elem
int lid2 = lsize + lid;
localmem_min[lid] = min(localmem_min[lid] , localmem_min[lid2]);
localmem_max[lid] = max(localmem_max[lid] , localmem_max[lid2]);
localmem_minloc[lid] =
localmem_minloc[lid] =
CONDITION_FUNC(localmem_min[lid] == localmem_min[lid2], localmem_minloc[lid2] , localmem_minloc[lid]);
localmem_maxloc[lid] =
localmem_maxloc[lid] =
CONDITION_FUNC(localmem_max[lid] == localmem_max[lid2], localmem_maxloc[lid2] , localmem_maxloc[lid]);
}
barrier(CLK_LOCAL_MEM_FENCE);
@@ -291,7 +291,7 @@ __kernel void arithm_op_minMaxLoc_mask (int cols,int invalid_cols,int offset,int
m_temp = CONVERT_TYPE(mask[midx]);
int idx_c = idx << 2;
temploc = (VEC_TYPE_LOC)(idx_c,idx_c+1,idx_c+2,idx_c+3);
if(id % cols == 0 )
if(id % cols == 0 )
{
repeat_ms(m_temp);
repeat_s(temploc);
@@ -321,7 +321,7 @@ __kernel void arithm_op_minMaxLoc_mask (int cols,int invalid_cols,int offset,int
m_temp = CONVERT_TYPE(mask[midx]);
int idx_c = idx << 2;
temploc = (VEC_TYPE_LOC)(idx_c,idx_c+1,idx_c+2,idx_c+3);
if(id % cols == 0 )
if(id % cols == 0 )
{
repeat_ms(m_temp);
repeat_s(temploc);
@@ -333,7 +333,7 @@ __kernel void arithm_op_minMaxLoc_mask (int cols,int invalid_cols,int offset,int
}
minval = min(minval,m_temp > zero ? temp : max_val);
maxval = max(maxval,m_temp > zero ? temp : min_val);
temploc = CONDITION_FUNC(m_temp > zero, temploc , negative);
minloc = CONDITION_FUNC(minval == temp, temploc , minloc);
maxloc = CONDITION_FUNC(maxval == temp, temploc , maxloc);
@@ -361,9 +361,9 @@ __kernel void arithm_op_minMaxLoc_mask (int cols,int invalid_cols,int offset,int
int lid2 = lsize + lid;
localmem_min[lid] = min(localmem_min[lid] , localmem_min[lid2]);
localmem_max[lid] = max(localmem_max[lid] , localmem_max[lid2]);
localmem_minloc[lid] =
localmem_minloc[lid] =
CONDITION_FUNC(localmem_min[lid] == localmem_min[lid2], localmem_minloc[lid2] , localmem_minloc[lid]);
localmem_maxloc[lid] =
localmem_maxloc[lid] =
CONDITION_FUNC(localmem_max[lid] == localmem_max[lid2], localmem_maxloc[lid2] , localmem_maxloc[lid]);
}
barrier(CLK_LOCAL_MEM_FENCE);

View File

@@ -68,7 +68,7 @@
#define VEC_TYPE_LOC int4
#define CONVERT_TYPE convert_char4
#define CONDITION_FUNC(a,b,c) (convert_int4(a) ? b : c)
#define MIN_VAL -128
#define MIN_VAL -128
#define MAX_VAL 127
#endif
#if defined (DEPTH_2)
@@ -77,7 +77,7 @@
#define VEC_TYPE_LOC int4
#define CONVERT_TYPE convert_ushort4
#define CONDITION_FUNC(a,b,c) (convert_int4(a) ? b : c)
#define MIN_VAL 0
#define MIN_VAL 0
#define MAX_VAL 65535
#endif
#if defined (DEPTH_3)
@@ -86,7 +86,7 @@
#define VEC_TYPE_LOC int4
#define CONVERT_TYPE convert_short4
#define CONDITION_FUNC(a,b,c) (convert_int4(a) ? b : c)
#define MIN_VAL -32768
#define MIN_VAL -32768
#define MAX_VAL 32767
#endif
#if defined (DEPTH_4)
@@ -95,7 +95,7 @@
#define VEC_TYPE_LOC int4
#define CONVERT_TYPE convert_int4
#define CONDITION_FUNC(a,b,c) ((a) ? b : c)
#define MIN_VAL INT_MIN
#define MIN_VAL INT_MIN
#define MAX_VAL INT_MAX
#endif
#if defined (DEPTH_5)
@@ -104,7 +104,7 @@
#define VEC_TYPE_LOC float4
#define CONVERT_TYPE convert_float4
#define CONDITION_FUNC(a,b,c) ((a) ? b : c)
#define MIN_VAL (-FLT_MAX)
#define MIN_VAL (-FLT_MAX)
#define MAX_VAL FLT_MAX
#endif
#if defined (DEPTH_6)
@@ -113,12 +113,12 @@
#define VEC_TYPE_LOC double4
#define CONVERT_TYPE convert_double4
#define CONDITION_FUNC(a,b,c) ((a) ? b : c)
#define MIN_VAL (-DBL_MAX)
#define MIN_VAL (-DBL_MAX)
#define MAX_VAL DBL_MAX
#endif
#if defined (REPEAT_E0)
#define repeat_e(a) a=a;
#define repeat_e(a) a=a;
#endif
#if defined (REPEAT_E1)
#define repeat_e(a) a.s3 = a.s2;
@@ -194,7 +194,7 @@ __kernel void arithm_op_minMaxLoc_mask (int cols,int invalid_cols,int offset,int
}
minval = min(minval,m_temp != (VEC_TYPE)0 ? temp : minval);
maxval = max(maxval,m_temp != (VEC_TYPE)0 ? temp : maxval);
minloc = CONDITION_FUNC((minval == temp) && (m_temp != (VEC_TYPE)0), temploc , minloc);
maxloc = CONDITION_FUNC((maxval == temp) && (m_temp != (VEC_TYPE)0), temploc , maxloc);
}
@@ -225,9 +225,9 @@ __kernel void arithm_op_minMaxLoc_mask (int cols,int invalid_cols,int offset,int
lm_max[lid] = max(lm_max[lid] , lm_max[lid2]);
VEC_TYPE con_min = CONVERT_TYPE(lm_minloc[lid2] != negative ? one : zero);
VEC_TYPE con_max = CONVERT_TYPE(lm_maxloc[lid2] != negative ? one : zero);
lm_minloc[lid] =
lm_minloc[lid] =
CONDITION_FUNC((lm_min[lid] == lm_min[lid2]) && (con_min != (VEC_TYPE)0), lm_minloc[lid2] , lm_minloc[lid]);
lm_maxloc[lid] =
lm_maxloc[lid] =
CONDITION_FUNC((lm_max[lid] == lm_max[lid2]) && (con_max != (VEC_TYPE)0), lm_maxloc[lid2] , lm_maxloc[lid]);
}
barrier(CLK_LOCAL_MEM_FENCE);

View File

@@ -59,42 +59,42 @@
#define VEC_TYPE char8
#define TYPE char
#define CONVERT_TYPE convert_char8
#define MIN_VAL -128
#define MIN_VAL -128
#define MAX_VAL 127
#endif
#if defined (DEPTH_2)
#define VEC_TYPE ushort8
#define TYPE ushort
#define CONVERT_TYPE convert_ushort8
#define MIN_VAL 0
#define MIN_VAL 0
#define MAX_VAL 65535
#endif
#if defined (DEPTH_3)
#define VEC_TYPE short8
#define TYPE short
#define CONVERT_TYPE convert_short8
#define MIN_VAL -32768
#define MIN_VAL -32768
#define MAX_VAL 32767
#endif
#if defined (DEPTH_4)
#define VEC_TYPE int8
#define TYPE int
#define CONVERT_TYPE convert_int8
#define MIN_VAL INT_MIN
#define MIN_VAL INT_MIN
#define MAX_VAL INT_MAX
#endif
#if defined (DEPTH_5)
#define VEC_TYPE float8
#define TYPE float
#define CONVERT_TYPE convert_float8
#define MIN_VAL (-FLT_MAX)
#define MIN_VAL (-FLT_MAX)
#define MAX_VAL FLT_MAX
#endif
#if defined (DEPTH_6)
#define VEC_TYPE double8
#define TYPE double
#define CONVERT_TYPE convert_double8
#define MIN_VAL (-DBL_MAX)
#define MIN_VAL (-DBL_MAX)
#define MAX_VAL DBL_MAX
#endif

View File

@@ -48,23 +48,23 @@
#endif
int4 round_int4(float4 v){
v.s0 = v.s0 + (v.s0 > 0 ? 0.5 : -0.5);
v.s1 = v.s1 + (v.s1 > 0 ? 0.5 : -0.5);
v.s2 = v.s2 + (v.s2 > 0 ? 0.5 : -0.5);
v.s3 = v.s3 + (v.s3 > 0 ? 0.5 : -0.5);
v.s0 = v.s0 + (v.s0 > 0 ? 0.5 : -0.5);
v.s1 = v.s1 + (v.s1 > 0 ? 0.5 : -0.5);
v.s2 = v.s2 + (v.s2 > 0 ? 0.5 : -0.5);
v.s3 = v.s3 + (v.s3 > 0 ? 0.5 : -0.5);
return convert_int4_sat(v);
}
uint4 round_uint4(float4 v){
v.s0 = v.s0 + (v.s0 > 0 ? 0.5 : -0.5);
v.s1 = v.s1 + (v.s1 > 0 ? 0.5 : -0.5);
v.s2 = v.s2 + (v.s2 > 0 ? 0.5 : -0.5);
v.s3 = v.s3 + (v.s3 > 0 ? 0.5 : -0.5);
v.s0 = v.s0 + (v.s0 > 0 ? 0.5 : -0.5);
v.s1 = v.s1 + (v.s1 > 0 ? 0.5 : -0.5);
v.s2 = v.s2 + (v.s2 > 0 ? 0.5 : -0.5);
v.s3 = v.s3 + (v.s3 > 0 ? 0.5 : -0.5);
return convert_uint4_sat(v);
}
long round_int(float v){
v = v + (v > 0 ? 0.5 : -0.5);
v = v + (v > 0 ? 0.5 : -0.5);
return convert_int_sat(v);
}
@@ -85,24 +85,24 @@ __kernel void arithm_mul_D0 (__global uchar *src1, int src1_step, int src1_offse
x = x << 2;
#define dst_align (dst_offset & 3)
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
uchar4 src1_data ,src2_data;
uchar4 src1_data ,src2_data;
src1_data.x= src1_index+0 >= 0 ? src1[src1_index+0] : 0;
src1_data.y= src1_index+1 >= 0 ? src1[src1_index+1] : 0;
src1_data.z= src1_index+2 >= 0 ? src1[src1_index+2] : 0;
src1_data.w= src1_index+3 >= 0 ? src1[src1_index+3] : 0;
src1_data.x= src1_index+0 >= 0 ? src1[src1_index+0] : 0;
src1_data.y= src1_index+1 >= 0 ? src1[src1_index+1] : 0;
src1_data.z= src1_index+2 >= 0 ? src1[src1_index+2] : 0;
src1_data.w= src1_index+3 >= 0 ? src1[src1_index+3] : 0;
src2_data.x= src2_index+0 >= 0 ? src2[src2_index+0] : 0;
src2_data.y= src2_index+1 >= 0 ? src2[src2_index+1] : 0;
src2_data.z= src2_index+2 >= 0 ? src2[src2_index+2] : 0;
src2_data.w= src2_index+3 >= 0 ? src2[src2_index+3] : 0;
src2_data.x= src2_index+0 >= 0 ? src2[src2_index+0] : 0;
src2_data.y= src2_index+1 >= 0 ? src2[src2_index+1] : 0;
src2_data.z= src2_index+2 >= 0 ? src2[src2_index+2] : 0;
src2_data.w= src2_index+3 >= 0 ? src2[src2_index+3] : 0;
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
int4 tmp = convert_int4_sat(src1_data) * convert_int4_sat(src2_data);
@@ -130,8 +130,8 @@ __kernel void arithm_mul_D2 (__global ushort *src1, int src1_step, int src1_offs
x = x << 2;
#define dst_align ((dst_offset >> 1) & 3)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
@@ -166,8 +166,8 @@ __kernel void arithm_mul_D3 (__global short *src1, int src1_step, int src1_offse
x = x << 2;
#define dst_align ((dst_offset >> 1) & 3)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);

View File

@@ -137,7 +137,7 @@ __kernel void arithm_op_nonzero (int cols,int invalid_cols,int offset,int elemnu
if(id < elemnum)
{
temp = src[idx];
if(id % cols == 0 )
if(id % cols == 0 )
{
repeat_s(temp);
}
@@ -155,7 +155,7 @@ __kernel void arithm_op_nonzero (int cols,int invalid_cols,int offset,int elemnu
{
idx = offset + id + (id / cols) * invalid_cols;
temp = src[idx];
if(id % cols == 0 )
if(id % cols == 0 )
{
repeat_s(temp);
}

View File

@@ -67,7 +67,7 @@ __kernel void arithm_phase_inradians_D5 (__global float *src1, int src1_step, in
float data1 = *((__global float *)((__global char *)src1 + src1_index));
float data2 = *((__global float *)((__global char *)src2 + src2_index));
float tmp = atan2(data2,data1);
*((__global float *)((__global char *)dst + dst_index)) = tmp;
}
@@ -92,7 +92,7 @@ __kernel void arithm_phase_inradians_D6 (__global double *src1, int src1_step, i
double data1 = *((__global double *)((__global char *)src1 + src1_index));
double data2 = *((__global double *)((__global char *)src2 + src2_index));
*((__global double *)((__global char *)dst + dst_index)) = atan2(data2,data1);
}
@@ -119,7 +119,7 @@ __kernel void arithm_phase_indegrees_D5 (__global float *src1, int src1_step, in
float data2 = *((__global float *)((__global char *)src2 + src2_index));
float tmp = atan2(data2,data1);
float tmp_data = 180*tmp/CV_PI;
*((__global float *)((__global char *)dst + dst_index)) = tmp_data;
}
@@ -146,7 +146,7 @@ __kernel void arithm_phase_indegrees_D6 (__global double *src1, int src1_step, i
double data2 = *((__global double *)((__global char *)src2 + src2_index));
double tmp = atan2(data2,data1);
double tmp_data = 180*tmp/CV_PI;
*((__global double *)((__global char *)dst + dst_index)) = tmp_data;
}

View File

@@ -54,8 +54,8 @@
///////////////////////////////////////////////////////////////////////////////////////////////////
__kernel void arithm_polarToCart_mag_D5 (__global float *src1, int src1_step, int src1_offset,//magnitue
__global float *src2, int src2_step, int src2_offset,//angle
__global float *dst1, int dst1_step, int dst1_offset,
__global float *dst2, int dst2_step, int dst2_offset,
__global float *dst1, int dst1_step, int dst1_offset,
__global float *dst2, int dst2_step, int dst2_offset,
int rows, int cols, int angInDegree)
{
int x = get_global_id(0);
@@ -74,7 +74,7 @@ __kernel void arithm_polarToCart_mag_D5 (__global float *src1, int src1_step, in
float ascale = CV_PI/180.0;
float alpha = angInDegree == 1 ? y * ascale : y;
float a = cos(alpha) * x;
float a = cos(alpha) * x;
float b = sin(alpha) * x;
*((__global float *)((__global char *)dst1 + dst1_index)) = a;
@@ -85,8 +85,8 @@ __kernel void arithm_polarToCart_mag_D5 (__global float *src1, int src1_step, in
#if defined (DOUBLE_SUPPORT)
__kernel void arithm_polarToCart_mag_D6 (__global double *src1, int src1_step, int src1_offset,//magnitue
__global double *src2, int src2_step, int src2_offset,//angle
__global double *dst1, int dst1_step, int dst1_offset,
__global double *dst2, int dst2_step, int dst2_offset,
__global double *dst1, int dst1_step, int dst1_offset,
__global double *dst2, int dst2_step, int dst2_offset,
int rows, int cols, int angInDegree)
{
int x = get_global_id(0);
@@ -105,7 +105,7 @@ __kernel void arithm_polarToCart_mag_D6 (__global double *src1, int src1_step, i
float ascale = CV_PI/180.0;
double alpha = angInDegree == 1 ? y * ascale : y;
double a = cos(alpha) * x;
double a = cos(alpha) * x;
double b = sin(alpha) * x;
*((__global double *)((__global char *)dst1 + dst1_index)) = a;
@@ -118,8 +118,8 @@ __kernel void arithm_polarToCart_mag_D6 (__global double *src1, int src1_step, i
/////////////////////////////////////////polarToCart without magnitude//////////////////////////////
///////////////////////////////////////////////////////////////////////////////////////////////////
__kernel void arithm_polarToCart_D5 (__global float *src, int src_step, int src_offset,//angle
__global float *dst1, int dst1_step, int dst1_offset,
__global float *dst2, int dst2_step, int dst2_offset,
__global float *dst1, int dst1_step, int dst1_offset,
__global float *dst2, int dst2_step, int dst2_offset,
int rows, int cols, int angInDegree)
{
int x = get_global_id(0);
@@ -136,7 +136,7 @@ __kernel void arithm_polarToCart_D5 (__global float *src, int src_step, int sr
float ascale = CV_PI/180.0;
float alpha = angInDegree == 1 ? y * ascale : y;
float a = cos(alpha);
float a = cos(alpha);
float b = sin(alpha);
*((__global float *)((__global char *)dst1 + dst1_index)) = a;
@@ -146,8 +146,8 @@ __kernel void arithm_polarToCart_D5 (__global float *src, int src_step, int sr
#if defined (DOUBLE_SUPPORT)
__kernel void arithm_polarToCart_D6 (__global float *src, int src_step, int src_offset,//angle
__global float *dst1, int dst1_step, int dst1_offset,
__global float *dst2, int dst2_step, int dst2_offset,
__global float *dst1, int dst1_step, int dst1_offset,
__global float *dst2, int dst2_step, int dst2_offset,
int rows, int cols, int angInDegree)
{
int x = get_global_id(0);
@@ -164,7 +164,7 @@ __kernel void arithm_polarToCart_D6 (__global float *src, int src_step, int sr
float ascale = CV_PI/180.0;
double alpha = angInDegree == 1 ? y * ascale : y;
double a = cos(alpha);
double a = cos(alpha);
double b = sin(alpha);
*((__global double *)((__global char *)dst1 + dst1_index)) = a;

View File

@@ -70,7 +70,7 @@ __kernel void arithm_pow_D5 (__global float *src1, int src1_step, int src1_offse
float src1_data = *((__global float *)((__global char *)src1 + src1_index));
float tmp = src1_data > 0 ? exp(p * log(src1_data)) : (src1_data == 0 ? 0 : exp(p * log(fabs(src1_data))));
*((__global float *)((__global char *)dst + dst_index)) = tmp;
}
@@ -92,7 +92,7 @@ __kernel void arithm_pow_D6 (__global double *src1, int src1_step, int src1_offs
int dst_index = mad24(y, dst_step, (x << 3) + dst_offset);
double src1_data = *((__global double *)((__global char *)src1 + src1_index));
double tmp = src1_data > 0 ? exp(p * log(src1_data)) : (src1_data == 0 ? 0 : exp(p * log(fabs(src1_data))));
double tmp = src1_data > 0 ? exp(p * log(src1_data)) : (src1_data == 0 ? 0 : exp(p * log(fabs(src1_data))));
*((__global double *)((__global char *)dst + dst_index)) = tmp;
}

View File

@@ -64,8 +64,8 @@ __kernel void arithm_sub_D0 (__global uchar *src1, int src1_step, int src1_offse
x = x << 2;
#define dst_align (dst_offset & 3)
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
@@ -99,8 +99,8 @@ __kernel void arithm_sub_D2 (__global ushort *src1, int src1_step, int src1_offs
x = x << 2;
#define dst_align ((dst_offset >> 1) & 3)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
@@ -134,8 +134,8 @@ __kernel void arithm_sub_D3 (__global short *src1, int src1_step, int src1_offse
x = x << 2;
#define dst_align ((dst_offset >> 1) & 3)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
@@ -240,8 +240,8 @@ __kernel void arithm_sub_with_mask_C1_D0 (__global uchar *src1, int src1_step, i
x = x << 2;
#define dst_align (dst_offset & 3)
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
@@ -279,8 +279,8 @@ __kernel void arithm_sub_with_mask_C1_D2 (__global ushort *src1, int src1_step,
x = x << 1;
#define dst_align ((dst_offset >> 1) & 1)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
@@ -316,8 +316,8 @@ __kernel void arithm_sub_with_mask_C1_D3 (__global short *src1, int src1_step, i
x = x << 1;
#define dst_align ((dst_offset >> 1) & 1)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
@@ -362,7 +362,7 @@ __kernel void arithm_sub_with_mask_C1_D4 (__global int *src1, int src1_step, i
int dst_data = *((__global int *)((__global char *)dst + dst_index));
int data = convert_int_sat((long)src_data1 - (long)src_data2);
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;
*((__global int *)((__global char *)dst + dst_index)) = data;
}
@@ -392,7 +392,7 @@ __kernel void arithm_sub_with_mask_C1_D5 (__global float *src1, int src1_step, i
float dst_data = *((__global float *)((__global char *)dst + dst_index));
float data = src_data1 - src_data2;
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;
*((__global float *)((__global char *)dst + dst_index)) = data;
}
@@ -424,7 +424,7 @@ __kernel void arithm_sub_with_mask_C1_D6 (__global double *src1, int src1_step,
double dst_data = *((__global double *)((__global char *)dst + dst_index));
double data = src_data1 - src_data2;
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;
*((__global double *)((__global char *)dst + dst_index)) = data;
}
@@ -446,8 +446,8 @@ __kernel void arithm_sub_with_mask_C2_D0 (__global uchar *src1, int src1_step, i
x = x << 1;
#define dst_align ((dst_offset >> 1) & 1)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
@@ -493,7 +493,7 @@ __kernel void arithm_sub_with_mask_C2_D2 (__global ushort *src1, int src1_step,
int2 tmp = convert_int2_sat(src_data1) - convert_int2_sat(src_data2);
ushort2 data = convert_ushort2_sat(tmp);
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;
*((__global ushort2 *)((__global char *)dst + dst_index)) = data;
}
@@ -523,7 +523,7 @@ __kernel void arithm_sub_with_mask_C2_D3 (__global short *src1, int src1_step, i
int2 tmp = convert_int2_sat(src_data1) - convert_int2_sat(src_data2);
short2 data = convert_short2_sat(tmp);
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;
*((__global short2 *)((__global char *)dst + dst_index)) = data;
}
@@ -552,7 +552,7 @@ __kernel void arithm_sub_with_mask_C2_D4 (__global int *src1, int src1_step, i
int2 dst_data = *((__global int2 *)((__global char *)dst + dst_index));
int2 data = convert_int2_sat(convert_long2_sat(src_data1) - convert_long2_sat(src_data2));
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;
*((__global int2 *)((__global char *)dst + dst_index)) = data;
}
@@ -581,7 +581,7 @@ __kernel void arithm_sub_with_mask_C2_D5 (__global float *src1, int src1_step, i
float2 dst_data = *((__global float2 *)((__global char *)dst + dst_index));
float2 data = src_data1 - src_data2;
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;
*((__global float2 *)((__global char *)dst + dst_index)) = data;
}
@@ -612,7 +612,7 @@ __kernel void arithm_sub_with_mask_C2_D6 (__global double *src1, int src1_step,
double2 dst_data = *((__global double2 *)((__global char *)dst + dst_index));
double2 data = src_data1 - src_data2;
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;
*((__global double2 *)((__global char *)dst + dst_index)) = data;
}
@@ -633,8 +633,8 @@ __kernel void arithm_sub_with_mask_C3_D0 (__global uchar *src1, int src1_step, i
x = x << 2;
#define dst_align (((dst_offset % dst_step) / 3 ) & 3)
int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));
int src2_index = mad24(y, src2_step, (x * 3) + src2_offset - (dst_align * 3));
int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));
int src2_index = mad24(y, src2_step, (x * 3) + src2_offset - (dst_align * 3));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
@@ -660,17 +660,17 @@ __kernel void arithm_sub_with_mask_C3_D0 (__global uchar *src1, int src1_step, i
uchar4 tmp_data_2 = convert_uchar4_sat(convert_short4_sat(src1_data_2) - convert_short4_sat(src2_data_2));
data_0.xyz = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xyz : data_0.xyz;
data_0.w = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
data_0.w = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
? tmp_data_0.w : data_0.w;
data_1.xy = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
data_1.xy = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
? tmp_data_1.xy : data_1.xy;
data_1.zw = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
data_1.zw = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_1.zw : data_1.zw;
data_2.x = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
data_2.x = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_2.x : data_2.x;
data_2.yzw = ((mask_data.w) && (dst_index + 9 >= dst_start) && (dst_index + 9 < dst_end))
data_2.yzw = ((mask_data.w) && (dst_index + 9 >= dst_start) && (dst_index + 9 < dst_end))
? tmp_data_2.yzw : data_2.yzw;
*((__global uchar4 *)(dst + dst_index + 0)) = data_0;
@@ -693,8 +693,8 @@ __kernel void arithm_sub_with_mask_C3_D2 (__global ushort *src1, int src1_step,
x = x << 1;
#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
int src2_index = mad24(y, src2_step, (x * 6) + src2_offset - (dst_align * 6));
int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
int src2_index = mad24(y, src2_step, (x * 6) + src2_offset - (dst_align * 6));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
@@ -721,12 +721,12 @@ __kernel void arithm_sub_with_mask_C3_D2 (__global ushort *src1, int src1_step,
data_0.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;
data_1.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
data_1.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
? tmp_data_1.x : data_1.x;
data_1.y = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
data_1.y = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_1.y : data_1.y;
data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_2.xy : data_2.xy;
*((__global ushort2 *)((__global char *)dst + dst_index + 0))= data_0;
@@ -749,8 +749,8 @@ __kernel void arithm_sub_with_mask_C3_D3 (__global short *src1, int src1_step, i
x = x << 1;
#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
int src2_index = mad24(y, src2_step, (x * 6) + src2_offset - (dst_align * 6));
int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
int src2_index = mad24(y, src2_step, (x * 6) + src2_offset - (dst_align * 6));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
@@ -777,12 +777,12 @@ __kernel void arithm_sub_with_mask_C3_D3 (__global short *src1, int src1_step, i
data_0.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;
data_1.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
data_1.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
? tmp_data_1.x : data_1.x;
data_1.y = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
data_1.y = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_1.y : data_1.y;
data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_2.xy : data_2.xy;
*((__global short2 *)((__global char *)dst + dst_index + 0))= data_0;
@@ -802,8 +802,8 @@ __kernel void arithm_sub_with_mask_C3_D4 (__global int *src1, int src1_step, i
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x * 12) + src1_offset);
int src2_index = mad24(y, src2_step, (x * 12) + src2_offset);
int src1_index = mad24(y, src1_step, (x * 12) + src1_offset);
int src2_index = mad24(y, src2_step, (x * 12) + src2_offset);
int mask_index = mad24(y, mask_step, x + mask_offset);
int dst_index = mad24(y, dst_step, dst_offset + (x * 12));
@@ -846,15 +846,15 @@ __kernel void arithm_sub_with_mask_C3_D5 (__global float *src1, int src1_step, i
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x * 12) + src1_offset);
int src2_index = mad24(y, src2_step, (x * 12) + src2_offset);
int src1_index = mad24(y, src1_step, (x * 12) + src1_offset);
int src2_index = mad24(y, src2_step, (x * 12) + src2_offset);
int mask_index = mad24(y, mask_step, x + mask_offset);
int dst_index = mad24(y, dst_step, dst_offset + (x * 12));
float src1_data_0 = *((__global float *)((__global char *)src1 + src1_index + 0));
float src1_data_1 = *((__global float *)((__global char *)src1 + src1_index + 4));
float src1_data_2 = *((__global float *)((__global char *)src1 + src1_index + 8));
float src2_data_0 = *((__global float *)((__global char *)src2 + src2_index + 0));
float src2_data_1 = *((__global float *)((__global char *)src2 + src2_index + 4));
float src2_data_2 = *((__global float *)((__global char *)src2 + src2_index + 8));
@@ -892,15 +892,15 @@ __kernel void arithm_sub_with_mask_C3_D6 (__global double *src1, int src1_step,
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x * 24) + src1_offset);
int src2_index = mad24(y, src2_step, (x * 24) + src2_offset);
int src1_index = mad24(y, src1_step, (x * 24) + src1_offset);
int src2_index = mad24(y, src2_step, (x * 24) + src2_offset);
int mask_index = mad24(y, mask_step, x + mask_offset);
int dst_index = mad24(y, dst_step, dst_offset + (x * 24));
double src1_data_0 = *((__global double *)((__global char *)src1 + src1_index + 0 ));
double src1_data_1 = *((__global double *)((__global char *)src1 + src1_index + 8 ));
double src1_data_2 = *((__global double *)((__global char *)src1 + src1_index + 16));
double src2_data_0 = *((__global double *)((__global char *)src2 + src2_index + 0 ));
double src2_data_1 = *((__global double *)((__global char *)src2 + src2_index + 8 ));
double src2_data_2 = *((__global double *)((__global char *)src2 + src2_index + 16));
@@ -949,7 +949,7 @@ __kernel void arithm_sub_with_mask_C4_D0 (__global uchar *src1, int src1_step, i
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
uchar4 data = convert_uchar4_sat(convert_short4_sat(src_data1) - convert_short4_sat(src_data2));
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;
*((__global uchar4 *)(dst + dst_index)) = data;
}
@@ -978,7 +978,7 @@ __kernel void arithm_sub_with_mask_C4_D2 (__global ushort *src1, int src1_step,
ushort4 dst_data = *((__global ushort4 *)((__global char *)dst + dst_index));
ushort4 data = convert_ushort4_sat(convert_int4_sat(src_data1) - convert_int4_sat(src_data2));
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;
*((__global ushort4 *)((__global char *)dst + dst_index)) = data;
}
@@ -1007,7 +1007,7 @@ __kernel void arithm_sub_with_mask_C4_D3 (__global short *src1, int src1_step, i
short4 dst_data = *((__global short4 *)((__global char *)dst + dst_index));
short4 data = convert_short4_sat(convert_int4_sat(src_data1) - convert_int4_sat(src_data2));
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;
*((__global short4 *)((__global char *)dst + dst_index)) = data;
}
@@ -1036,7 +1036,7 @@ __kernel void arithm_sub_with_mask_C4_D4 (__global int *src1, int src1_step, i
int4 dst_data = *((__global int4 *)((__global char *)dst + dst_index));
int4 data = convert_int4_sat(convert_long4_sat(src_data1) - convert_long4_sat(src_data2));
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;
*((__global int4 *)((__global char *)dst + dst_index)) = data;
}
@@ -1065,7 +1065,7 @@ __kernel void arithm_sub_with_mask_C4_D5 (__global float *src1, int src1_step, i
float4 dst_data = *((__global float4 *)((__global char *)dst + dst_index));
float4 data = src_data1 - src_data2;
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;
*((__global float4 *)((__global char *)dst + dst_index)) = data;
}
@@ -1096,7 +1096,7 @@ __kernel void arithm_sub_with_mask_C4_D6 (__global double *src1, int src1_step,
double4 dst_data = *((__global double4 *)((__global char *)dst + dst_index));
double4 data = src_data1 - src_data2;
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;
*((__global double4 *)((__global char *)dst + dst_index)) = data;
}

View File

@@ -59,7 +59,7 @@ __kernel void arithm_s_sub_C1_D0 (__global uchar *src1, int src1_step, int src
x = x << 2;
#define dst_align (dst_offset & 3)
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
@@ -94,7 +94,7 @@ __kernel void arithm_s_sub_C1_D2 (__global ushort *src1, int src1_step, int sr
x = x << 1;
#define dst_align ((dst_offset >> 1) & 1)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
@@ -127,7 +127,7 @@ __kernel void arithm_s_sub_C1_D3 (__global short *src1, int src1_step, int src
x = x << 1;
#define dst_align ((dst_offset >> 1) & 1)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
@@ -231,7 +231,7 @@ __kernel void arithm_s_sub_C2_D0 (__global uchar *src1, int src1_step, int src
x = x << 1;
#define dst_align ((dst_offset >> 1) & 1)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
@@ -385,7 +385,7 @@ __kernel void arithm_s_sub_C3_D0 (__global uchar *src1, int src1_step, int src
x = x << 2;
#define dst_align (((dst_offset % dst_step) / 3 ) & 3)
int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));
int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
@@ -395,9 +395,9 @@ __kernel void arithm_s_sub_C3_D0 (__global uchar *src1, int src1_step, int src
uchar4 src1_data_1 = vload4(0, src1 + src1_index + 4);
uchar4 src1_data_2 = vload4(0, src1 + src1_index + 8);
int4 src2_data_0 = (int4)(src2.x, src2.y, src2.z, src2.x);
int4 src2_data_0 = (int4)(src2.x, src2.y, src2.z, src2.x);
int4 src2_data_1 = (int4)(src2.y, src2.z, src2.x, src2.y);
int4 src2_data_2 = (int4)(src2.z, src2.x, src2.y, src2.z);
int4 src2_data_2 = (int4)(src2.z, src2.x, src2.y, src2.z);
uchar4 data_0 = *((__global uchar4 *)(dst + dst_index + 0));
uchar4 data_1 = *((__global uchar4 *)(dst + dst_index + 4));
@@ -416,17 +416,17 @@ __kernel void arithm_s_sub_C3_D0 (__global uchar *src1, int src1_step, int src
uchar4 tmp_data_2 = convert_uchar4_sat(tmp_2);
data_0.xyz = ((dst_index + 0 >= dst_start)) ? tmp_data_0.xyz : data_0.xyz;
data_0.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
data_0.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
? tmp_data_0.w : data_0.w;
data_1.xy = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
data_1.xy = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
? tmp_data_1.xy : data_1.xy;
data_1.zw = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
data_1.zw = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_1.zw : data_1.zw;
data_2.x = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
data_2.x = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_2.x : data_2.x;
data_2.yzw = ((dst_index + 9 >= dst_start) && (dst_index + 9 < dst_end))
data_2.yzw = ((dst_index + 9 >= dst_start) && (dst_index + 9 < dst_end))
? tmp_data_2.yzw : data_2.yzw;
*((__global uchar4 *)(dst + dst_index + 0)) = data_0;
@@ -447,7 +447,7 @@ __kernel void arithm_s_sub_C3_D2 (__global ushort *src1, int src1_step, int sr
x = x << 1;
#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
@@ -479,12 +479,12 @@ __kernel void arithm_s_sub_C3_D2 (__global ushort *src1, int src1_step, int sr
data_0.xy = ((dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;
data_1.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
data_1.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
? tmp_data_1.x : data_1.x;
data_1.y = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
data_1.y = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_1.y : data_1.y;
data_2.xy = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
data_2.xy = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_2.xy : data_2.xy;
*((__global ushort2 *)((__global char *)dst + dst_index + 0))= data_0;
@@ -505,7 +505,7 @@ __kernel void arithm_s_sub_C3_D3 (__global short *src1, int src1_step, int src
x = x << 1;
#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
@@ -537,12 +537,12 @@ __kernel void arithm_s_sub_C3_D3 (__global short *src1, int src1_step, int src
data_0.xy = ((dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;
data_1.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
data_1.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
? tmp_data_1.x : data_1.x;
data_1.y = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
data_1.y = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_1.y : data_1.y;
data_2.xy = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
data_2.xy = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_2.xy : data_2.xy;
*((__global short2 *)((__global char *)dst + dst_index + 0))= data_0;
@@ -560,7 +560,7 @@ __kernel void arithm_s_sub_C3_D4 (__global int *src1, int src1_step, int src1_
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x * 12) + src1_offset);
int src1_index = mad24(y, src1_step, (x * 12) + src1_offset);
int dst_index = mad24(y, dst_step, dst_offset + (x * 12));
int src1_data_0 = *((__global int *)((__global char *)src1 + src1_index + 0));
@@ -602,13 +602,13 @@ __kernel void arithm_s_sub_C3_D5 (__global float *src1, int src1_step, int src
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x * 12) + src1_offset);
int src1_index = mad24(y, src1_step, (x * 12) + src1_offset);
int dst_index = mad24(y, dst_step, dst_offset + (x * 12));
float src1_data_0 = *((__global float *)((__global char *)src1 + src1_index + 0));
float src1_data_1 = *((__global float *)((__global char *)src1 + src1_index + 4));
float src1_data_2 = *((__global float *)((__global char *)src1 + src1_index + 8));
float src2_data_0 = src2.x;
float src2_data_1 = src2.y;
float src2_data_2 = src2.z;
@@ -642,13 +642,13 @@ __kernel void arithm_s_sub_C3_D6 (__global double *src1, int src1_step, int sr
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x * 24) + src1_offset);
int src1_index = mad24(y, src1_step, (x * 24) + src1_offset);
int dst_index = mad24(y, dst_step, dst_offset + (x * 24));
double src1_data_0 = *((__global double *)((__global char *)src1 + src1_index + 0 ));
double src1_data_1 = *((__global double *)((__global char *)src1 + src1_index + 8 ));
double src1_data_2 = *((__global double *)((__global char *)src1 + src1_index + 16));
double src2_data_0 = src2.x;
double src2_data_1 = src2.y;
double src2_data_2 = src2.z;

View File

@@ -62,7 +62,7 @@ __kernel void arithm_s_sub_with_mask_C1_D0 (__global uchar *src1, int src1_ste
x = x << 2;
#define dst_align (dst_offset & 3)
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
@@ -100,7 +100,7 @@ __kernel void arithm_s_sub_with_mask_C1_D2 (__global ushort *src1, int src1_st
x = x << 1;
#define dst_align ((dst_offset >> 1) & 1)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
@@ -136,7 +136,7 @@ __kernel void arithm_s_sub_with_mask_C1_D3 (__global short *src1, int src1_ste
x = x << 1;
#define dst_align ((dst_offset >> 1) & 1)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
@@ -182,7 +182,7 @@ __kernel void arithm_s_sub_with_mask_C1_D4 (__global int *src1, int src1_ste
long tmp = (long)src_data1 - (long)src_data2;
tmp = isMatSubScalar ? tmp : - tmp;
int data = convert_int_sat(tmp);
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;
*((__global int *)((__global char *)dst + dst_index)) = data;
}
@@ -211,7 +211,7 @@ __kernel void arithm_s_sub_with_mask_C1_D5 (__global float *src1, int src1_s
float data = src_data1 - src_data2;
data = isMatSubScalar ? data : -data;
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;
*((__global float *)((__global char *)dst + dst_index)) = data;
}
@@ -242,7 +242,7 @@ __kernel void arithm_s_sub_with_mask_C1_D6 (__global double *src1, int src1_
double data = src_data1 - src_data2;
data = isMatSubScalar ? data : -data;
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;
*((__global double *)((__global char *)dst + dst_index)) = data;
}
@@ -262,7 +262,7 @@ __kernel void arithm_s_sub_with_mask_C2_D0 (__global uchar *src1, int src1_ste
x = x << 1;
#define dst_align ((dst_offset >> 1) & 1)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
@@ -302,13 +302,13 @@ __kernel void arithm_s_sub_with_mask_C2_D2 (__global ushort *src1, int src1_st
uchar mask_data = *(mask + mask_index);
ushort2 src_data1 = *((__global ushort2 *)((__global char *)src1 + src1_index));
int2 src_data2 = (int2)(src2.x, src2.y);
int2 src_data2 = (int2)(src2.x, src2.y);
ushort2 dst_data = *((__global ushort2 *)((__global char *)dst + dst_index));
int2 tmp = convert_int2_sat(src_data1) - src_data2;
tmp = isMatSubScalar ? tmp : -tmp;
ushort2 data = convert_ushort2_sat(tmp);
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;
*((__global ushort2 *)((__global char *)dst + dst_index)) = data;
}
@@ -331,13 +331,13 @@ __kernel void arithm_s_sub_with_mask_C2_D3 (__global short *src1, int src1_ste
uchar mask_data = *(mask + mask_index);
short2 src_data1 = *((__global short2 *)((__global char *)src1 + src1_index));
int2 src_data2 = (int2)(src2.x, src2.y);
int2 src_data2 = (int2)(src2.x, src2.y);
short2 dst_data = *((__global short2 *)((__global char *)dst + dst_index));
int2 tmp = convert_int2_sat(src_data1) - src_data2;
tmp = isMatSubScalar ? tmp : -tmp;
short2 data = convert_short2_sat(tmp);
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;
*((__global short2 *)((__global char *)dst + dst_index)) = data;
}
@@ -360,13 +360,13 @@ __kernel void arithm_s_sub_with_mask_C2_D4 (__global int *src1, int src1_step,
uchar mask_data = *(mask + mask_index);
int2 src_data1 = *((__global int2 *)((__global char *)src1 + src1_index));
int2 src_data2 = (int2)(src2.x, src2.y);
int2 src_data2 = (int2)(src2.x, src2.y);
int2 dst_data = *((__global int2 *)((__global char *)dst + dst_index));
long2 tmp = convert_long2_sat(src_data1) - convert_long2_sat(src_data2);
tmp = isMatSubScalar ? tmp : -tmp;
int2 data = convert_int2_sat(tmp);
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;
*((__global int2 *)((__global char *)dst + dst_index)) = data;
}
@@ -389,12 +389,12 @@ __kernel void arithm_s_sub_with_mask_C2_D5 (__global float *src1, int src1_ste
uchar mask_data = *(mask + mask_index);
float2 src_data1 = *((__global float2 *)((__global char *)src1 + src1_index));
float2 src_data2 = (float2)(src2.x, src2.y);
float2 src_data2 = (float2)(src2.x, src2.y);
float2 dst_data = *((__global float2 *)((__global char *)dst + dst_index));
float2 data = src_data1 - src_data2;
data = isMatSubScalar ? data : -data;
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;
*((__global float2 *)((__global char *)dst + dst_index)) = data;
}
@@ -419,12 +419,12 @@ __kernel void arithm_s_sub_with_mask_C2_D6 (__global double *src1, int src1_st
uchar mask_data = *(mask + mask_index);
double2 src_data1 = *((__global double2 *)((__global char *)src1 + src1_index));
double2 src_data2 = (double2)(src2.x, src2.y);
double2 src_data2 = (double2)(src2.x, src2.y);
double2 dst_data = *((__global double2 *)((__global char *)dst + dst_index));
double2 data = src_data1 - src_data2;
data = isMatSubScalar ? data : -data;
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;
*((__global double2 *)((__global char *)dst + dst_index)) = data;
}
@@ -444,7 +444,7 @@ __kernel void arithm_s_sub_with_mask_C3_D0 (__global uchar *src1, int src1_ste
x = x << 2;
#define dst_align (((dst_offset % dst_step) / 3 ) & 3)
int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));
int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
@@ -455,9 +455,9 @@ __kernel void arithm_s_sub_with_mask_C3_D0 (__global uchar *src1, int src1_ste
uchar4 src1_data_1 = vload4(0, src1 + src1_index + 4);
uchar4 src1_data_2 = vload4(0, src1 + src1_index + 8);
int4 src2_data_0 = (int4)(src2.x, src2.y, src2.z, src2.x);
int4 src2_data_0 = (int4)(src2.x, src2.y, src2.z, src2.x);
int4 src2_data_1 = (int4)(src2.y, src2.z, src2.x, src2.y);
int4 src2_data_2 = (int4)(src2.z, src2.x, src2.y, src2.z);
int4 src2_data_2 = (int4)(src2.z, src2.x, src2.y, src2.z);
uchar4 mask_data = vload4(0, mask + mask_index);
@@ -478,17 +478,17 @@ __kernel void arithm_s_sub_with_mask_C3_D0 (__global uchar *src1, int src1_ste
uchar4 tmp_data_2 = convert_uchar4_sat(tmp_2);
data_0.xyz = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xyz : data_0.xyz;
data_0.w = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
data_0.w = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
? tmp_data_0.w : data_0.w;
data_1.xy = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
data_1.xy = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
? tmp_data_1.xy : data_1.xy;
data_1.zw = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
data_1.zw = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_1.zw : data_1.zw;
data_2.x = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
data_2.x = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_2.x : data_2.x;
data_2.yzw = ((mask_data.w) && (dst_index + 9 >= dst_start) && (dst_index + 9 < dst_end))
data_2.yzw = ((mask_data.w) && (dst_index + 9 >= dst_start) && (dst_index + 9 < dst_end))
? tmp_data_2.yzw : data_2.yzw;
*((__global uchar4 *)(dst + dst_index + 0)) = data_0;
@@ -510,7 +510,7 @@ __kernel void arithm_s_sub_with_mask_C3_D2 (__global ushort *src1, int src1_st
x = x << 1;
#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
@@ -521,9 +521,9 @@ __kernel void arithm_s_sub_with_mask_C3_D2 (__global ushort *src1, int src1_st
ushort2 src1_data_1 = vload2(0, (__global ushort *)((__global char *)src1 + src1_index + 4));
ushort2 src1_data_2 = vload2(0, (__global ushort *)((__global char *)src1 + src1_index + 8));
int2 src2_data_0 = (int2)(src2.x, src2.y);
int2 src2_data_1 = (int2)(src2.z, src2.x);
int2 src2_data_2 = (int2)(src2.y, src2.z);
int2 src2_data_0 = (int2)(src2.x, src2.y);
int2 src2_data_1 = (int2)(src2.z, src2.x);
int2 src2_data_2 = (int2)(src2.y, src2.z);
uchar2 mask_data = vload2(0, mask + mask_index);
@@ -545,12 +545,12 @@ __kernel void arithm_s_sub_with_mask_C3_D2 (__global ushort *src1, int src1_st
data_0.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;
data_1.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
data_1.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
? tmp_data_1.x : data_1.x;
data_1.y = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
data_1.y = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_1.y : data_1.y;
data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_2.xy : data_2.xy;
*((__global ushort2 *)((__global char *)dst + dst_index + 0))= data_0;
@@ -572,7 +572,7 @@ __kernel void arithm_s_sub_with_mask_C3_D3 (__global short *src1, int src1_ste
x = x << 1;
#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
@@ -583,9 +583,9 @@ __kernel void arithm_s_sub_with_mask_C3_D3 (__global short *src1, int src1_ste
short2 src1_data_1 = vload2(0, (__global short *)((__global char *)src1 + src1_index + 4));
short2 src1_data_2 = vload2(0, (__global short *)((__global char *)src1 + src1_index + 8));
int2 src2_data_0 = (int2)(src2.x, src2.y);
int2 src2_data_1 = (int2)(src2.z, src2.x);
int2 src2_data_2 = (int2)(src2.y, src2.z);
int2 src2_data_0 = (int2)(src2.x, src2.y);
int2 src2_data_1 = (int2)(src2.z, src2.x);
int2 src2_data_2 = (int2)(src2.y, src2.z);
uchar2 mask_data = vload2(0, mask + mask_index);
@@ -607,12 +607,12 @@ __kernel void arithm_s_sub_with_mask_C3_D3 (__global short *src1, int src1_ste
data_0.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;
data_1.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
data_1.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
? tmp_data_1.x : data_1.x;
data_1.y = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
data_1.y = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_1.y : data_1.y;
data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_2.xy : data_2.xy;
*((__global short2 *)((__global char *)dst + dst_index + 0))= data_0;
@@ -631,7 +631,7 @@ __kernel void arithm_s_sub_with_mask_C3_D4 (__global int *src1, int src1_step,
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x * 12) + src1_offset);
int src1_index = mad24(y, src1_step, (x * 12) + src1_offset);
int mask_index = mad24(y, mask_step, x + mask_offset);
int dst_index = mad24(y, dst_step, dst_offset + (x * 12));
@@ -639,9 +639,9 @@ __kernel void arithm_s_sub_with_mask_C3_D4 (__global int *src1, int src1_step,
int src1_data_1 = *((__global int *)((__global char *)src1 + src1_index + 4));
int src1_data_2 = *((__global int *)((__global char *)src1 + src1_index + 8));
int src2_data_0 = src2.x;
int src2_data_0 = src2.x;
int src2_data_1 = src2.y;
int src2_data_2 = src2.z;
int src2_data_2 = src2.z;
uchar mask_data = * (mask + mask_index);
@@ -652,7 +652,7 @@ __kernel void arithm_s_sub_with_mask_C3_D4 (__global int *src1, int src1_step,
long tmp_0 = (long)src1_data_0 - (long)src2_data_0;
long tmp_1 = (long)src1_data_1 - (long)src2_data_1;
long tmp_2 = (long)src1_data_2 - (long)src2_data_2;
tmp_0 = isMatSubScalar ? tmp_0 : -tmp_0;
tmp_1 = isMatSubScalar ? tmp_1 : -tmp_1;
tmp_2 = isMatSubScalar ? tmp_2 : -tmp_2;
@@ -681,17 +681,17 @@ __kernel void arithm_s_sub_with_mask_C3_D5 (__global float *src1, int src1_ste
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x * 12) + src1_offset);
int src1_index = mad24(y, src1_step, (x * 12) + src1_offset);
int mask_index = mad24(y, mask_step, x + mask_offset);
int dst_index = mad24(y, dst_step, dst_offset + (x * 12));
float src1_data_0 = *((__global float *)((__global char *)src1 + src1_index + 0));
float src1_data_1 = *((__global float *)((__global char *)src1 + src1_index + 4));
float src1_data_2 = *((__global float *)((__global char *)src1 + src1_index + 8));
float src2_data_0 = src2.x;
float src2_data_0 = src2.x;
float src2_data_1 = src2.y;
float src2_data_2 = src2.z;
float src2_data_2 = src2.z;
uchar mask_data = * (mask + mask_index);
@@ -729,17 +729,17 @@ __kernel void arithm_s_sub_with_mask_C3_D6 (__global double *src1, int src1_st
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x * 24) + src1_offset);
int src1_index = mad24(y, src1_step, (x * 24) + src1_offset);
int mask_index = mad24(y, mask_step, x + mask_offset);
int dst_index = mad24(y, dst_step, dst_offset + (x * 24));
double src1_data_0 = *((__global double *)((__global char *)src1 + src1_index + 0 ));
double src1_data_1 = *((__global double *)((__global char *)src1 + src1_index + 8 ));
double src1_data_2 = *((__global double *)((__global char *)src1 + src1_index + 16));
double src2_data_0 = src2.x;
double src2_data_0 = src2.x;
double src2_data_1 = src2.y;
double src2_data_2 = src2.z;
double src2_data_2 = src2.z;
uchar mask_data = * (mask + mask_index);
@@ -789,7 +789,7 @@ __kernel void arithm_s_sub_with_mask_C4_D0 (__global uchar *src1, int src1_ste
tmp = isMatSubScalar ? tmp : -tmp;
uchar4 data = convert_uchar4_sat(tmp);
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;
*((__global uchar4 *)(dst + dst_index)) = data;
}
@@ -818,7 +818,7 @@ __kernel void arithm_s_sub_with_mask_C4_D2 (__global ushort *src1, int src1_st
tmp = isMatSubScalar ? tmp : -tmp;
ushort4 data = convert_ushort4_sat(tmp);
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;
*((__global ushort4 *)((__global char *)dst + dst_index)) = data;
}
@@ -847,7 +847,7 @@ __kernel void arithm_s_sub_with_mask_C4_D3 (__global short *src1, int src1_ste
tmp = isMatSubScalar ? tmp : -tmp;
short4 data = convert_short4_sat(tmp);
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;
*((__global short4 *)((__global char *)dst + dst_index)) = data;
}
@@ -876,7 +876,7 @@ __kernel void arithm_s_sub_with_mask_C4_D4 (__global int *src1, int src1_step,
tmp = isMatSubScalar ? tmp : -tmp;
int4 data = convert_int4_sat(tmp);
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;
*((__global int4 *)((__global char *)dst + dst_index)) = data;
}
@@ -904,7 +904,7 @@ __kernel void arithm_s_sub_with_mask_C4_D5 (__global float *src1, int src1_ste
float4 data = src_data1 - src2;
data = isMatSubScalar ? data : -data;
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;
*((__global float4 *)((__global char *)dst + dst_index)) = data;
}
@@ -933,7 +933,7 @@ __kernel void arithm_s_sub_with_mask_C4_D6 (__global double *src1, int src1_st
double4 data = src_data1 - src2;
data = isMatSubScalar ? data : -data;
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;
*((__global double4 *)((__global char *)dst + dst_index)) = data;
}

View File

@@ -151,7 +151,7 @@ __kernel void arithm_op_sum (int cols,int invalid_cols,int offset,int elemnum,in
if(id < elemnum)
{
temp = CONVERT_RES_TYPE(src[idx]);
if(id % cols == 0 )
if(id % cols == 0 )
{
repeat_s(temp);
}
@@ -169,7 +169,7 @@ __kernel void arithm_op_sum (int cols,int invalid_cols,int offset,int elemnum,in
{
idx = offset + id + (id / cols) * invalid_cols;
temp = CONVERT_RES_TYPE(src[idx]);
if(id % cols == 0 )
if(id % cols == 0 )
{
repeat_s(temp);
}

View File

@@ -159,7 +159,7 @@
#define repeat_e(a,b,c) a.s3=0; a.s2=0; a.s1=0; b=0; c=0;
#endif
__kernel void arithm_op_sum_3 (int cols,int invalid_cols,int offset,int elemnum,int groupnum,
__kernel void arithm_op_sum_3 (int cols,int invalid_cols,int offset,int elemnum,int groupnum,
__global VEC_TYPE *src, __global RES_TYPE *dst)
{
unsigned int lid = get_local_id(0);
@@ -176,7 +176,7 @@ __kernel void arithm_op_sum_3 (int cols,int invalid_cols,int offset,int elemnum,
temp1 = CONVERT_RES_TYPE(src[idx]);
temp2 = CONVERT_RES_TYPE(src[idx+1]);
temp3 = CONVERT_RES_TYPE(src[idx+2]);
if(id % cols == 0 )
if(id % cols == 0 )
{
repeat_s(temp1,temp2,temp3);
}
@@ -201,7 +201,7 @@ __kernel void arithm_op_sum_3 (int cols,int invalid_cols,int offset,int elemnum,
temp1 = CONVERT_RES_TYPE(src[idx]);
temp2 = CONVERT_RES_TYPE(src[idx+1]);
temp3 = CONVERT_RES_TYPE(src[idx+2]);
if(id % cols == 0 )
if(id % cols == 0 )
{
repeat_s(temp1,temp2,temp3);
}

View File

@@ -43,14 +43,14 @@
//
//M*/
#define TILE_DIM 32
#define BLOCK_ROWS 8
#define TILE_DIM 32
#define BLOCK_ROWS 8
#define LDS_STEP (TILE_DIM + 1)
//8UC1 is not unoptimized, as the size of write per thread is 8
//8UC1 is not unoptimized, as the size of write per thread is 8
//which will use completepath
__kernel void transpose_C1_D0(__global uchar* src, int src_step, int src_offset,
__kernel void transpose_C1_D0(__global uchar* src, int src_step, int src_offset,
__global uchar* dst, int dst_step, int dst_offset,
int src_rows, int src_cols)
{
@@ -62,13 +62,13 @@ __kernel void transpose_C1_D0(__global uchar* src, int src_step, int src_offset,
if(src_rows == src_cols)
{
groupId_y = gp_x;
groupId_y = gp_x;
groupId_x = (gp_x + gp_y) % gs_x;
}
else
{
int bid = gp_x + gs_x * gp_y;
groupId_y = bid % gs_y;
int bid = gp_x + gs_x * gp_y;
groupId_y = bid % gs_y;
groupId_x = ((bid / gs_y) + groupId_y) % gs_x;
}
@@ -87,7 +87,7 @@ __kernel void transpose_C1_D0(__global uchar* src, int src_step, int src_offset,
{
int index_src = mad24(y, src_step, x);
#pragma unroll
#pragma unroll
for(int i = 0; i < TILE_DIM; i += BLOCK_ROWS)
{
if(y + i < src_rows)
@@ -109,14 +109,14 @@ __kernel void transpose_C1_D0(__global uchar* src, int src_step, int src_offset,
{
if((y_index + i) < src_cols)
{
*(dst + dst_offset + index_dst ) = title[lx * LDS_STEP + ly + i];
*(dst + dst_offset + index_dst ) = title[lx * LDS_STEP + ly + i];
index_dst += dst_step * BLOCK_ROWS ;
}
}
}
}
__kernel void transpose_C1_D4(__global int* src, int src_step, int src_offset,
__kernel void transpose_C1_D4(__global int* src, int src_step, int src_offset,
__global int* dst, int dst_step, int dst_offset,
int src_rows, int src_cols)
{
@@ -128,13 +128,13 @@ __kernel void transpose_C1_D4(__global int* src, int src_step, int src_offset,
if(src_rows == src_cols)
{
groupId_y = gp_x;
groupId_y = gp_x;
groupId_x = (gp_x + gp_y) % gs_x;
}
else
{
int bid = gp_x + gs_x * gp_y;
groupId_y = bid % gs_y;
int bid = gp_x + gs_x * gp_y;
groupId_y = bid % gs_y;
groupId_x = ((bid / gs_y) + groupId_y) % gs_x;
}
@@ -153,7 +153,7 @@ __kernel void transpose_C1_D4(__global int* src, int src_step, int src_offset,
{
int index_src = mad24(y, src_step, (x << 2));
#pragma unroll
#pragma unroll
for(int i = 0; i < TILE_DIM; i += BLOCK_ROWS)
{
if(y + i < src_rows)
@@ -175,13 +175,13 @@ __kernel void transpose_C1_D4(__global int* src, int src_step, int src_offset,
{
if((y_index + i) < src_cols)
{
*((__global int*)((__global char*)dst + dst_offset + index_dst )) = title[lx * LDS_STEP + ly + i];
*((__global int*)((__global char*)dst + dst_offset + index_dst )) = title[lx * LDS_STEP + ly + i];
index_dst += dst_step * BLOCK_ROWS ;
}
}
}
}
__kernel void transpose_C1_D5(__global float* src, int src_step, int src_offset,
__kernel void transpose_C1_D5(__global float* src, int src_step, int src_offset,
__global float* dst, int dst_step, int dst_offset,
int src_rows, int src_cols)
{
@@ -193,13 +193,13 @@ __kernel void transpose_C1_D5(__global float* src, int src_step, int src_offset,
if(src_rows == src_cols)
{
groupId_y = gp_x;
groupId_y = gp_x;
groupId_x = (gp_x + gp_y) % gs_x;
}
else
{
int bid = gp_x + gs_x * gp_y;
groupId_y = bid % gs_y;
int bid = gp_x + gs_x * gp_y;
groupId_y = bid % gs_y;
groupId_x = ((bid / gs_y) + groupId_y) % gs_x;
}
@@ -218,7 +218,7 @@ __kernel void transpose_C1_D5(__global float* src, int src_step, int src_offset,
{
int index_src = mad24(y, src_step, (x << 2));
#pragma unroll
#pragma unroll
for(int i = 0; i < TILE_DIM; i += BLOCK_ROWS)
{
if(y + i < src_rows)
@@ -240,14 +240,14 @@ __kernel void transpose_C1_D5(__global float* src, int src_step, int src_offset,
{
if((y_index + i) < src_cols)
{
*((__global float*)((__global char*)dst + dst_offset + index_dst )) = title[lx * LDS_STEP + ly + i];
*((__global float*)((__global char*)dst + dst_offset + index_dst )) = title[lx * LDS_STEP + ly + i];
index_dst += dst_step * BLOCK_ROWS ;
}
}
}
}
__kernel void transpose_C2_D2(__global ushort* src, int src_step, int src_offset,
__kernel void transpose_C2_D2(__global ushort* src, int src_step, int src_offset,
__global ushort* dst, int dst_step, int dst_offset,
int src_rows, int src_cols)
{
@@ -259,13 +259,13 @@ __kernel void transpose_C2_D2(__global ushort* src, int src_step, int src_offset
if(src_rows == src_cols)
{
groupId_y = gp_x;
groupId_y = gp_x;
groupId_x = (gp_x + gp_y) % gs_x;
}
else
{
int bid = gp_x + gs_x * gp_y;
groupId_y = bid % gs_y;
int bid = gp_x + gs_x * gp_y;
groupId_y = bid % gs_y;
groupId_x = ((bid / gs_y) + groupId_y) % gs_x;
}
@@ -284,7 +284,7 @@ __kernel void transpose_C2_D2(__global ushort* src, int src_step, int src_offset
{
int index_src = mad24(y, src_step, (x << 2));
#pragma unroll
#pragma unroll
for(int i = 0; i < TILE_DIM; i += BLOCK_ROWS)
{
if(y + i < src_rows)
@@ -306,13 +306,13 @@ __kernel void transpose_C2_D2(__global ushort* src, int src_step, int src_offset
{
if((y_index + i) < src_cols)
{
*((__global ushort2*)((__global char*)dst + dst_offset + index_dst )) = title[lx * LDS_STEP + ly + i];
*((__global ushort2*)((__global char*)dst + dst_offset + index_dst )) = title[lx * LDS_STEP + ly + i];
index_dst += dst_step * BLOCK_ROWS ;
}
}
}
}
__kernel void transpose_C2_D3(__global short* src, int src_step, int src_offset,
__kernel void transpose_C2_D3(__global short* src, int src_step, int src_offset,
__global short* dst, int dst_step, int dst_offset,
int src_rows, int src_cols)
{
@@ -324,13 +324,13 @@ __kernel void transpose_C2_D3(__global short* src, int src_step, int src_offset,
if(src_rows == src_cols)
{
groupId_y = gp_x;
groupId_y = gp_x;
groupId_x = (gp_x + gp_y) % gs_x;
}
else
{
int bid = gp_x + gs_x * gp_y;
groupId_y = bid % gs_y;
int bid = gp_x + gs_x * gp_y;
groupId_y = bid % gs_y;
groupId_x = ((bid / gs_y) + groupId_y) % gs_x;
}
@@ -349,7 +349,7 @@ __kernel void transpose_C2_D3(__global short* src, int src_step, int src_offset,
{
int index_src = mad24(y, src_step, (x << 2));
#pragma unroll
#pragma unroll
for(int i = 0; i < TILE_DIM; i += BLOCK_ROWS)
{
if(y + i < src_rows)
@@ -371,13 +371,13 @@ __kernel void transpose_C2_D3(__global short* src, int src_step, int src_offset,
{
if((y_index + i) < src_cols)
{
*((__global short2*)((__global char*)dst + dst_offset + index_dst )) = title[lx * LDS_STEP + ly + i];
*((__global short2*)((__global char*)dst + dst_offset + index_dst )) = title[lx * LDS_STEP + ly + i];
index_dst += dst_step * BLOCK_ROWS ;
}
}
}
}
__kernel void transpose_C4_D0(__global uchar* src, int src_step, int src_offset,
__kernel void transpose_C4_D0(__global uchar* src, int src_step, int src_offset,
__global uchar* dst, int dst_step, int dst_offset,
int src_rows, int src_cols)
{
@@ -389,13 +389,13 @@ __kernel void transpose_C4_D0(__global uchar* src, int src_step, int src_offset,
if(src_rows == src_cols)
{
groupId_y = gp_x;
groupId_y = gp_x;
groupId_x = (gp_x + gp_y) % gs_x;
}
else
{
int bid = gp_x + gs_x * gp_y;
groupId_y = bid % gs_y;
int bid = gp_x + gs_x * gp_y;
groupId_y = bid % gs_y;
groupId_x = ((bid / gs_y) + groupId_y) % gs_x;
}
@@ -414,7 +414,7 @@ __kernel void transpose_C4_D0(__global uchar* src, int src_step, int src_offset,
{
int index_src = mad24(y, src_step, (x << 2));
#pragma unroll
#pragma unroll
for(int i = 0; i < TILE_DIM; i += BLOCK_ROWS)
{
if(y + i < src_rows)
@@ -436,14 +436,14 @@ __kernel void transpose_C4_D0(__global uchar* src, int src_step, int src_offset,
{
if((y_index + i) < src_cols)
{
*((__global uchar4*)(dst + dst_offset + index_dst )) = title[lx * LDS_STEP + ly + i];
*((__global uchar4*)(dst + dst_offset + index_dst )) = title[lx * LDS_STEP + ly + i];
index_dst += dst_step * BLOCK_ROWS ;
}
}
}
}
__kernel void transpose_C4_D1(__global char* src, int src_step, int src_offset,
__kernel void transpose_C4_D1(__global char* src, int src_step, int src_offset,
__global char* dst, int dst_step, int dst_offset,
int src_rows, int src_cols)
{
@@ -455,13 +455,13 @@ __kernel void transpose_C4_D1(__global char* src, int src_step, int src_offset,
if(src_rows == src_cols)
{
groupId_y = gp_x;
groupId_y = gp_x;
groupId_x = (gp_x + gp_y) % gs_x;
}
else
{
int bid = gp_x + gs_x * gp_y;
groupId_y = bid % gs_y;
int bid = gp_x + gs_x * gp_y;
groupId_y = bid % gs_y;
groupId_x = ((bid / gs_y) + groupId_y) % gs_x;
}
@@ -480,7 +480,7 @@ __kernel void transpose_C4_D1(__global char* src, int src_step, int src_offset,
{
int index_src = mad24(y, src_step, (x << 2));
#pragma unroll
#pragma unroll
for(int i = 0; i < TILE_DIM; i += BLOCK_ROWS)
{
if(y + i < src_rows)
@@ -502,7 +502,7 @@ __kernel void transpose_C4_D1(__global char* src, int src_step, int src_offset,
{
if((y_index + i) < src_cols)
{
*((__global char4*)(dst + dst_offset + index_dst )) = title[lx * LDS_STEP + ly + i];
*((__global char4*)(dst + dst_offset + index_dst )) = title[lx * LDS_STEP + ly + i];
index_dst += dst_step * BLOCK_ROWS ;
}
}

View File

@@ -15,7 +15,7 @@
// Third party copyrights are property of their respective owners.
//
// @Authors
// Liu Liujun, liujun@multicorewareinc.com
// Liu Liujun, liujun@multicorewareinc.com
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
@@ -43,103 +43,103 @@
//
//M*/
__kernel void BlendLinear_C1_D0(
__global uchar *dst,
__global uchar *img1,
__global uchar *img2,
__global float *weight1,
__global float *weight2,
int rows,
int cols,
int istep,
int wstep
)
__global uchar *dst,
__global uchar *img1,
__global uchar *img2,
__global float *weight1,
__global float *weight2,
int rows,
int cols,
int istep,
int wstep
)
{
int idx = get_global_id(0);
int idy = get_global_id(1);
if (idx < cols && idy < rows)
{
int pos = mad24(idy,istep,idx);
int wpos = mad24(idy,wstep,idx);
float w1 = weight1[wpos];
float w2 = weight2[wpos];
dst[pos] = (img1[pos] * w1 + img2[pos] * w2) / (w1 + w2 + 1e-5f);
int idx = get_global_id(0);
int idy = get_global_id(1);
if (idx < cols && idy < rows)
{
int pos = mad24(idy,istep,idx);
int wpos = mad24(idy,wstep,idx);
float w1 = weight1[wpos];
float w2 = weight2[wpos];
dst[pos] = (img1[pos] * w1 + img2[pos] * w2) / (w1 + w2 + 1e-5f);
}
}
}
__kernel void BlendLinear_C4_D0(
__global uchar *dst,
__global uchar *img1,
__global uchar *img2,
__global float *weight1,
__global float *weight2,
int rows,
int cols,
int istep,
int wstep
)
__global uchar *dst,
__global uchar *img1,
__global uchar *img2,
__global float *weight1,
__global float *weight2,
int rows,
int cols,
int istep,
int wstep
)
{
int idx = get_global_id(0);
int idy = get_global_id(1);
int x = idx / 4;
int y = idy;
if (x < cols && y < rows)
{
int pos = mad24(idy,istep,idx);
int wpos = mad24(idy,wstep,x);
float w1 = weight1[wpos];
float w2 = weight2[wpos];
dst[pos] = (img1[pos] * w1 + img2[pos] * w2) / (w1 + w2 + 1e-5f);
}
int idx = get_global_id(0);
int idy = get_global_id(1);
int x = idx / 4;
int y = idy;
if (x < cols && y < rows)
{
int pos = mad24(idy,istep,idx);
int wpos = mad24(idy,wstep,x);
float w1 = weight1[wpos];
float w2 = weight2[wpos];
dst[pos] = (img1[pos] * w1 + img2[pos] * w2) / (w1 + w2 + 1e-5f);
}
}
__kernel void BlendLinear_C1_D5(
__global float *dst,
__global float *img1,
__global float *img2,
__global float *weight1,
__global float *weight2,
int rows,
int cols,
int istep,
int wstep
)
__global float *dst,
__global float *img1,
__global float *img2,
__global float *weight1,
__global float *weight2,
int rows,
int cols,
int istep,
int wstep
)
{
int idx = get_global_id(0);
int idy = get_global_id(1);
if (idx < cols && idy < rows)
{
int pos = mad24(idy,istep,idx);
int wpos = mad24(idy,wstep,idx);
float w1 = weight1[wpos];
float w2 = weight2[wpos];
dst[pos] = (img1[pos] * w1 + img2[pos] * w2) / (w1 + w2 + 1e-5f);
}
int idx = get_global_id(0);
int idy = get_global_id(1);
if (idx < cols && idy < rows)
{
int pos = mad24(idy,istep,idx);
int wpos = mad24(idy,wstep,idx);
float w1 = weight1[wpos];
float w2 = weight2[wpos];
dst[pos] = (img1[pos] * w1 + img2[pos] * w2) / (w1 + w2 + 1e-5f);
}
}
__kernel void BlendLinear_C4_D5(
__global float *dst,
__global float *img1,
__global float *img2,
__global float *weight1,
__global float *weight2,
int rows,
int cols,
int istep,
int wstep
)
__global float *dst,
__global float *img1,
__global float *img2,
__global float *weight1,
__global float *weight2,
int rows,
int cols,
int istep,
int wstep
)
{
int idx = get_global_id(0);
int idy = get_global_id(1);
int x = idx / 4;
int y = idy;
if (x < cols && y < rows)
{
int pos = mad24(idy,istep,idx);
int wpos = mad24(idy,wstep,x);
float w1 = weight1[wpos];
float w2 = weight2[wpos];
dst[pos] = (img1[pos] * w1 + img2[pos] * w2) / (w1 + w2 + 1e-5f);
}
int idx = get_global_id(0);
int idy = get_global_id(1);
int x = idx / 4;
int y = idy;
if (x < cols && y < rows)
{
int pos = mad24(idy,istep,idx);
int wpos = mad24(idy,wstep,x);
float w1 = weight1[wpos];
float w2 = weight2[wpos];
dst[pos] = (img1[pos] * w1 + img2[pos] * w2) / (w1 + w2 + 1e-5f);
}
}

File diff suppressed because it is too large Load Diff

View File

@@ -1,237 +1,237 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// @Authors
// Peng Xiao, pengxiao@multicorewareinc.com
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other oclMaterials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors as is and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
__kernel
void buildWarpPlaneMaps
(
    __global float * map_x,
    __global float * map_y,
    __constant float * KRT,
    int tl_u,
    int tl_v,
    int cols,
    int rows,
    int step_x,
    int step_y,
    float scale
)
{
    // One work-item per destination pixel of the (cols x rows) tile whose
    // top-left corner is (tl_u, tl_v); writes the source coordinate into
    // the remap tables map_x / map_y.
    const int du = get_global_id(0);
    const int dv = get_global_id(1);

    // step_x/step_y arrive in bytes; convert to float-element strides.
    step_x /= sizeof(float);
    step_y /= sizeof(float);

    // KRT packs K*R^-1 (first 9 floats) followed by the translation (next 3).
    __constant float * ck_rinv = KRT;
    __constant float * ct = KRT + 9;

    if (du >= cols || dv >= rows)
        return;

    const float u = tl_u + du;
    const float v = tl_v + dv;

    // Back-project the scaled image point, shifted by the translation.
    const float x_ = u / scale - ct[0];
    const float y_ = v / scale - ct[1];
    // Homogeneous third component, kept exactly as in the original math.
    const float w_ = 1 - ct[2];

    float x = ck_rinv[0] * x_ + ck_rinv[1] * y_ + ck_rinv[2] * w_;
    float y = ck_rinv[3] * x_ + ck_rinv[4] * y_ + ck_rinv[5] * w_;
    float z = ck_rinv[6] * x_ + ck_rinv[7] * y_ + ck_rinv[8] * w_;

    // Perspective divide back to image coordinates.
    x /= z;
    y /= z;

    map_x[dv * step_x + du] = x;
    map_y[dv * step_y + du] = y;
}
__kernel
void buildWarpCylindricalMaps
(
    __global float * map_x,
    __global float * map_y,
    __constant float * ck_rinv,
    int tl_u,
    int tl_v,
    int cols,
    int rows,
    int step_x,
    int step_y,
    float scale
)
{
    // One work-item per destination pixel; fills the remap tables for a
    // cylindrical projection using ck_rinv = K * R^-1.
    const int du = get_global_id(0);
    const int dv = get_global_id(1);

    // Byte steps -> float-element strides.
    step_x /= sizeof(float);
    step_y /= sizeof(float);

    if (du >= cols || dv >= rows)
        return;

    float u = tl_u + du;
    float v = tl_v + dv;
    u /= scale;

    // Point on the unit cylinder corresponding to angle u and height v/scale.
    const float x_ = sin(u);
    const float y_ = v / scale;
    const float z_ = cos(u);

    // Rotate into the source camera frame.
    float x = ck_rinv[0] * x_ + ck_rinv[1] * y_ + ck_rinv[2] * z_;
    float y = ck_rinv[3] * x_ + ck_rinv[4] * y_ + ck_rinv[5] * z_;
    float z = ck_rinv[6] * x_ + ck_rinv[7] * y_ + ck_rinv[8] * z_;

    if (z > 0)
    {
        x /= z;
        y /= z;
    }
    else
    {
        // Behind the camera: mark as invalid (outside any source image).
        x = y = -1;
    }

    map_x[dv * step_x + du] = x;
    map_y[dv * step_y + du] = y;
}
__kernel
void buildWarpSphericalMaps
(
    __global float * map_x,
    __global float * map_y,
    __constant float * ck_rinv,
    int tl_u,
    int tl_v,
    int cols,
    int rows,
    int step_x,
    int step_y,
    float scale
)
{
    // One work-item per destination pixel; fills the remap tables for a
    // spherical projection using ck_rinv = K * R^-1.
    const int du = get_global_id(0);
    const int dv = get_global_id(1);

    // Byte steps -> float-element strides.
    step_x /= sizeof(float);
    step_y /= sizeof(float);

    if (du >= cols || dv >= rows)
        return;

    float u = tl_u + du;
    float v = tl_v + dv;
    v /= scale;
    u /= scale;

    // Point on the unit sphere for spherical angles (u, v).
    const float sinv = sin(v);
    const float x_ = sinv * sin(u);
    const float y_ = - cos(v);
    const float z_ = sinv * cos(u);

    // Rotate into the source camera frame.
    float x = ck_rinv[0] * x_ + ck_rinv[1] * y_ + ck_rinv[2] * z_;
    float y = ck_rinv[3] * x_ + ck_rinv[4] * y_ + ck_rinv[5] * z_;
    float z = ck_rinv[6] * x_ + ck_rinv[7] * y_ + ck_rinv[8] * z_;

    if (z > 0)
    {
        x /= z;
        y /= z;
    }
    else
    {
        // Behind the camera: mark as invalid (outside any source image).
        x = y = -1;
    }

    map_x[dv * step_x + du] = x;
    map_y[dv * step_y + du] = y;
}
__kernel
void buildWarpAffineMaps
(
    __global float * xmap,
    __global float * ymap,
    __constant float * c_warpMat,
    int cols,
    int rows,
    int step_x,
    int step_y
)
{
    // One work-item per destination pixel; applies the inverse 2x3 affine
    // matrix c_warpMat = [a0 a1 a2; a3 a4 a5] and stores the source
    // coordinate into the remap tables xmap / ymap.
    int x = get_global_id(0);
    int y = get_global_id(1);

    // Byte steps -> float-element strides.
    step_x /= sizeof(float);
    step_y /= sizeof(float);

    if (x < cols && y < rows)
    {
        const float xcoo = c_warpMat[0] * x + c_warpMat[1] * y + c_warpMat[2];
        const float ycoo = c_warpMat[3] * x + c_warpMat[4] * y + c_warpMat[5];

        // BUG FIX: the stores previously targeted map_x/map_y, which are
        // not declared in this kernel; the output parameters are xmap/ymap.
        xmap[y * step_x + x] = xcoo;
        ymap[y * step_y + x] = ycoo;
    }
}
__kernel
void buildWarpPerspectiveMaps
(
    __global float * xmap,
    __global float * ymap,
    __constant float * c_warpMat,
    int cols,
    int rows,
    int step_x,
    int step_y
)
{
    // One work-item per destination pixel; applies the inverse 3x3
    // perspective matrix c_warpMat (row-major) with a homogeneous divide
    // and stores the source coordinate into the remap tables xmap / ymap.
    int x = get_global_id(0);
    int y = get_global_id(1);

    // Byte steps -> float-element strides.
    step_x /= sizeof(float);
    step_y /= sizeof(float);

    if (x < cols && y < rows)
    {
        // NOTE(review): no guard against a zero denominator, matching the
        // original; callers are expected to pass a non-degenerate matrix.
        const float coeff = 1.0f / (c_warpMat[6] * x + c_warpMat[7] * y + c_warpMat[8]);
        const float xcoo = coeff * (c_warpMat[0] * x + c_warpMat[1] * y + c_warpMat[2]);
        const float ycoo = coeff * (c_warpMat[3] * x + c_warpMat[4] * y + c_warpMat[5]);

        // BUG FIX: the stores previously targeted map_x/map_y, which are
        // not declared in this kernel; the output parameters are xmap/ymap.
        xmap[y * step_x + x] = xcoo;
        ymap[y * step_y + x] = ycoo;
    }
}
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// @Authors
// Peng Xiao, pengxiao@multicorewareinc.com
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors as is and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
__kernel
void buildWarpPlaneMaps
(
    __global float * map_x,
    __global float * map_y,
    __constant float * KRT,
    int tl_u,
    int tl_v,
    int cols,
    int rows,
    int step_x,
    int step_y,
    float scale
)
{
    // Builds per-pixel remap tables for a planar warp: each work-item
    // computes the source coordinate of one destination pixel of the
    // (cols x rows) tile whose top-left corner is (tl_u, tl_v).
    int du = get_global_id(0);
    int dv = get_global_id(1);
    // step_x/step_y arrive in bytes; convert to float-element strides.
    step_x /= sizeof(float);
    step_y /= sizeof(float);
    // KRT packs K*R^-1 (first 9 floats) followed by the translation (next 3).
    __constant float * ck_rinv = KRT;
    __constant float * ct = KRT + 9;
    if (du < cols && dv < rows)
    {
        float u = tl_u + du;
        float v = tl_v + dv;
        float x, y;
        // Back-project the scaled image point, shifted by the translation.
        float x_ = u / scale - ct[0];
        float y_ = v / scale - ct[1];
        float z;
        x = ck_rinv[0] * x_ + ck_rinv[1] * y_ + ck_rinv[2] * (1 - ct[2]);
        y = ck_rinv[3] * x_ + ck_rinv[4] * y_ + ck_rinv[5] * (1 - ct[2]);
        z = ck_rinv[6] * x_ + ck_rinv[7] * y_ + ck_rinv[8] * (1 - ct[2]);
        // Perspective divide back to image coordinates.
        x /= z;
        y /= z;
        map_x[dv * step_x + du] = x;
        map_y[dv * step_y + du] = y;
    }
}
__kernel
void buildWarpCylindricalMaps
(
    __global float * map_x,
    __global float * map_y,
    __constant float * ck_rinv,
    int tl_u,
    int tl_v,
    int cols,
    int rows,
    int step_x,
    int step_y,
    float scale
)
{
    // Builds per-pixel remap tables for a cylindrical warp using
    // ck_rinv = K * R^-1; one work-item per destination pixel.
    int du = get_global_id(0);
    int dv = get_global_id(1);
    // step_x/step_y arrive in bytes; convert to float-element strides.
    step_x /= sizeof(float);
    step_y /= sizeof(float);
    if (du < cols && dv < rows)
    {
        float u = tl_u + du;
        float v = tl_v + dv;
        float x, y;
        u /= scale;
        // Point on the unit cylinder for angle u and height v/scale.
        float x_ = sin(u);
        float y_ = v / scale;
        float z_ = cos(u);
        float z;
        // Rotate into the source camera frame.
        x = ck_rinv[0] * x_ + ck_rinv[1] * y_ + ck_rinv[2] * z_;
        y = ck_rinv[3] * x_ + ck_rinv[4] * y_ + ck_rinv[5] * z_;
        z = ck_rinv[6] * x_ + ck_rinv[7] * y_ + ck_rinv[8] * z_;
        // Points behind the camera are marked invalid with (-1, -1).
        if (z > 0) { x /= z; y /= z; }
        else x = y = -1;
        map_x[dv * step_x + du] = x;
        map_y[dv * step_y + du] = y;
    }
}
__kernel
void buildWarpSphericalMaps
(
    __global float * map_x,
    __global float * map_y,
    __constant float * ck_rinv,
    int tl_u,
    int tl_v,
    int cols,
    int rows,
    int step_x,
    int step_y,
    float scale
)
{
    // Builds per-pixel remap tables for a spherical warp using
    // ck_rinv = K * R^-1; one work-item per destination pixel.
    int du = get_global_id(0);
    int dv = get_global_id(1);
    // step_x/step_y arrive in bytes; convert to float-element strides.
    step_x /= sizeof(float);
    step_y /= sizeof(float);
    if (du < cols && dv < rows)
    {
        float u = tl_u + du;
        float v = tl_v + dv;
        float x, y;
        v /= scale;
        u /= scale;
        // Point on the unit sphere for spherical angles (u, v).
        float sinv = sin(v);
        float x_ = sinv * sin(u);
        float y_ = - cos(v);
        float z_ = sinv * cos(u);
        float z;
        // Rotate into the source camera frame.
        x = ck_rinv[0] * x_ + ck_rinv[1] * y_ + ck_rinv[2] * z_;
        y = ck_rinv[3] * x_ + ck_rinv[4] * y_ + ck_rinv[5] * z_;
        z = ck_rinv[6] * x_ + ck_rinv[7] * y_ + ck_rinv[8] * z_;
        // Points behind the camera are marked invalid with (-1, -1).
        if (z > 0) { x /= z; y /= z; }
        else x = y = -1;
        map_x[dv * step_x + du] = x;
        map_y[dv * step_y + du] = y;
    }
}
__kernel
void buildWarpAffineMaps
(
    __global float * xmap,
    __global float * ymap,
    __constant float * c_warpMat,
    int cols,
    int rows,
    int step_x,
    int step_y
)
{
    // One work-item per destination pixel; applies the inverse 2x3 affine
    // matrix c_warpMat = [a0 a1 a2; a3 a4 a5] and stores the source
    // coordinate into the remap tables xmap / ymap.
    int x = get_global_id(0);
    int y = get_global_id(1);

    // Byte steps -> float-element strides.
    step_x /= sizeof(float);
    step_y /= sizeof(float);

    if (x < cols && y < rows)
    {
        const float xcoo = c_warpMat[0] * x + c_warpMat[1] * y + c_warpMat[2];
        const float ycoo = c_warpMat[3] * x + c_warpMat[4] * y + c_warpMat[5];

        // BUG FIX: the stores previously targeted map_x/map_y, which are
        // not declared in this kernel; the output parameters are xmap/ymap.
        xmap[y * step_x + x] = xcoo;
        ymap[y * step_y + x] = ycoo;
    }
}
__kernel
void buildWarpPerspectiveMaps
(
    __global float * xmap,
    __global float * ymap,
    __constant float * c_warpMat,
    int cols,
    int rows,
    int step_x,
    int step_y
)
{
    // One work-item per destination pixel; applies the inverse 3x3
    // perspective matrix c_warpMat (row-major) with a homogeneous divide
    // and stores the source coordinate into the remap tables xmap / ymap.
    int x = get_global_id(0);
    int y = get_global_id(1);

    // Byte steps -> float-element strides.
    step_x /= sizeof(float);
    step_y /= sizeof(float);

    if (x < cols && y < rows)
    {
        // NOTE(review): no guard against a zero denominator, matching the
        // original; callers are expected to pass a non-degenerate matrix.
        const float coeff = 1.0f / (c_warpMat[6] * x + c_warpMat[7] * y + c_warpMat[8]);
        const float xcoo = coeff * (c_warpMat[0] * x + c_warpMat[1] * y + c_warpMat[2]);
        const float ycoo = coeff * (c_warpMat[3] * x + c_warpMat[4] * y + c_warpMat[5]);

        // BUG FIX: the stores previously targeted map_x/map_y, which are
        // not declared in this kernel; the output parameters are xmap/ymap.
        xmap[y * step_x + x] = xcoo;
        ymap[y * step_y + x] = ycoo;
    }
}

View File

@@ -36,106 +36,106 @@
#if defined (DOUBLE_SUPPORT)
#pragma OPENCL EXTENSION cl_khr_fp64:enable
#endif
__kernel void convertC3C4(__global const GENTYPE4 * restrict src, __global GENTYPE4 *dst, int cols, int rows,
int dstStep_in_piexl,int pixel_end)
__kernel void convertC3C4(__global const GENTYPE4 * restrict src, __global GENTYPE4 *dst, int cols, int rows,
int dstStep_in_piexl,int pixel_end)
{
int id = get_global_id(0);
//int pixel_end = mul24(cols -1 , rows -1);
int3 pixelid = (int3)(mul24(id,3),mad24(id,3,1),mad24(id,3,2));
pixelid = clamp(pixelid,0,pixel_end);
GENTYPE4 pixel0, pixel1, pixel2, outpix0,outpix1,outpix2,outpix3;
pixel0 = src[pixelid.x];
pixel1 = src[pixelid.y];
pixel2 = src[pixelid.z];
int id = get_global_id(0);
//int pixel_end = mul24(cols -1 , rows -1);
int3 pixelid = (int3)(mul24(id,3),mad24(id,3,1),mad24(id,3,2));
pixelid = clamp(pixelid,0,pixel_end);
GENTYPE4 pixel0, pixel1, pixel2, outpix0,outpix1,outpix2,outpix3;
pixel0 = src[pixelid.x];
pixel1 = src[pixelid.y];
pixel2 = src[pixelid.z];
outpix0 = (GENTYPE4)(pixel0.x,pixel0.y,pixel0.z,0);
outpix1 = (GENTYPE4)(pixel0.w,pixel1.x,pixel1.y,0);
outpix2 = (GENTYPE4)(pixel1.z,pixel1.w,pixel2.x,0);
outpix3 = (GENTYPE4)(pixel2.y,pixel2.z,pixel2.w,0);
outpix0 = (GENTYPE4)(pixel0.x,pixel0.y,pixel0.z,0);
outpix1 = (GENTYPE4)(pixel0.w,pixel1.x,pixel1.y,0);
outpix2 = (GENTYPE4)(pixel1.z,pixel1.w,pixel2.x,0);
outpix3 = (GENTYPE4)(pixel2.y,pixel2.z,pixel2.w,0);
int4 outy = (id<<2)/cols;
int4 outx = (id<<2)%cols;
outx.y++;
outx.z+=2;
outx.w+=3;
outy = select(outy,outy+1,outx>=cols);
outx = select(outx,outx-cols,outx>=cols);
//outpix3 = select(outpix3, outpix0, (uchar4)(outy.w>=rows));
//outpix2 = select(outpix2, outpix0, (uchar4)(outy.z>=rows));
//outpix1 = select(outpix1, outpix0, (uchar4)(outy.y>=rows));
//outx = select(outx,(int4)outx.x,outy>=rows);
//outy = select(outy,(int4)outy.x,outy>=rows);
int4 addr = mad24(outy,(int4)dstStep_in_piexl,outx);
if(outx.w<cols && outy.w<rows)
{
dst[addr.x] = outpix0;
dst[addr.y] = outpix1;
dst[addr.z] = outpix2;
dst[addr.w] = outpix3;
}
else if(outx.z<cols && outy.z<rows)
{
dst[addr.x] = outpix0;
dst[addr.y] = outpix1;
dst[addr.z] = outpix2;
}
else if(outx.y<cols && outy.y<rows)
{
dst[addr.x] = outpix0;
dst[addr.y] = outpix1;
}
else if(outx.x<cols && outy.x<rows)
{
dst[addr.x] = outpix0;
}
int4 outy = (id<<2)/cols;
int4 outx = (id<<2)%cols;
outx.y++;
outx.z+=2;
outx.w+=3;
outy = select(outy,outy+1,outx>=cols);
outx = select(outx,outx-cols,outx>=cols);
//outpix3 = select(outpix3, outpix0, (uchar4)(outy.w>=rows));
//outpix2 = select(outpix2, outpix0, (uchar4)(outy.z>=rows));
//outpix1 = select(outpix1, outpix0, (uchar4)(outy.y>=rows));
//outx = select(outx,(int4)outx.x,outy>=rows);
//outy = select(outy,(int4)outy.x,outy>=rows);
int4 addr = mad24(outy,(int4)dstStep_in_piexl,outx);
if(outx.w<cols && outy.w<rows)
{
dst[addr.x] = outpix0;
dst[addr.y] = outpix1;
dst[addr.z] = outpix2;
dst[addr.w] = outpix3;
}
else if(outx.z<cols && outy.z<rows)
{
dst[addr.x] = outpix0;
dst[addr.y] = outpix1;
dst[addr.z] = outpix2;
}
else if(outx.y<cols && outy.y<rows)
{
dst[addr.x] = outpix0;
dst[addr.y] = outpix1;
}
else if(outx.x<cols && outy.x<rows)
{
dst[addr.x] = outpix0;
}
}
__kernel void convertC4C3(__global const GENTYPE4 * restrict src, __global GENTYPE4 *dst, int cols, int rows,
int srcStep_in_pixel,int pixel_end)
__kernel void convertC4C3(__global const GENTYPE4 * restrict src, __global GENTYPE4 *dst, int cols, int rows,
int srcStep_in_pixel,int pixel_end)
{
int id = get_global_id(0)<<2;
int y = id / cols;
int x = id % cols;
int4 x4 = (int4)(x,x+1,x+2,x+3);
int4 y4 = select((int4)y,(int4)(y+1),x4>=(int4)cols);
y4=clamp(y4,(int4)0,(int4)(rows-1));
x4 = select(x4,x4-(int4)cols,x4>=(int4)cols);
int4 addr = mad24(y4,(int4)srcStep_in_pixel,x4);
GENTYPE4 pixel0,pixel1,pixel2,pixel3, outpixel1, outpixel2;
pixel0 = src[addr.x];
pixel1 = src[addr.y];
pixel2 = src[addr.z];
pixel3 = src[addr.w];
int id = get_global_id(0)<<2;
int y = id / cols;
int x = id % cols;
int4 x4 = (int4)(x,x+1,x+2,x+3);
int4 y4 = select((int4)y,(int4)(y+1),x4>=(int4)cols);
y4=clamp(y4,(int4)0,(int4)(rows-1));
x4 = select(x4,x4-(int4)cols,x4>=(int4)cols);
int4 addr = mad24(y4,(int4)srcStep_in_pixel,x4);
GENTYPE4 pixel0,pixel1,pixel2,pixel3, outpixel1, outpixel2;
pixel0 = src[addr.x];
pixel1 = src[addr.y];
pixel2 = src[addr.z];
pixel3 = src[addr.w];
pixel0.w = pixel1.x;
outpixel1.x = pixel1.y;
outpixel1.y = pixel1.z;
outpixel1.z = pixel2.x;
outpixel1.w = pixel2.y;
outpixel2.x = pixel2.z;
outpixel2.y = pixel3.x;
outpixel2.z = pixel3.y;
outpixel2.w = pixel3.z;
int4 outaddr = mul24(id>>2 , 3);
outaddr.y++;
outaddr.z+=2;
if(outaddr.z <= pixel_end)
{
dst[outaddr.x] = pixel0;
dst[outaddr.y] = outpixel1;
dst[outaddr.z] = outpixel2;
}
else if(outaddr.y <= pixel_end)
{
dst[outaddr.x] = pixel0;
dst[outaddr.y] = outpixel1;
}
else if(outaddr.x <= pixel_end)
{
dst[outaddr.x] = pixel0;
}
pixel0.w = pixel1.x;
outpixel1.x = pixel1.y;
outpixel1.y = pixel1.z;
outpixel1.z = pixel2.x;
outpixel1.w = pixel2.y;
outpixel2.x = pixel2.z;
outpixel2.y = pixel3.x;
outpixel2.z = pixel3.y;
outpixel2.w = pixel3.z;
int4 outaddr = mul24(id>>2 , 3);
outaddr.y++;
outaddr.z+=2;
if(outaddr.z <= pixel_end)
{
dst[outaddr.x] = pixel0;
dst[outaddr.y] = outpixel1;
dst[outaddr.z] = outpixel2;
}
else if(outaddr.y <= pixel_end)
{
dst[outaddr.x] = pixel0;
dst[outaddr.y] = outpixel1;
}
else if(outaddr.x <= pixel_end)
{
dst[outaddr.x] = pixel0;
}
}

View File

@@ -78,4 +78,4 @@ __kernel void RGB2Gray(int cols,int rows,int src_step,int dst_step,int channels,
int dst_idx = y * dst_step + x * sizeof(DATA_TYPE);
dst[dst_idx] = (DATA_TYPE)CV_DESCALE((src[src_idx + bidx] * B2Y + src[src_idx + 1] * G2Y + src[src_idx + (bidx^2)] * R2Y), yuv_shift);
}
}
}

View File

@@ -83,7 +83,7 @@ Now(6/29/2011) the kernels only support 8U data type and the anchor of the convo
kernel must be in the center. ROI is not supported either.
Each kernels read 4 elements(not 4 pixels), save them to LDS and read the data needed
from LDS to calculate the result.
The length of the convolve kernel supported is only related to the MAX size of LDS,
The length of the convolve kernel supported is only related to the MAX size of LDS,
which is HW related.
Niko
6/29/2011
@@ -92,56 +92,56 @@ The info above maybe obsolete.
__kernel __attribute__((reqd_work_group_size(LSIZE0,LSIZE1,1))) void col_filter
(__global const GENTYPE_SRC * restrict src,
__global GENTYPE_DST * dst,
(__global const GENTYPE_SRC * restrict src,
__global GENTYPE_DST * dst,
const int dst_cols,
const int dst_rows,
const int src_whole_cols,
const int src_whole_rows,
const int src_step_in_pixel,
//const int src_offset_x,
//const int src_offset_y,
const int dst_rows,
const int src_whole_cols,
const int src_whole_rows,
const int src_step_in_pixel,
//const int src_offset_x,
//const int src_offset_y,
const int dst_step_in_pixel,
const int dst_offset_in_pixel,
__constant float * mat_kernel __attribute__((max_constant_size(4*(2*RADIUSY+1)))))
{
int x = get_global_id(0);
int y = get_global_id(1);
int l_x = get_local_id(0);
int l_y = get_local_id(1);
int start_addr = mad24(y,src_step_in_pixel,x);
int end_addr = mad24(src_whole_rows - 1,src_step_in_pixel,src_whole_cols);
int i;
GENTYPE_SRC sum;
GENTYPE_SRC temp[READ_TIMES_COL];
int x = get_global_id(0);
int y = get_global_id(1);
int l_x = get_local_id(0);
int l_y = get_local_id(1);
int start_addr = mad24(y,src_step_in_pixel,x);
int end_addr = mad24(src_whole_rows - 1,src_step_in_pixel,src_whole_cols);
int i;
GENTYPE_SRC sum;
GENTYPE_SRC temp[READ_TIMES_COL];
__local GENTYPE_SRC LDS_DAT[LSIZE1*READ_TIMES_COL][LSIZE0+1];
__local GENTYPE_SRC LDS_DAT[LSIZE1*READ_TIMES_COL][LSIZE0+1];
//read pixels from src
for(i = 0;i<READ_TIMES_COL;i++)
{
int current_addr = start_addr+i*LSIZE1*src_step_in_pixel;
current_addr = current_addr < end_addr ? current_addr : 0;
temp[i] = src[current_addr];
}
//save pixels to lds
for(i = 0;i<READ_TIMES_COL;i++)
{
LDS_DAT[l_y+i*LSIZE1][l_x] = temp[i];
}
barrier(CLK_LOCAL_MEM_FENCE);
//read pixels from lds and calculate the result
sum = LDS_DAT[l_y+RADIUSY][l_x]*mat_kernel[RADIUSY];
for(i=1;i<=RADIUSY;i++)
{
temp[0]=LDS_DAT[l_y+RADIUSY-i][l_x];
temp[1]=LDS_DAT[l_y+RADIUSY+i][l_x];
sum += temp[0] * mat_kernel[RADIUSY-i]+temp[1] * mat_kernel[RADIUSY+i];
}
//write the result to dst
if((x<dst_cols) & (y<dst_rows))
{
start_addr = mad24(y,dst_step_in_pixel,x+dst_offset_in_pixel);
dst[start_addr] = convert_to_DST(sum);
}
//read pixels from src
for(i = 0;i<READ_TIMES_COL;i++)
{
int current_addr = start_addr+i*LSIZE1*src_step_in_pixel;
current_addr = current_addr < end_addr ? current_addr : 0;
temp[i] = src[current_addr];
}
//save pixels to lds
for(i = 0;i<READ_TIMES_COL;i++)
{
LDS_DAT[l_y+i*LSIZE1][l_x] = temp[i];
}
barrier(CLK_LOCAL_MEM_FENCE);
//read pixels from lds and calculate the result
sum = LDS_DAT[l_y+RADIUSY][l_x]*mat_kernel[RADIUSY];
for(i=1;i<=RADIUSY;i++)
{
temp[0]=LDS_DAT[l_y+RADIUSY-i][l_x];
temp[1]=LDS_DAT[l_y+RADIUSY+i][l_x];
sum += temp[0] * mat_kernel[RADIUSY-i]+temp[1] * mat_kernel[RADIUSY+i];
}
//write the result to dst
if((x<dst_cols) & (y<dst_rows))
{
start_addr = mad24(y,dst_step_in_pixel,x+dst_offset_in_pixel);
dst[start_addr] = convert_to_DST(sum);
}
}

View File

@@ -83,7 +83,7 @@ These kernels are written for separable filters such as Sobel, Scharr, GaussianB
Now(6/29/2011) the kernels only support 8U data type and the anchor of the convovle
kernel must be in the center. ROI is not supported either.
For channels =1,2,4, each kernels read 4 elements(not 4 pixels), and for channels =3,
the kernel read 4 pixels, save them to LDS and read the data needed from LDS to
the kernel read 4 pixels, save them to LDS and read the data needed from LDS to
calculate the result.
The length of the convovle kernel supported is related to the LSIZE0 and the MAX size
of LDS, which is HW related.
@@ -96,375 +96,375 @@ The info above maybe obsolete.
***********************************************************************************/
__kernel __attribute__((reqd_work_group_size(LSIZE0,LSIZE1,1))) void row_filter_C1_D0
(__global const uchar * restrict src,
__global float * dst,
(__global const uchar * restrict src,
__global float * dst,
const int dst_cols,
const int dst_rows,
const int src_whole_cols,
const int src_whole_rows,
const int src_step_in_pixel,
const int src_offset_x,
const int src_offset_y,
const int dst_rows,
const int src_whole_cols,
const int src_whole_rows,
const int src_step_in_pixel,
const int src_offset_x,
const int src_offset_y,
const int dst_step_in_pixel,
const int radiusy,
__constant float * mat_kernel __attribute__((max_constant_size(4*(2*RADIUSX+1)))))
{
int x = get_global_id(0)<<2;
int y = get_global_id(1);
int l_x = get_local_id(0);
int l_y = get_local_id(1);
int start_x = x+src_offset_x-RADIUSX & 0xfffffffc;
int offset = src_offset_x-RADIUSX & 3;
int start_y = y+src_offset_y-radiusy;
int start_addr = mad24(start_y,src_step_in_pixel,start_x);
int i;
float4 sum;
uchar4 temp[READ_TIMES_ROW];
int x = get_global_id(0)<<2;
int y = get_global_id(1);
int l_x = get_local_id(0);
int l_y = get_local_id(1);
int start_x = x+src_offset_x-RADIUSX & 0xfffffffc;
int offset = src_offset_x-RADIUSX & 3;
int start_y = y+src_offset_y-radiusy;
int start_addr = mad24(start_y,src_step_in_pixel,start_x);
int i;
float4 sum;
uchar4 temp[READ_TIMES_ROW];
__local uchar4 LDS_DAT[LSIZE1][READ_TIMES_ROW*LSIZE0+1];
#ifdef BORDER_CONSTANT
int end_addr = mad24(src_whole_rows - 1,src_step_in_pixel,src_whole_cols);
//read pixels from src
for(i = 0;i<READ_TIMES_ROW;i++)
{
int current_addr = start_addr+i*LSIZE0*4;
current_addr = ((current_addr < end_addr) && (current_addr > 0)) ? current_addr : 0;
temp[i] = *(__global uchar4*)&src[current_addr];
}
//judge if read out of boundary
for(i = 0;i<READ_TIMES_ROW;i++)
{
temp[i].x= ELEM(start_x+i*LSIZE0*4,0,src_whole_cols,0,temp[i].x);
temp[i].y= ELEM(start_x+i*LSIZE0*4+1,0,src_whole_cols,0,temp[i].y);
temp[i].z= ELEM(start_x+i*LSIZE0*4+2,0,src_whole_cols,0,temp[i].z);
temp[i].w= ELEM(start_x+i*LSIZE0*4+3,0,src_whole_cols,0,temp[i].w);
temp[i]= ELEM(start_y,0,src_whole_rows,(uchar4)0,temp[i]);
}
#else
int not_all_in_range = (start_x<0) | (start_x + READ_TIMES_ROW*LSIZE0*4+4>src_whole_cols)| (start_y<0) | (start_y >= src_whole_rows);
int4 index[READ_TIMES_ROW];
int4 addr;
int s_y;
if(not_all_in_range)
{
//judge if read out of boundary
for(i = 0;i<READ_TIMES_ROW;i++)
{
index[i].x= ADDR_L(start_x+i*LSIZE0*4,0,src_whole_cols,start_x+i*LSIZE0*4);
index[i].x= ADDR_R(start_x+i*LSIZE0*4,src_whole_cols,index[i].x);
index[i].y= ADDR_L(start_x+i*LSIZE0*4+1,0,src_whole_cols,start_x+i*LSIZE0*4+1);
index[i].y= ADDR_R(start_x+i*LSIZE0*4+1,src_whole_cols,index[i].y);
index[i].z= ADDR_L(start_x+i*LSIZE0*4+2,0,src_whole_cols,start_x+i*LSIZE0*4+2);
index[i].z= ADDR_R(start_x+i*LSIZE0*4+2,src_whole_cols,index[i].z);
index[i].w= ADDR_L(start_x+i*LSIZE0*4+3,0,src_whole_cols,start_x+i*LSIZE0*4+3);
index[i].w= ADDR_R(start_x+i*LSIZE0*4+3,src_whole_cols,index[i].w);
}
s_y= ADDR_L(start_y,0,src_whole_rows,start_y);
s_y= ADDR_R(start_y,src_whole_rows,s_y);
//read pixels from src
for(i = 0;i<READ_TIMES_ROW;i++)
{
addr = mad24((int4)s_y,(int4)src_step_in_pixel,index[i]);
temp[i].x = src[addr.x];
temp[i].y = src[addr.y];
temp[i].z = src[addr.z];
temp[i].w = src[addr.w];
}
}
else
{
//read pixels from src
for(i = 0;i<READ_TIMES_ROW;i++)
{
temp[i] = *(__global uchar4*)&src[start_addr+i*LSIZE0*4];
}
}
#endif
__local uchar4 LDS_DAT[LSIZE1][READ_TIMES_ROW*LSIZE0+1];
#ifdef BORDER_CONSTANT
int end_addr = mad24(src_whole_rows - 1,src_step_in_pixel,src_whole_cols);
//read pixels from src
for(i = 0;i<READ_TIMES_ROW;i++)
{
int current_addr = start_addr+i*LSIZE0*4;
current_addr = ((current_addr < end_addr) && (current_addr > 0)) ? current_addr : 0;
temp[i] = *(__global uchar4*)&src[current_addr];
}
//judge if read out of boundary
for(i = 0;i<READ_TIMES_ROW;i++)
{
temp[i].x= ELEM(start_x+i*LSIZE0*4,0,src_whole_cols,0,temp[i].x);
temp[i].y= ELEM(start_x+i*LSIZE0*4+1,0,src_whole_cols,0,temp[i].y);
temp[i].z= ELEM(start_x+i*LSIZE0*4+2,0,src_whole_cols,0,temp[i].z);
temp[i].w= ELEM(start_x+i*LSIZE0*4+3,0,src_whole_cols,0,temp[i].w);
temp[i]= ELEM(start_y,0,src_whole_rows,(uchar4)0,temp[i]);
}
#else
int not_all_in_range = (start_x<0) | (start_x + READ_TIMES_ROW*LSIZE0*4+4>src_whole_cols)| (start_y<0) | (start_y >= src_whole_rows);
int4 index[READ_TIMES_ROW];
int4 addr;
int s_y;
if(not_all_in_range)
{
//judge if read out of boundary
for(i = 0;i<READ_TIMES_ROW;i++)
{
index[i].x= ADDR_L(start_x+i*LSIZE0*4,0,src_whole_cols,start_x+i*LSIZE0*4);
index[i].x= ADDR_R(start_x+i*LSIZE0*4,src_whole_cols,index[i].x);
index[i].y= ADDR_L(start_x+i*LSIZE0*4+1,0,src_whole_cols,start_x+i*LSIZE0*4+1);
index[i].y= ADDR_R(start_x+i*LSIZE0*4+1,src_whole_cols,index[i].y);
index[i].z= ADDR_L(start_x+i*LSIZE0*4+2,0,src_whole_cols,start_x+i*LSIZE0*4+2);
index[i].z= ADDR_R(start_x+i*LSIZE0*4+2,src_whole_cols,index[i].z);
index[i].w= ADDR_L(start_x+i*LSIZE0*4+3,0,src_whole_cols,start_x+i*LSIZE0*4+3);
index[i].w= ADDR_R(start_x+i*LSIZE0*4+3,src_whole_cols,index[i].w);
}
s_y= ADDR_L(start_y,0,src_whole_rows,start_y);
s_y= ADDR_R(start_y,src_whole_rows,s_y);
//read pixels from src
for(i = 0;i<READ_TIMES_ROW;i++)
{
addr = mad24((int4)s_y,(int4)src_step_in_pixel,index[i]);
temp[i].x = src[addr.x];
temp[i].y = src[addr.y];
temp[i].z = src[addr.z];
temp[i].w = src[addr.w];
}
}
else
{
//read pixels from src
for(i = 0;i<READ_TIMES_ROW;i++)
{
temp[i] = *(__global uchar4*)&src[start_addr+i*LSIZE0*4];
}
}
#endif
//save pixels to lds
for(i = 0;i<READ_TIMES_ROW;i++)
{
LDS_DAT[l_y][l_x+i*LSIZE0]=temp[i];
}
barrier(CLK_LOCAL_MEM_FENCE);
//save pixels to lds
for(i = 0;i<READ_TIMES_ROW;i++)
{
LDS_DAT[l_y][l_x+i*LSIZE0]=temp[i];
}
barrier(CLK_LOCAL_MEM_FENCE);
//read pixels from lds and calculate the result
sum =convert_float4(vload4(0,(__local uchar*)&LDS_DAT[l_y][l_x]+RADIUSX+offset))*mat_kernel[RADIUSX];
for(i=1;i<=RADIUSX;i++)
{
temp[0]=vload4(0,(__local uchar*)&LDS_DAT[l_y][l_x]+RADIUSX+offset-i);
temp[1]=vload4(0,(__local uchar*)&LDS_DAT[l_y][l_x]+RADIUSX+offset+i);
sum += convert_float4(temp[0])*mat_kernel[RADIUSX-i]+convert_float4(temp[1])*mat_kernel[RADIUSX+i];
}
start_addr = mad24(y,dst_step_in_pixel,x);
//write the result to dst
if((x+3<dst_cols) & (y<dst_rows))
{
*(__global float4*)&dst[start_addr] = sum;
}
else if((x+2<dst_cols) & (y<dst_rows))
{
dst[start_addr] = sum.x;
dst[start_addr+1] = sum.y;
dst[start_addr+2] = sum.z;
}
else if((x+1<dst_cols) & (y<dst_rows))
{
dst[start_addr] = sum.x;
dst[start_addr+1] = sum.y;
}
else if((x<dst_cols) & (y<dst_rows))
{
dst[start_addr] = sum.x;
}
//read pixels from lds and calculate the result
sum =convert_float4(vload4(0,(__local uchar*)&LDS_DAT[l_y][l_x]+RADIUSX+offset))*mat_kernel[RADIUSX];
for(i=1;i<=RADIUSX;i++)
{
temp[0]=vload4(0,(__local uchar*)&LDS_DAT[l_y][l_x]+RADIUSX+offset-i);
temp[1]=vload4(0,(__local uchar*)&LDS_DAT[l_y][l_x]+RADIUSX+offset+i);
sum += convert_float4(temp[0])*mat_kernel[RADIUSX-i]+convert_float4(temp[1])*mat_kernel[RADIUSX+i];
}
start_addr = mad24(y,dst_step_in_pixel,x);
//write the result to dst
if((x+3<dst_cols) & (y<dst_rows))
{
*(__global float4*)&dst[start_addr] = sum;
}
else if((x+2<dst_cols) & (y<dst_rows))
{
dst[start_addr] = sum.x;
dst[start_addr+1] = sum.y;
dst[start_addr+2] = sum.z;
}
else if((x+1<dst_cols) & (y<dst_rows))
{
dst[start_addr] = sum.x;
dst[start_addr+1] = sum.y;
}
else if((x<dst_cols) & (y<dst_rows))
{
dst[start_addr] = sum.x;
}
}
__kernel __attribute__((reqd_work_group_size(LSIZE0,LSIZE1,1))) void row_filter_C4_D0
(__global const uchar4 * restrict src,
__global float4 * dst,
(__global const uchar4 * restrict src,
__global float4 * dst,
const int dst_cols,
const int dst_rows,
const int src_whole_cols,
const int src_whole_rows,
const int src_step_in_pixel,
const int src_offset_x,
const int src_offset_y,
const int dst_rows,
const int src_whole_cols,
const int src_whole_rows,
const int src_step_in_pixel,
const int src_offset_x,
const int src_offset_y,
const int dst_step_in_pixel,
const int radiusy,
__constant float * mat_kernel __attribute__((max_constant_size(4*(2*RADIUSX+1)))))
{
int x = get_global_id(0);
int y = get_global_id(1);
int l_x = get_local_id(0);
int l_y = get_local_id(1);
int start_x = x+src_offset_x-RADIUSX;
int start_y = y+src_offset_y-radiusy;
int start_addr = mad24(start_y,src_step_in_pixel,start_x);
int i;
float4 sum;
uchar4 temp[READ_TIMES_ROW];
int x = get_global_id(0);
int y = get_global_id(1);
int l_x = get_local_id(0);
int l_y = get_local_id(1);
int start_x = x+src_offset_x-RADIUSX;
int start_y = y+src_offset_y-radiusy;
int start_addr = mad24(start_y,src_step_in_pixel,start_x);
int i;
float4 sum;
uchar4 temp[READ_TIMES_ROW];
__local uchar4 LDS_DAT[LSIZE1][READ_TIMES_ROW*LSIZE0+1];
#ifdef BORDER_CONSTANT
int end_addr = mad24(src_whole_rows - 1,src_step_in_pixel,src_whole_cols);
//read pixels from src
for(i = 0;i<READ_TIMES_ROW;i++)
{
int current_addr = start_addr+i*LSIZE0;
current_addr = ((current_addr < end_addr) && (current_addr > 0)) ? current_addr : 0;
temp[i] = src[current_addr];
}
//judge if read out of boundary
for(i = 0;i<READ_TIMES_ROW;i++)
{
temp[i]= ELEM(start_x+i*LSIZE0,0,src_whole_cols,(uchar4)0,temp[i]);
temp[i]= ELEM(start_y,0,src_whole_rows,(uchar4)0,temp[i]);
}
#else
int index[READ_TIMES_ROW];
int s_x,s_y;
//judge if read out of boundary
for(i = 0;i<READ_TIMES_ROW;i++)
{
s_x= ADDR_L(start_x+i*LSIZE0,0,src_whole_cols,start_x+i*LSIZE0);
s_x= ADDR_R(start_x+i*LSIZE0,src_whole_cols,s_x);
s_y= ADDR_L(start_y,0,src_whole_rows,start_y);
s_y= ADDR_R(start_y,src_whole_rows,s_y);
index[i]=mad24(s_y,src_step_in_pixel,s_x);
}
//read pixels from src
for(i = 0;i<READ_TIMES_ROW;i++)
{
temp[i] = src[index[i]];
}
#endif
__local uchar4 LDS_DAT[LSIZE1][READ_TIMES_ROW*LSIZE0+1];
#ifdef BORDER_CONSTANT
int end_addr = mad24(src_whole_rows - 1,src_step_in_pixel,src_whole_cols);
//read pixels from src
for(i = 0;i<READ_TIMES_ROW;i++)
{
int current_addr = start_addr+i*LSIZE0;
current_addr = ((current_addr < end_addr) && (current_addr > 0)) ? current_addr : 0;
temp[i] = src[current_addr];
}
//judge if read out of boundary
for(i = 0;i<READ_TIMES_ROW;i++)
{
temp[i]= ELEM(start_x+i*LSIZE0,0,src_whole_cols,(uchar4)0,temp[i]);
temp[i]= ELEM(start_y,0,src_whole_rows,(uchar4)0,temp[i]);
}
#else
int index[READ_TIMES_ROW];
int s_x,s_y;
//judge if read out of boundary
for(i = 0;i<READ_TIMES_ROW;i++)
{
s_x= ADDR_L(start_x+i*LSIZE0,0,src_whole_cols,start_x+i*LSIZE0);
s_x= ADDR_R(start_x+i*LSIZE0,src_whole_cols,s_x);
s_y= ADDR_L(start_y,0,src_whole_rows,start_y);
s_y= ADDR_R(start_y,src_whole_rows,s_y);
index[i]=mad24(s_y,src_step_in_pixel,s_x);
}
//read pixels from src
for(i = 0;i<READ_TIMES_ROW;i++)
{
temp[i] = src[index[i]];
}
#endif
//save pixels to lds
for(i = 0;i<READ_TIMES_ROW;i++)
{
LDS_DAT[l_y][l_x+i*LSIZE0]=temp[i];
}
barrier(CLK_LOCAL_MEM_FENCE);
//save pixels to lds
for(i = 0;i<READ_TIMES_ROW;i++)
{
LDS_DAT[l_y][l_x+i*LSIZE0]=temp[i];
}
barrier(CLK_LOCAL_MEM_FENCE);
//read pixels from lds and calculate the result
sum =convert_float4(LDS_DAT[l_y][l_x+RADIUSX])*mat_kernel[RADIUSX];
for(i=1;i<=RADIUSX;i++)
{
temp[0]=LDS_DAT[l_y][l_x+RADIUSX-i];
temp[1]=LDS_DAT[l_y][l_x+RADIUSX+i];
sum += convert_float4(temp[0])*mat_kernel[RADIUSX-i]+convert_float4(temp[1])*mat_kernel[RADIUSX+i];
}
//write the result to dst
if((x<dst_cols) & (y<dst_rows))
{
start_addr = mad24(y,dst_step_in_pixel,x);
dst[start_addr] = sum;
}
//read pixels from lds and calculate the result
sum =convert_float4(LDS_DAT[l_y][l_x+RADIUSX])*mat_kernel[RADIUSX];
for(i=1;i<=RADIUSX;i++)
{
temp[0]=LDS_DAT[l_y][l_x+RADIUSX-i];
temp[1]=LDS_DAT[l_y][l_x+RADIUSX+i];
sum += convert_float4(temp[0])*mat_kernel[RADIUSX-i]+convert_float4(temp[1])*mat_kernel[RADIUSX+i];
}
//write the result to dst
if((x<dst_cols) & (y<dst_rows))
{
start_addr = mad24(y,dst_step_in_pixel,x);
dst[start_addr] = sum;
}
}
// Horizontal pass of a separable filter, single-channel float (C1, depth 5).
// Each work-item cooperatively loads READ_TIMES_ROW source pixels into local
// memory, then convolves a 1 x (2*RADIUSX+1) kernel along the row and writes
// one output pixel.  Border handling is selected at compile time:
// BORDER_CONSTANT zero-fills out-of-range reads via ELEM; otherwise the
// ADDR_L / ADDR_R macros (defined earlier in this file per border mode)
// remap out-of-range coordinates before the read.
//
// src                : source image, float pixels
// dst                : destination image, float pixels
// dst_cols/dst_rows  : valid output region
// src_whole_cols/rows: full source image extent (for border tests)
// src_step_in_pixel  : source row stride, in pixels (not bytes)
// src_offset_x/y     : ROI origin inside the whole source image
// dst_step_in_pixel  : destination row stride, in pixels
// radiusy            : vertical anchor offset of the ROI read
// mat_kernel         : filter taps, length 2*RADIUSX+1
__kernel __attribute__((reqd_work_group_size(LSIZE0,LSIZE1,1))) void row_filter_C1_D5
(__global const float * restrict src,
 __global float * dst,
 const int dst_cols,
 const int dst_rows,
 const int src_whole_cols,
 const int src_whole_rows,
 const int src_step_in_pixel,
 const int src_offset_x,
 const int src_offset_y,
 const int dst_step_in_pixel,
 const int radiusy,
 __constant float * mat_kernel __attribute__((max_constant_size(4*(2*RADIUSX+1)))))
{
    int x = get_global_id(0);
    int y = get_global_id(1);
    int l_x = get_local_id(0);
    int l_y = get_local_id(1);
    // leftmost source coordinate this work-item's first read covers
    int start_x = x+src_offset_x-RADIUSX;
    int start_y = y+src_offset_y-radiusy;
    int start_addr = mad24(start_y,src_step_in_pixel,start_x);
    int i;
    float sum;
    float temp[READ_TIMES_ROW];

    // +1 column of padding — presumably to avoid local-memory bank
    // conflicts; TODO confirm against the host-side LSIZE choices
    __local float LDS_DAT[LSIZE1][READ_TIMES_ROW*LSIZE0+1];
#ifdef BORDER_CONSTANT
    int end_addr = mad24(src_whole_rows - 1,src_step_in_pixel,src_whole_cols);
    // read pixels from src; clamp the raw address into the buffer so the
    // load itself is always safe (the value is corrected below)
    for(i = 0;i<READ_TIMES_ROW;i++)
    {
        int current_addr = start_addr+i*LSIZE0;
        current_addr = ((current_addr < end_addr) && (current_addr > 0)) ? current_addr : 0;
        temp[i] = src[current_addr];
    }
    // judge if read out of boundary: out-of-range pixels become the
    // constant border value 0
    for(i = 0;i<READ_TIMES_ROW;i++)
    {
        temp[i]= ELEM(start_x+i*LSIZE0,0,src_whole_cols,0,temp[i]);
        temp[i]= ELEM(start_y,0,src_whole_rows,0,temp[i]);
    }
#else
    int index[READ_TIMES_ROW];
    int s_x,s_y;
    // judge if read out of boundary: remap coordinates per border mode,
    // then compute the final linear addresses
    for(i = 0;i<READ_TIMES_ROW;i++)
    {
        s_x= ADDR_L(start_x+i*LSIZE0,0,src_whole_cols,start_x+i*LSIZE0);
        s_x= ADDR_R(start_x+i*LSIZE0,src_whole_cols,s_x);
        s_y= ADDR_L(start_y,0,src_whole_rows,start_y);
        s_y= ADDR_R(start_y,src_whole_rows,s_y);
        index[i]=mad24(s_y,src_step_in_pixel,s_x);
    }
    // read pixels from src
    for(i = 0;i<READ_TIMES_ROW;i++)
    {
        temp[i] = src[index[i]];
    }
#endif
    // save pixels to lds
    for(i = 0;i<READ_TIMES_ROW;i++)
    {
        LDS_DAT[l_y][l_x+i*LSIZE0]=temp[i];
    }
    barrier(CLK_LOCAL_MEM_FENCE);
    // read pixels from lds and accumulate the symmetric-pair convolution
    sum = LDS_DAT[l_y][l_x+RADIUSX]*mat_kernel[RADIUSX];
    for(i=1;i<=RADIUSX;i++)
    {
        temp[0]=LDS_DAT[l_y][l_x+RADIUSX-i];
        temp[1]=LDS_DAT[l_y][l_x+RADIUSX+i];
        sum += temp[0]*mat_kernel[RADIUSX-i]+temp[1]*mat_kernel[RADIUSX+i];
    }
    // write the result to dst (guard against the padded global range)
    if((x<dst_cols) & (y<dst_rows))
    {
        start_addr = mad24(y,dst_step_in_pixel,x);
        dst[start_addr] = sum;
    }
}
// Horizontal pass of a separable filter, four-channel float (C4, depth 5).
// Structure mirrors row_filter_C1_D5 but every pixel is a float4, so each
// tap multiply-accumulates all four channels at once.  Border handling is
// compile-time selected: BORDER_CONSTANT zero-fills out-of-range reads via
// ELEM; otherwise ADDR_L / ADDR_R remap the coordinates before the read.
//
// src                : source image, float4 pixels
// dst                : destination image, float4 pixels
// dst_cols/dst_rows  : valid output region
// src_whole_cols/rows: full source image extent (for border tests)
// src_step_in_pixel  : source row stride, in float4 pixels (not bytes)
// src_offset_x/y     : ROI origin inside the whole source image
// dst_step_in_pixel  : destination row stride, in float4 pixels
// radiusy            : vertical anchor offset of the ROI read
// mat_kernel         : filter taps, length 2*RADIUSX+1 (shared by channels)
__kernel __attribute__((reqd_work_group_size(LSIZE0,LSIZE1,1))) void row_filter_C4_D5
(__global const float4 * restrict src,
 __global float4 * dst,
 const int dst_cols,
 const int dst_rows,
 const int src_whole_cols,
 const int src_whole_rows,
 const int src_step_in_pixel,
 const int src_offset_x,
 const int src_offset_y,
 const int dst_step_in_pixel,
 const int radiusy,
 __constant float * mat_kernel __attribute__((max_constant_size(4*(2*RADIUSX+1)))))
{
    int x = get_global_id(0);
    int y = get_global_id(1);
    int l_x = get_local_id(0);
    int l_y = get_local_id(1);
    // leftmost source coordinate this work-item's first read covers
    int start_x = x+src_offset_x-RADIUSX;
    int start_y = y+src_offset_y-radiusy;
    int start_addr = mad24(start_y,src_step_in_pixel,start_x);
    int i;
    float4 sum;
    float4 temp[READ_TIMES_ROW];

    // +1 column of padding — presumably to avoid local-memory bank
    // conflicts; TODO confirm against the host-side LSIZE choices
    __local float4 LDS_DAT[LSIZE1][READ_TIMES_ROW*LSIZE0+1];
#ifdef BORDER_CONSTANT
    int end_addr = mad24(src_whole_rows - 1,src_step_in_pixel,src_whole_cols);
    // read pixels from src; clamp the raw address into the buffer so the
    // load itself is always safe (the value is corrected below)
    for(i = 0;i<READ_TIMES_ROW;i++)
    {
        int current_addr = start_addr+i*LSIZE0;
        current_addr = ((current_addr < end_addr) && (current_addr > 0)) ? current_addr : 0;
        temp[i] = src[current_addr];
    }
    // judge if read out of boundary: out-of-range pixels become the
    // constant border value 0
    for(i = 0;i<READ_TIMES_ROW;i++)
    {
        temp[i]= ELEM(start_x+i*LSIZE0,0,src_whole_cols,0,temp[i]);
        temp[i]= ELEM(start_y,0,src_whole_rows,0,temp[i]);
    }
#else
    int index[READ_TIMES_ROW];
    int s_x,s_y;
    // judge if read out of boundary: remap coordinates per border mode,
    // then compute the final linear addresses
    for(i = 0;i<READ_TIMES_ROW;i++)
    {
        s_x= ADDR_L(start_x+i*LSIZE0,0,src_whole_cols,start_x+i*LSIZE0);
        s_x= ADDR_R(start_x+i*LSIZE0,src_whole_cols,s_x);
        s_y= ADDR_L(start_y,0,src_whole_rows,start_y);
        s_y= ADDR_R(start_y,src_whole_rows,s_y);
        index[i]=mad24(s_y,src_step_in_pixel,s_x);
    }
    // read pixels from src
    for(i = 0;i<READ_TIMES_ROW;i++)
    {
        temp[i] = src[index[i]];
    }
#endif
    // save pixels to lds
    for(i = 0;i<READ_TIMES_ROW;i++)
    {
        LDS_DAT[l_y][l_x+i*LSIZE0]=temp[i];
    }
    barrier(CLK_LOCAL_MEM_FENCE);
    // read pixels from lds and accumulate the symmetric-pair convolution;
    // temp[0]/temp[1] are reused as scratch for the pair of taps
    sum = LDS_DAT[l_y][l_x+RADIUSX]*mat_kernel[RADIUSX];
    for(i=1;i<=RADIUSX;i++)
    {
        temp[0]=LDS_DAT[l_y][l_x+RADIUSX-i];
        temp[1]=LDS_DAT[l_y][l_x+RADIUSX+i];
        sum += temp[0]*mat_kernel[RADIUSX-i]+temp[1]*mat_kernel[RADIUSX+i];
    }
    // write the result to dst (guard against the padded global range)
    if((x<dst_cols) & (y<dst_rows))
    {
        start_addr = mad24(y,dst_step_in_pixel,x);
        dst[start_addr] = sum;
    }
}

View File

@@ -50,8 +50,8 @@
//BORDER_REPLICATE: aaaaaa|abcdefgh|hhhhhhh
#define ADDR_L(i, l_edge, r_edge) ((i) < (l_edge) ? (l_edge) : (i))
#define ADDR_R(i, r_edge, addr) ((i) >= (r_edge) ? (r_edge)-1 : (addr))
#define ADDR_H(i, t_edge, b_edge) ((i) < (t_edge) ? (t_edge) :(i))
#define ADDR_B(i, b_edge, addr) ((i) >= (b_edge) ? (b_edge)-1 :(addr))
#define ADDR_H(i, t_edge, b_edge) ((i) < (t_edge) ? (t_edge) :(i))
#define ADDR_B(i, b_edge, addr) ((i) >= (b_edge) ? (b_edge)-1 :(addr))
#endif
#ifdef BORDER_REFLECT
@@ -103,12 +103,12 @@ __kernel void boxFilter_C1_D0(__global const uchar * restrict src, __global ucha
int startY = (gY << 1) - anY + src_y_off;
int dst_startX = (gX * (THREADS-ksX+1) * 4) - head_off + dst_x_off;
int dst_startY = (gY << 1) + dst_y_off;
uint4 data[ksY+1];
__local uint4 temp[(THREADS<<1)];
__local uint4 temp[(THREADS<<1)];
#ifdef BORDER_CONSTANT
for(int i=0; i < ksY+1; i++)
{
if(startY+i >=0 && startY+i < src_whole_rows && startX+col*4 >=0 && startX+col*4+3<src_whole_cols)
@@ -126,15 +126,15 @@ __kernel void boxFilter_C1_D0(__global const uchar * restrict src, __global ucha
if(con)data[i].s3 = *(src+(startY+i)*src_step + startX + col*4+3);
}
}
#else
int not_all_in_range;
for(int i=0; i < ksY+1; i++)
{
not_all_in_range = (startX+col*4<0) | (startX+col*4+3>src_whole_cols-1)
not_all_in_range = (startX+col*4<0) | (startX+col*4+3>src_whole_cols-1)
| (startY+i<0) | (startY+i>src_whole_rows-1);
if(not_all_in_range)
{
{
int selected_row;
int4 selected_col;
selected_row = ADDR_H(startY+i, 0, src_whole_rows);
@@ -142,13 +142,13 @@ __kernel void boxFilter_C1_D0(__global const uchar * restrict src, __global ucha
selected_col.x = ADDR_L(startX+col*4, 0, src_whole_cols);
selected_col.x = ADDR_R(startX+col*4, src_whole_cols, selected_col.x);
selected_col.y = ADDR_L(startX+col*4+1, 0, src_whole_cols);
selected_col.y = ADDR_R(startX+col*4+1, src_whole_cols, selected_col.y);
selected_col.z = ADDR_L(startX+col*4+2, 0, src_whole_cols);
selected_col.z = ADDR_R(startX+col*4+2, src_whole_cols, selected_col.z);
selected_col.w = ADDR_L(startX+col*4+3, 0, src_whole_cols);
selected_col.w = ADDR_R(startX+col*4+3, src_whole_cols, selected_col.w);
@@ -174,7 +174,7 @@ __kernel void boxFilter_C1_D0(__global const uchar * restrict src, __global ucha
temp[col] = sum1;
temp[col+THREADS] = sum2;
barrier(CLK_LOCAL_MEM_FENCE);
if(col >= anX && col < (THREADS-ksX+anX+1))
{
int posX = dst_startX - dst_x_off + (col-anX)*4;
@@ -189,7 +189,7 @@ __kernel void boxFilter_C1_D0(__global const uchar * restrict src, __global ucha
{
tmp_sum2 += vload4(col, (__local uint*)(temp+THREADS)+i);
}
if(posY < dst_rows && posX < dst_cols)
{
if(posX >= 0 && posX < dst_cols)
@@ -200,7 +200,7 @@ __kernel void boxFilter_C1_D0(__global const uchar * restrict src, __global ucha
*(dst+dst_startY * dst_step + dst_startX+2 + (col-anX)*4) = tmp_sum1.z/alpha;
if(posX+3 >= 0 && posX+3 < dst_cols)
*(dst+dst_startY * dst_step + dst_startX+3 + (col-anX)*4) = tmp_sum1.w/alpha;
}
}
if(posY+1 < dst_rows && posX < dst_cols)
{
dst_startY+=1;
@@ -212,9 +212,9 @@ __kernel void boxFilter_C1_D0(__global const uchar * restrict src, __global ucha
*(dst+dst_startY * dst_step + dst_startX+2 + (col-anX)*4) = tmp_sum2.z/alpha;
if(posX+3 >= 0 && posX+3 < dst_cols)
*(dst+dst_startY * dst_step + dst_startX+3 + (col-anX)*4) = tmp_sum2.w/alpha;
}
}
}
}
///////////////////////////////////////////////////////////////////////////////////////////////////
@@ -237,12 +237,12 @@ __kernel void boxFilter_C4_D0(__global const uchar4 * restrict src, __global uch
int startX = gX * (THREADS-ksX+1) - anX + src_x_off;
int startY = (gY << 1) - anY + src_y_off;
int dst_startX = gX * (THREADS-ksX+1) + dst_x_off;
int dst_startY = (gY << 1) + dst_y_off;
//int end_addr = (src_whole_rows-1)*(src_step>>2) + src_whole_cols-4;
int dst_startY = (gY << 1) + dst_y_off;
//int end_addr = (src_whole_rows-1)*(src_step>>2) + src_whole_cols-4;
int end_addr = src_whole_cols-4;
int end_addr = src_whole_cols-4;
uint4 data[ksY+1];
__local uint4 temp[2][THREADS];
__local uint4 temp[2][THREADS];
#ifdef BORDER_CONSTANT
bool con;
uint4 ss;
@@ -250,12 +250,12 @@ __kernel void boxFilter_C4_D0(__global const uchar4 * restrict src, __global uch
{
con = startX+col >= 0 && startX+col < src_whole_cols && startY+i >= 0 && startY+i < src_whole_rows;
//int cur_addr = clamp((startY+i)*(src_step>>2)+(startX+col),0,end_addr);
//ss = convert_uint4(src[cur_addr]);
//int cur_addr = clamp((startY+i)*(src_step>>2)+(startX+col),0,end_addr);
//ss = convert_uint4(src[cur_addr]);
int cur_col = clamp(startX + col, 0, src_whole_cols);
if(con)
ss = convert_uint4(src[(startY+i)*(src_step>>2) + cur_col]);
ss = convert_uint4(src[(startY+i)*(src_step>>2) + cur_col]);
data[i] = con ? ss : 0;
}
@@ -269,11 +269,11 @@ __kernel void boxFilter_C4_D0(__global const uchar4 * restrict src, __global uch
selected_col = ADDR_L(startX+col, 0, src_whole_cols);
selected_col = ADDR_R(startX+col, src_whole_cols, selected_col);
data[i] = convert_uint4(src[selected_row * (src_step>>2) + selected_col]);
}
#endif
uint4 sum0 = 0, sum1 = 0, sum2 = 0;
for(int i=1; i < ksY; i++)
@@ -290,7 +290,7 @@ __kernel void boxFilter_C4_D0(__global const uchar4 * restrict src, __global uch
col += anX;
int posX = dst_startX - dst_x_off + col - anX;
int posY = (gY << 1);
uint4 tmp_sum[2]={(uint4)(0,0,0,0),(uint4)(0,0,0,0)};
for(int k=0; k<2; k++)
for(int i=-anX; i<=anX; i++)
@@ -298,11 +298,11 @@ __kernel void boxFilter_C4_D0(__global const uchar4 * restrict src, __global uch
tmp_sum[k] += temp[k][col+i];
}
for(int i=0; i<2; i++)
{
{
if(posX >= 0 && posX < dst_cols && (posY+i) >= 0 && (posY+i) < dst_rows)
dst[(dst_startY+i) * (dst_step>>2)+ dst_startX + col - anX] = convert_uchar4(convert_float4(tmp_sum[i])/alpha);
}
}
}
@@ -326,21 +326,21 @@ __kernel void boxFilter_C1_D5(__global const float *restrict src, __global float
int startX = gX * (THREADS-ksX+1) - anX + src_x_off;
int startY = (gY << 1) - anY + src_y_off;
int dst_startX = gX * (THREADS-ksX+1) + dst_x_off;
int dst_startY = (gY << 1) + dst_y_off;
int end_addr = (src_whole_rows-1)*(src_step>>2) + src_whole_cols-4;
int dst_startY = (gY << 1) + dst_y_off;
int end_addr = (src_whole_rows-1)*(src_step>>2) + src_whole_cols-4;
float data[ksY+1];
__local float temp[2][THREADS];
__local float temp[2][THREADS];
#ifdef BORDER_CONSTANT
bool con;
float ss;
for(int i=0; i < ksY+1; i++)
{
con = startX+col >= 0 && startX+col < src_whole_cols && startY+i >= 0 && startY+i < src_whole_rows;
//int cur_addr = clamp((startY+i)*(src_step>>2)+(startX+col),0,end_addr);
//ss = src[cur_addr];
//int cur_addr = clamp((startY+i)*(src_step>>2)+(startX+col),0,end_addr);
//ss = src[cur_addr];
int cur_col = clamp(startX + col, 0, src_whole_cols);
//ss = src[(startY+i)*(src_step>>2) + cur_col];
//ss = src[(startY+i)*(src_step>>2) + cur_col];
ss = (startY+i)<src_whole_rows&&(startY+i)>=0&&cur_col>=0&&cur_col<src_whole_cols?src[(startY+i)*(src_step>>2) + cur_col]:0;
data[i] = con ? ss : 0.f;
@@ -355,10 +355,10 @@ __kernel void boxFilter_C1_D5(__global const float *restrict src, __global float
selected_col = ADDR_L(startX+col, 0, src_whole_cols);
selected_col = ADDR_R(startX+col, src_whole_cols, selected_col);
data[i] = src[selected_row * (src_step>>2) + selected_col];
}
#endif
float sum0 = 0.0, sum1 = 0.0, sum2 = 0.0;
for(int i=1; i < ksY; i++)
@@ -375,7 +375,7 @@ __kernel void boxFilter_C1_D5(__global const float *restrict src, __global float
col += anX;
int posX = dst_startX - dst_x_off + col - anX;
int posY = (gY << 1);
float tmp_sum[2]={0.0, 0.0};
for(int k=0; k<2; k++)
for(int i=-anX; i<=anX; i++)
@@ -383,11 +383,11 @@ __kernel void boxFilter_C1_D5(__global const float *restrict src, __global float
tmp_sum[k] += temp[k][col+i];
}
for(int i=0; i<2; i++)
{
{
if(posX >= 0 && posX < dst_cols && (posY+i) >= 0 && (posY+i) < dst_rows)
dst[(dst_startY+i) * (dst_step>>2)+ dst_startX + col - anX] = tmp_sum[i]/alpha;
}
}
}
@@ -411,21 +411,21 @@ __kernel void boxFilter_C4_D5(__global const float4 *restrict src, __global floa
int startX = gX * (THREADS-ksX+1) - anX + src_x_off;
int startY = (gY << 1) - anY + src_y_off;
int dst_startX = gX * (THREADS-ksX+1) + dst_x_off;
int dst_startY = (gY << 1) + dst_y_off;
int end_addr = (src_whole_rows-1)*(src_step>>4) + src_whole_cols-16;
int dst_startY = (gY << 1) + dst_y_off;
int end_addr = (src_whole_rows-1)*(src_step>>4) + src_whole_cols-16;
float4 data[ksY+1];
__local float4 temp[2][THREADS];
__local float4 temp[2][THREADS];
#ifdef BORDER_CONSTANT
bool con;
float4 ss;
for(int i=0; i < ksY+1; i++)
{
con = startX+col >= 0 && startX+col < src_whole_cols && startY+i >= 0 && startY+i < src_whole_rows;
//int cur_addr = clamp((startY+i)*(src_step>>4)+(startX+col),0,end_addr);
//ss = src[cur_addr];
//int cur_addr = clamp((startY+i)*(src_step>>4)+(startX+col),0,end_addr);
//ss = src[cur_addr];
int cur_col = clamp(startX + col, 0, src_whole_cols);
//ss = src[(startY+i)*(src_step>>4) + cur_col];
//ss = src[(startY+i)*(src_step>>4) + cur_col];
ss = (startY+i)<src_whole_rows&&(startY+i)>=0&&cur_col>=0&&cur_col<src_whole_cols?src[(startY+i)*(src_step>>4) + cur_col]:0;
data[i] = con ? ss : (float4)(0.0,0.0,0.0,0.0);
@@ -440,10 +440,10 @@ __kernel void boxFilter_C4_D5(__global const float4 *restrict src, __global floa
selected_col = ADDR_L(startX+col, 0, src_whole_cols);
selected_col = ADDR_R(startX+col, src_whole_cols, selected_col);
data[i] = src[selected_row * (src_step>>4) + selected_col];
}
#endif
float4 sum0 = 0.0, sum1 = 0.0, sum2 = 0.0;
for(int i=1; i < ksY; i++)
@@ -460,7 +460,7 @@ __kernel void boxFilter_C4_D5(__global const float4 *restrict src, __global floa
col += anX;
int posX = dst_startX - dst_x_off + col - anX;
int posY = (gY << 1);
float4 tmp_sum[2]={(float4)(0.0,0.0,0.0,0.0), (float4)(0.0,0.0,0.0,0.0)};
for(int k=0; k<2; k++)
for(int i=-anX; i<=anX; i++)
@@ -468,10 +468,10 @@ __kernel void boxFilter_C4_D5(__global const float4 *restrict src, __global floa
tmp_sum[k] += temp[k][col+i];
}
for(int i=0; i<2; i++)
{
{
if(posX >= 0 && posX < dst_cols && (posY+i) >= 0 && (posY+i) < dst_rows)
dst[(dst_startY+i) * (dst_step>>4)+ dst_startX + col - anX] = tmp_sum[i]/alpha;
}
}
}

View File

@@ -51,8 +51,8 @@
//BORDER_REPLICATE: aaaaaa|abcdefgh|hhhhhhh
#define ADDR_L(i, l_edge, r_edge) ((i) < (l_edge) ? (l_edge) : (i))
#define ADDR_R(i, r_edge, addr) ((i) >= (r_edge) ? (r_edge)-1 : (addr))
#define ADDR_H(i, t_edge, b_edge) ((i) < (t_edge) ? (t_edge) :(i))
#define ADDR_B(i, b_edge, addr) ((i) >= (b_edge) ? (b_edge)-1 :(addr))
#define ADDR_H(i, t_edge, b_edge) ((i) < (t_edge) ? (t_edge) :(i))
#define ADDR_B(i, b_edge, addr) ((i) >= (b_edge) ? (b_edge)-1 :(addr))
#endif
#ifdef BORDER_REFLECT
@@ -90,10 +90,10 @@
#define ROWS_PER_GROUP_BITS 2
#define ROWS_FETCH (ROWS_PER_GROUP + ANY + ANY) //(ROWS_PER_GROUP + anY * 2)
#define THREADS_PER_ROW 64
#define THREADS_PER_ROW_BIT 6
#define THREADS_PER_ROW 64
#define THREADS_PER_ROW_BIT 6
#define ELEMENTS_PER_THREAD 4
#define ELEMENTS_PER_THREAD 4
#define ELEMENTS_PER_THREAD_BIT 2
#define LOCAL_MEM_STEP 260 //divup((get_local_size(0) + anX * 2), 4) * 4
@@ -101,10 +101,10 @@
///////////////////////////////////////////////////////////////////////////////////////////////////
/////////////////////////////////////////8uC1////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////////////////
__kernel void filter2D_C1_D0(__global uchar *src, int src_step, int src_offset_x, int src_offset_y,
__global uchar *dst, int dst_step, int dst_offset_x, int dst_offset_y,
__kernel void filter2D_C1_D0(__global uchar *src, int src_step, int src_offset_x, int src_offset_y,
__global uchar *dst, int dst_step, int dst_offset_x, int dst_offset_y,
__constant int *mat_kernel __attribute__((max_constant_size (16384))),
int cols,int rows, int operate_cols, int wholecols, int wholerows)
int cols,int rows, int operate_cols, int wholecols, int wholerows)
{
int gX = get_global_id(0);
int gY = get_global_id(1);
@@ -114,16 +114,16 @@ __kernel void filter2D_C1_D0(__global uchar *src, int src_step, int src_offset_x
int groupX_size = get_local_size(0);
int groupX_id = get_group_id(0);
#define dst_align (dst_offset_x & 3)
int cols_start_index_group = src_offset_x - dst_align + groupX_size * groupX_id - ANX;
int rows_start_index = src_offset_y + (gY << ROWS_PER_GROUP_BITS) - ANY;
#define dst_align (dst_offset_x & 3)
int cols_start_index_group = src_offset_x - dst_align + groupX_size * groupX_id - ANX;
int rows_start_index = src_offset_y + (gY << ROWS_PER_GROUP_BITS) - ANY;
__local uchar local_data[LOCAL_MEM_STEP * ROWS_FETCH];
if((gY << 2) < rows)
{
for(int i = 0; i < ROWS_FETCH; ++i)
{
if((rows_start_index - src_offset_y) + i < rows + ANY)
if((rows_start_index - src_offset_y) + i < rows + ANY)
{
#ifdef BORDER_CONSTANT
int selected_row = rows_start_index + i;
@@ -132,7 +132,7 @@ __kernel void filter2D_C1_D0(__global uchar *src, int src_step, int src_offset_x
uchar data = *(src + selected_row * src_step + selected_cols);
int con = selected_row >=0 && selected_row < wholerows && selected_cols >=0 && selected_cols < wholecols;
data = con ? data : 0;
local_data[i * LOCAL_MEM_STEP + lX ] =data;
local_data[i * LOCAL_MEM_STEP + lX ] =data;
if(lX < (ANX << 1))
{
@@ -141,7 +141,7 @@ __kernel void filter2D_C1_D0(__global uchar *src, int src_step, int src_offset_x
data = *(src + selected_row * src_step + selected_cols);
con = selected_row >=0 && selected_row < wholerows && selected_cols >=0 && selected_cols < wholecols;
data = con ? data : 0;
local_data[i * LOCAL_MEM_STEP + lX + groupX_size] =data;
local_data[i * LOCAL_MEM_STEP + lX + groupX_size] =data;
}
#else
int selected_row = ADDR_H(rows_start_index + i, 0, wholerows);
@@ -152,7 +152,7 @@ __kernel void filter2D_C1_D0(__global uchar *src, int src_step, int src_offset_x
uchar data = *(src + selected_row * src_step + selected_cols);
local_data[i * LOCAL_MEM_STEP + lX ] =data;
local_data[i * LOCAL_MEM_STEP + lX ] =data;
if(lX < (ANX << 1))
{
@@ -160,7 +160,7 @@ __kernel void filter2D_C1_D0(__global uchar *src, int src_step, int src_offset_x
selected_cols = ADDR_R(selected_cols, wholecols, selected_cols);
data = *(src + selected_row * src_step + selected_cols);
local_data[i * LOCAL_MEM_STEP + lX + groupX_size] =data;
local_data[i * LOCAL_MEM_STEP + lX + groupX_size] =data;
}
#endif
}
@@ -171,9 +171,9 @@ __kernel void filter2D_C1_D0(__global uchar *src, int src_step, int src_offset_x
int process_col = groupX_size * groupX_id + ((lX % THREADS_PER_ROW) << 2);
if(((gY << 2) < rows) && (process_col < operate_cols))
{
int dst_cols_start = dst_offset_x;
int dst_cols_start = dst_offset_x;
int dst_cols_end = dst_offset_x + cols;
int dst_cols_index = (dst_offset_x + process_col) & 0xfffffffc;
int dst_cols_index = (dst_offset_x + process_col) & 0xfffffffc;
int dst_rows_end = dst_offset_y + rows;
int dst_rows_index = dst_offset_y + (gY << ROWS_PER_GROUP_BITS) + (lX >> THREADS_PER_ROW_BIT);
@@ -191,9 +191,9 @@ __kernel void filter2D_C1_D0(__global uchar *src, int src_step, int src_offset_x
if(dst_rows_index < dst_rows_end)
{
int local_row = (lX >> THREADS_PER_ROW_BIT) + i;
int local_cols = ((lX % THREADS_PER_ROW) << ELEMENTS_PER_THREAD_BIT) + j;
int local_cols = ((lX % THREADS_PER_ROW) << ELEMENTS_PER_THREAD_BIT) + j;
data = vload4(0, local_data+local_row * LOCAL_MEM_STEP + local_cols);
data = vload4(0, local_data+local_row * LOCAL_MEM_STEP + local_cols);
sum = sum + (mat_kernel[i * ANCHOR + j] * convert_int4_sat(data));
}
}
@@ -205,17 +205,17 @@ __kernel void filter2D_C1_D0(__global uchar *src, int src_step, int src_offset_x
sum.y = ((dst_cols_index + 1 >= dst_cols_start) && (dst_cols_index + 1 < dst_cols_end)) ? sum.y : dst_data.y;
sum.z = ((dst_cols_index + 2 >= dst_cols_start) && (dst_cols_index + 2 < dst_cols_end)) ? sum.z : dst_data.z;
sum.w = ((dst_cols_index + 3 >= dst_cols_start) && (dst_cols_index + 3 < dst_cols_end)) ? sum.w : dst_data.w;
*((__global uchar4 *)(dst + dst_rows_index * dst_step + dst_cols_index)) = convert_uchar4_sat(sum);
*((__global uchar4 *)(dst + dst_rows_index * dst_step + dst_cols_index)) = convert_uchar4_sat(sum);
}
}
}
///////////////////////////////////////////////////////////////////////////////////////////////////
/////////////////////////////////////////32FC1////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////////////////
__kernel void filter2D_C1_D5(__global float *src, int src_step, int src_offset_x, int src_offset_y,
__global float *dst, int dst_step, int dst_offset_x, int dst_offset_y,
__kernel void filter2D_C1_D5(__global float *src, int src_step, int src_offset_x, int src_offset_y,
__global float *dst, int dst_step, int dst_offset_x, int dst_offset_y,
__constant int *mat_kernel __attribute__((max_constant_size (16384))),
int cols,int rows, int operate_cols, int wholecols, int wholerows)
int cols,int rows, int operate_cols, int wholecols, int wholerows)
{
int gX = get_global_id(0);
int gY = get_global_id(1);
@@ -225,16 +225,16 @@ __kernel void filter2D_C1_D5(__global float *src, int src_step, int src_offset_x
int groupX_size = get_local_size(0);
int groupX_id = get_group_id(0);
#define dst_align (dst_offset_x & 3)
int cols_start_index_group = src_offset_x - dst_align + groupX_size * groupX_id - ANX;
int rows_start_index = src_offset_y + (gY << ROWS_PER_GROUP_BITS) - ANY;
#define dst_align (dst_offset_x & 3)
int cols_start_index_group = src_offset_x - dst_align + groupX_size * groupX_id - ANX;
int rows_start_index = src_offset_y + (gY << ROWS_PER_GROUP_BITS) - ANY;
__local float local_data[LOCAL_MEM_STEP * ROWS_FETCH];
if(((gY << 2) < rows))
{
for(int i = 0; i < ROWS_FETCH; ++i)
{
if((rows_start_index - src_offset_y) + i < rows + ANY)
if((rows_start_index - src_offset_y) + i < rows + ANY)
{
#ifdef BORDER_CONSTANT
int selected_row = rows_start_index + i;
@@ -243,7 +243,7 @@ __kernel void filter2D_C1_D5(__global float *src, int src_step, int src_offset_x
float data = *((__global float *)((__global char *)src + selected_row * src_step + (selected_cols << 2)));
int con = selected_row >=0 && selected_row < wholerows && selected_cols >=0 && selected_cols < wholecols;
data = con ? data : 0;
local_data[i * LOCAL_MEM_STEP + lX ] =data;
local_data[i * LOCAL_MEM_STEP + lX ] =data;
if(lX < (ANX << 1))
{
@@ -252,7 +252,7 @@ __kernel void filter2D_C1_D5(__global float *src, int src_step, int src_offset_x
data = *((__global float *)((__global char *)src + selected_row * src_step + (selected_cols << 2)));
con = selected_row >=0 && selected_row < wholerows && selected_cols >=0 && selected_cols < wholecols;
data = con ? data : 0;
local_data[i * LOCAL_MEM_STEP + lX + groupX_size] =data;
local_data[i * LOCAL_MEM_STEP + lX + groupX_size] =data;
}
#else
int selected_row = ADDR_H(rows_start_index + i, 0, wholerows);
@@ -262,7 +262,7 @@ __kernel void filter2D_C1_D5(__global float *src, int src_step, int src_offset_x
selected_cols = ADDR_R(cols_start_index_group + lX, wholecols, selected_cols);
float data = *((__global float *)((__global char *)src + selected_row * src_step + (selected_cols << 2)));
local_data[i * LOCAL_MEM_STEP + lX] =data;
local_data[i * LOCAL_MEM_STEP + lX] =data;
if(lX < (ANX << 1))
{
@@ -270,7 +270,7 @@ __kernel void filter2D_C1_D5(__global float *src, int src_step, int src_offset_x
selected_cols = ADDR_R(selected_cols, wholecols, selected_cols);
data = *((__global float *)((__global char *)src + selected_row * src_step + (selected_cols << 2)));
local_data[i * LOCAL_MEM_STEP + lX + groupX_size] =data;
local_data[i * LOCAL_MEM_STEP + lX + groupX_size] =data;
}
#endif
}
@@ -281,9 +281,9 @@ __kernel void filter2D_C1_D5(__global float *src, int src_step, int src_offset_x
int process_col = groupX_size * groupX_id + ((lX % THREADS_PER_ROW) << 2);
if(((gY << 2) < rows) && (process_col < operate_cols))
{
int dst_cols_start = dst_offset_x;
int dst_cols_start = dst_offset_x;
int dst_cols_end = dst_offset_x + cols;
int dst_cols_index = (dst_offset_x + process_col) & 0xfffffffc;
int dst_cols_index = (dst_offset_x + process_col) & 0xfffffffc;
int dst_rows_end = dst_offset_y + rows;
int dst_rows_index = dst_offset_y + (gY << ROWS_PER_GROUP_BITS) + (lX >> THREADS_PER_ROW_BIT);
@@ -301,9 +301,9 @@ __kernel void filter2D_C1_D5(__global float *src, int src_step, int src_offset_x
if(dst_rows_index < dst_rows_end)
{
int local_row = (lX >> THREADS_PER_ROW_BIT) + i;
int local_cols = ((lX % THREADS_PER_ROW) << ELEMENTS_PER_THREAD_BIT) + j;
int local_cols = ((lX % THREADS_PER_ROW) << ELEMENTS_PER_THREAD_BIT) + j;
data = vload4(0, local_data+local_row * LOCAL_MEM_STEP + local_cols);
data = vload4(0, local_data+local_row * LOCAL_MEM_STEP + local_cols);
sum = sum + (mat_kernel[i * ANCHOR + j] * data);
}
}
@@ -316,7 +316,7 @@ __kernel void filter2D_C1_D5(__global float *src, int src_step, int src_offset_x
sum.z = ((dst_cols_index + 2 >= dst_cols_start) && (dst_cols_index + 2 < dst_cols_end)) ? sum.z : dst_data.z;
sum.w = ((dst_cols_index + 3 >= dst_cols_start) && (dst_cols_index + 3 < dst_cols_end)) ? sum.w : dst_data.w;
*((__global float4 *)((__global char *)dst + dst_rows_index * dst_step + (dst_cols_index << 2))) = sum;
*((__global float4 *)((__global char *)dst + dst_rows_index * dst_step + (dst_cols_index << 2))) = sum;
}
}
}
@@ -324,10 +324,10 @@ __kernel void filter2D_C1_D5(__global float *src, int src_step, int src_offset_x
///////////////////////////////////////////////////////////////////////////////////////////////////
/////////////////////////////////////////8uC4////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////////////////
__kernel void filter2D_C4_D0(__global uchar4 *src, int src_step, int src_offset_x, int src_offset_y,
__global uchar4 *dst, int dst_step, int dst_offset_x, int dst_offset_y,
__kernel void filter2D_C4_D0(__global uchar4 *src, int src_step, int src_offset_x, int src_offset_y,
__global uchar4 *dst, int dst_step, int dst_offset_x, int dst_offset_y,
__constant int *mat_kernel __attribute__((max_constant_size (16384))),
int cols,int rows, int operate_cols, int wholecols, int wholerows)
int cols,int rows, int operate_cols, int wholecols, int wholerows)
{
int gX = get_global_id(0);
int gY = get_global_id(1);
@@ -337,17 +337,17 @@ __kernel void filter2D_C4_D0(__global uchar4 *src, int src_step, int src_offset_
int groupX_size = get_local_size(0);
int groupX_id = get_group_id(0);
#define dst_align (dst_offset_x & 3)
int cols_start_index_group = src_offset_x - dst_align + groupX_size * groupX_id - ANX;
int rows_start_index = src_offset_y + (gY << ROWS_PER_GROUP_BITS) - ANY;
#define dst_align (dst_offset_x & 3)
int cols_start_index_group = src_offset_x - dst_align + groupX_size * groupX_id - ANX;
int rows_start_index = src_offset_y + (gY << ROWS_PER_GROUP_BITS) - ANY;
__local uchar4 local_data[LOCAL_MEM_STEP * ROWS_FETCH];
if(((gY << 2) < rows))
{
for(int i = 0; i < ROWS_FETCH; ++i)
{
if((rows_start_index - src_offset_y) + i < rows + ANY)
if((rows_start_index - src_offset_y) + i < rows + ANY)
{
#ifdef BORDER_CONSTANT
int selected_row = rows_start_index + i;
@@ -356,7 +356,7 @@ __kernel void filter2D_C4_D0(__global uchar4 *src, int src_step, int src_offset_
uchar4 data = *((__global uchar4*)((__global char*)src + selected_row * src_step + (selected_cols << 2)));
int con = selected_row >=0 && selected_row < wholerows && selected_cols >=0 && selected_cols < wholecols;
data = con ? data : 0;
local_data[i * LOCAL_MEM_STEP + lX ] =data;
local_data[i * LOCAL_MEM_STEP + lX ] =data;
if(lX < (ANX << 1))
{
@@ -365,7 +365,7 @@ __kernel void filter2D_C4_D0(__global uchar4 *src, int src_step, int src_offset_
data = *((__global uchar4*)((__global char*)src + selected_row * src_step + (selected_cols << 2)));
con = selected_row >=0 && selected_row < wholerows && selected_cols >=0 && selected_cols < wholecols;
data = con ? data : 0;
local_data[i * LOCAL_MEM_STEP + lX + groupX_size] =data;
local_data[i * LOCAL_MEM_STEP + lX + groupX_size] =data;
}
#else
int selected_row = ADDR_H(rows_start_index + i, 0, wholerows);
@@ -376,7 +376,7 @@ __kernel void filter2D_C4_D0(__global uchar4 *src, int src_step, int src_offset_
uchar4 data = *((__global uchar4*)((__global char*)src + selected_row * src_step + (selected_cols << 2)));
local_data[i * LOCAL_MEM_STEP + lX] =data;
local_data[i * LOCAL_MEM_STEP + lX] =data;
if(lX < (ANX << 1))
{
@@ -384,7 +384,7 @@ __kernel void filter2D_C4_D0(__global uchar4 *src, int src_step, int src_offset_
selected_cols = ADDR_R(selected_cols, wholecols, selected_cols);
data = *((__global uchar4*)((__global char*)src + selected_row * src_step + (selected_cols << 2)));
local_data[i * LOCAL_MEM_STEP + lX + groupX_size] =data;
local_data[i * LOCAL_MEM_STEP + lX + groupX_size] =data;
}
#endif
}
@@ -395,9 +395,9 @@ __kernel void filter2D_C4_D0(__global uchar4 *src, int src_step, int src_offset_
int process_col = groupX_size * groupX_id + ((lX % THREADS_PER_ROW) << 2);
if(((gY << 2) < rows) && (process_col < operate_cols))
{
int dst_cols_start = dst_offset_x;
int dst_cols_start = dst_offset_x;
int dst_cols_end = dst_offset_x + cols;
int dst_cols_index = (dst_offset_x + process_col) & 0xfffffffc;
int dst_cols_index = (dst_offset_x + process_col) & 0xfffffffc;
int dst_rows_end = dst_offset_y + rows;
int dst_rows_index = dst_offset_y + (gY << ROWS_PER_GROUP_BITS) + (lX >> THREADS_PER_ROW_BIT);
@@ -416,9 +416,9 @@ __kernel void filter2D_C4_D0(__global uchar4 *src, int src_step, int src_offset_
if(dst_rows_index < dst_rows_end)
{
int local_row = (lX >> THREADS_PER_ROW_BIT) + i;
int local_cols = ((lX % THREADS_PER_ROW) << ELEMENTS_PER_THREAD_BIT) + j;
int local_cols = ((lX % THREADS_PER_ROW) << ELEMENTS_PER_THREAD_BIT) + j;
data = vload16(0, (__local uchar *)(local_data+local_row * LOCAL_MEM_STEP + local_cols));
data = vload16(0, (__local uchar *)(local_data+local_row * LOCAL_MEM_STEP + local_cols));
sum = sum + (mat_kernel[i * ANCHOR + j] * convert_int16_sat(data));
}
}
@@ -427,16 +427,16 @@ __kernel void filter2D_C4_D0(__global uchar4 *src, int src_step, int src_offset_
if(dst_rows_index < dst_rows_end)
{
uchar16 sum1 = convert_uchar16_sat(sum);
sum1.s0123 = ((dst_cols_index + 0 >= dst_cols_start) && (dst_cols_index + 0 < dst_cols_end))?
sum1.s0123 = ((dst_cols_index + 0 >= dst_cols_start) && (dst_cols_index + 0 < dst_cols_end))?
sum1.s0123 : dst_data.s0123;
sum1.s4567 = ((dst_cols_index + 1 >= dst_cols_start) && (dst_cols_index + 1 < dst_cols_end))?
sum1.s4567 = ((dst_cols_index + 1 >= dst_cols_start) && (dst_cols_index + 1 < dst_cols_end))?
sum1.s4567 : dst_data.s4567;
sum1.s89ab = ((dst_cols_index + 2 >= dst_cols_start) && (dst_cols_index + 2 < dst_cols_end))?
sum1.s89ab = ((dst_cols_index + 2 >= dst_cols_start) && (dst_cols_index + 2 < dst_cols_end))?
sum1.s89ab : dst_data.s89ab;
sum1.scdef = ((dst_cols_index + 3 >= dst_cols_start) && (dst_cols_index + 3 < dst_cols_end))?
sum1.scdef = ((dst_cols_index + 3 >= dst_cols_start) && (dst_cols_index + 3 < dst_cols_end))?
sum1.scdef : dst_data.scdef;
*((__global uchar16*)((__global char *)dst + dst_rows_index * dst_step + (dst_cols_index << 2))) = sum1;
*((__global uchar16*)((__global char *)dst + dst_rows_index * dst_step + (dst_cols_index << 2))) = sum1;
}
}
}
@@ -445,10 +445,10 @@ __kernel void filter2D_C4_D0(__global uchar4 *src, int src_step, int src_offset_
////////////////////////////////////////////////////////////////////////////////////////////////////
#define ROWS_FETCH_C4 (1 + ANY + ANY) //(ROWS_PER_GROUP + anY * 2)
#define LOCAL_MEM_STEP_C4 260 //divup((get_local_size(0) + anX * 2), 4) * 4)
__kernel void filter2D_C4_D5(__global float4 *src, int src_step, int src_offset_x, int src_offset_y,
__global float4 *dst, int dst_step, int dst_offset_x, int dst_offset_y,
__kernel void filter2D_C4_D5(__global float4 *src, int src_step, int src_offset_x, int src_offset_y,
__global float4 *dst, int dst_step, int dst_offset_x, int dst_offset_y,
__constant int *mat_kernel __attribute__((max_constant_size (16384))),
int cols,int rows, int operate_cols, int wholecols, int wholerows)
int cols,int rows, int operate_cols, int wholecols, int wholerows)
{
int gX = get_global_id(0);
int gY = get_global_id(1);
@@ -458,15 +458,15 @@ __kernel void filter2D_C4_D5(__global float4 *src, int src_step, int src_offset_
int groupX_size = get_local_size(0);
int groupX_id = get_group_id(0);
int cols_start_index_group = src_offset_x + groupX_size * groupX_id - ANX;
int rows_start_index = src_offset_y + gY - ANY;
int cols_start_index_group = src_offset_x + groupX_size * groupX_id - ANX;
int rows_start_index = src_offset_y + gY - ANY;
__local float4 local_data[LOCAL_MEM_STEP_C4 * ROWS_FETCH_C4];
if((gY < rows) && (gX < (operate_cols + ANX + ANX)))
{
for(int i = 0; i < ROWS_FETCH_C4; ++i)
{
if((rows_start_index - src_offset_y) + i < rows + ANY)
if((rows_start_index - src_offset_y) + i < rows + ANY)
{
#ifdef BORDER_CONSTANT
int selected_row = rows_start_index + i;
@@ -475,7 +475,7 @@ __kernel void filter2D_C4_D5(__global float4 *src, int src_step, int src_offset_
float4 data = *((__global float4*)((__global char*)src + selected_row * src_step + (selected_cols << 4)));
int con = selected_row >=0 && selected_row < wholerows && selected_cols >=0 && selected_cols < wholecols;
data = con ? data : 0;
local_data[i * LOCAL_MEM_STEP + lX ] =data;
local_data[i * LOCAL_MEM_STEP + lX ] =data;
if(lX < (ANX << 1))
{
@@ -484,7 +484,7 @@ __kernel void filter2D_C4_D5(__global float4 *src, int src_step, int src_offset_
data = *((__global float4*)((__global char*)src + selected_row * src_step + (selected_cols << 4)));
con = selected_row >=0 && selected_row < wholerows && selected_cols >=0 && selected_cols < wholecols;
data = con ? data : 0;
local_data[i * LOCAL_MEM_STEP + lX + groupX_size] =data;
local_data[i * LOCAL_MEM_STEP + lX + groupX_size] =data;
}
#else
int selected_row = ADDR_H(rows_start_index + i, 0, wholerows);
@@ -494,7 +494,7 @@ __kernel void filter2D_C4_D5(__global float4 *src, int src_step, int src_offset_
selected_cols = ADDR_R(cols_start_index_group + lX, wholecols, selected_cols);
float4 data = *((__global float4*)((__global char*)src + selected_row * src_step + (selected_cols << 4)));
local_data[i * LOCAL_MEM_STEP_C4 + lX] =data;
local_data[i * LOCAL_MEM_STEP_C4 + lX] =data;
if(lX < (ANX << 1))
{
@@ -502,7 +502,7 @@ __kernel void filter2D_C4_D5(__global float4 *src, int src_step, int src_offset_
selected_cols = ADDR_R(selected_cols, wholecols, selected_cols);
data = *((__global float4*)((__global char*)src + selected_row * src_step + (selected_cols << 4)));
local_data[i * LOCAL_MEM_STEP_C4 + lX + groupX_size] =data;
local_data[i * LOCAL_MEM_STEP_C4 + lX + groupX_size] =data;
}
#endif
}
@@ -512,7 +512,7 @@ __kernel void filter2D_C4_D5(__global float4 *src, int src_step, int src_offset_
if((gY < rows) && (gX < operate_cols))
{
int dst_cols_index = dst_offset_x + gX;
int dst_cols_index = dst_offset_x + gX;
int dst_rows_index = dst_offset_y + gY;
float4 sum = (float4)(0);
@@ -521,11 +521,11 @@ __kernel void filter2D_C4_D5(__global float4 *src, int src_step, int src_offset_
{
for(int j = 0; j < ANCHOR; j++)
{
int local_cols = lX + j;
int local_cols = lX + j;
sum = sum + mat_kernel[i * ANCHOR + j] * local_data[i * LOCAL_MEM_STEP_C4 + local_cols];
}
}
*((__global float4*)((__global char *)dst + dst_rows_index * dst_step + (dst_cols_index << 4))) = sum;
*((__global float4*)((__global char *)dst + dst_rows_index * dst_step + (dst_cols_index << 4))) = sum;
}
}

View File

@@ -45,160 +45,160 @@
#define ELEM(i,l_edge,r_edge,elem1,elem2) (i)<(l_edge) | (i) >= (r_edge) ? (elem1) : (elem2)
#ifndef GENTYPE
__kernel void morph_C1_D0(__global const uchar * restrict src,
__global uchar *dst,
int src_offset_x, int src_offset_y,
int cols, int rows,
int src_step_in_pixel, int dst_step_in_pixel,
__constant uchar * mat_kernel,
int src_whole_cols, int src_whole_rows,
int dst_offset_in_pixel)
__global uchar *dst,
int src_offset_x, int src_offset_y,
int cols, int rows,
int src_step_in_pixel, int dst_step_in_pixel,
__constant uchar * mat_kernel,
int src_whole_cols, int src_whole_rows,
int dst_offset_in_pixel)
{
int l_x = get_local_id(0);
int l_y = get_local_id(1);
int x = get_group_id(0)*4*LSIZE0;
int y = get_group_id(1)*LSIZE1;
int start_x = x+src_offset_x-RADIUSX & 0xfffffffc;
int end_x = x + src_offset_x+LSIZE0*4+RADIUSX & 0xfffffffc;
int width = (end_x -start_x+4)>>2;
int offset = src_offset_x-RADIUSX & 3;
int start_y = y+src_offset_y-RADIUSY;
int point1 = mad24(l_y,LSIZE0,l_x);
int point2 = point1 + LSIZE0*LSIZE1;
int tl_x = (point1 % width)<<2;
int tl_y = point1 / width;
int tl_x2 = (point2 % width)<<2;
int tl_y2 = point2 / width;
int cur_x = start_x + tl_x;
int cur_y = start_y + tl_y;
int cur_x2 = start_x + tl_x2;
int cur_y2 = start_y + tl_y2;
int start_addr = mad24(cur_y,src_step_in_pixel,cur_x);
int start_addr2 = mad24(cur_y2,src_step_in_pixel,cur_x2);
uchar4 temp0,temp1;
__local uchar4 LDS_DAT[2*LSIZE1*LSIZE0];
int l_x = get_local_id(0);
int l_y = get_local_id(1);
int x = get_group_id(0)*4*LSIZE0;
int y = get_group_id(1)*LSIZE1;
int start_x = x+src_offset_x-RADIUSX & 0xfffffffc;
int end_x = x + src_offset_x+LSIZE0*4+RADIUSX & 0xfffffffc;
int width = (end_x -start_x+4)>>2;
int offset = src_offset_x-RADIUSX & 3;
int start_y = y+src_offset_y-RADIUSY;
int point1 = mad24(l_y,LSIZE0,l_x);
int point2 = point1 + LSIZE0*LSIZE1;
int tl_x = (point1 % width)<<2;
int tl_y = point1 / width;
int tl_x2 = (point2 % width)<<2;
int tl_y2 = point2 / width;
int cur_x = start_x + tl_x;
int cur_y = start_y + tl_y;
int cur_x2 = start_x + tl_x2;
int cur_y2 = start_y + tl_y2;
int start_addr = mad24(cur_y,src_step_in_pixel,cur_x);
int start_addr2 = mad24(cur_y2,src_step_in_pixel,cur_x2);
uchar4 temp0,temp1;
__local uchar4 LDS_DAT[2*LSIZE1*LSIZE0];
int end_addr = mad24(src_whole_rows - 1,src_step_in_pixel,src_whole_cols);
//read pixels from src
start_addr = ((start_addr < end_addr) && (start_addr > 0)) ? start_addr : 0;
start_addr2 = ((start_addr2 < end_addr) && (start_addr2 > 0)) ? start_addr2 : 0;
temp0 = *(__global uchar4*)&src[start_addr];
temp1 = *(__global uchar4*)&src[start_addr2];
//judge if read out of boundary
temp0.x= ELEM(cur_x,0,src_whole_cols,VAL,temp0.x);
temp0.y= ELEM(cur_x+1,0,src_whole_cols,VAL,temp0.y);
temp0.z= ELEM(cur_x+2,0,src_whole_cols,VAL,temp0.z);
temp0.w= ELEM(cur_x+3,0,src_whole_cols,VAL,temp0.w);
temp0= ELEM(cur_y,0,src_whole_rows,(uchar4)VAL,temp0);
int end_addr = mad24(src_whole_rows - 1,src_step_in_pixel,src_whole_cols);
//read pixels from src
start_addr = ((start_addr < end_addr) && (start_addr > 0)) ? start_addr : 0;
start_addr2 = ((start_addr2 < end_addr) && (start_addr2 > 0)) ? start_addr2 : 0;
temp0 = *(__global uchar4*)&src[start_addr];
temp1 = *(__global uchar4*)&src[start_addr2];
//judge if read out of boundary
temp0.x= ELEM(cur_x,0,src_whole_cols,VAL,temp0.x);
temp0.y= ELEM(cur_x+1,0,src_whole_cols,VAL,temp0.y);
temp0.z= ELEM(cur_x+2,0,src_whole_cols,VAL,temp0.z);
temp0.w= ELEM(cur_x+3,0,src_whole_cols,VAL,temp0.w);
temp0= ELEM(cur_y,0,src_whole_rows,(uchar4)VAL,temp0);
temp1.x= ELEM(cur_x2,0,src_whole_cols,VAL,temp1.x);
temp1.y= ELEM(cur_x2+1,0,src_whole_cols,VAL,temp1.y);
temp1.z= ELEM(cur_x2+2,0,src_whole_cols,VAL,temp1.z);
temp1.w= ELEM(cur_x2+3,0,src_whole_cols,VAL,temp1.w);
temp1= ELEM(cur_y2,0,src_whole_rows,(uchar4)VAL,temp1);
temp1.x= ELEM(cur_x2,0,src_whole_cols,VAL,temp1.x);
temp1.y= ELEM(cur_x2+1,0,src_whole_cols,VAL,temp1.y);
temp1.z= ELEM(cur_x2+2,0,src_whole_cols,VAL,temp1.z);
temp1.w= ELEM(cur_x2+3,0,src_whole_cols,VAL,temp1.w);
temp1= ELEM(cur_y2,0,src_whole_rows,(uchar4)VAL,temp1);
LDS_DAT[point1] = temp0;
LDS_DAT[point2] = temp1;
barrier(CLK_LOCAL_MEM_FENCE);
uchar4 res = (uchar4)VAL;
for(int i=0;i<2*RADIUSY+1;i++)
for(int j=0;j<2*RADIUSX+1;j++)
{
res =mat_kernel[i*(2*RADIUSX+1)+j]? MORPH_OP(res,vload4(0,(__local uchar*)&LDS_DAT[mad24((l_y+i),width,l_x)]+offset+j)):res;
}
int gidx = get_global_id(0)<<2;
int gidy = get_global_id(1);
int out_addr = mad24(gidy,dst_step_in_pixel,gidx+dst_offset_in_pixel);
if(gidx+3<cols && gidy<rows && (dst_offset_in_pixel&3==0))
{
*(__global uchar4*)&dst[out_addr] = res;
}
else
{
if(gidx+3<cols && gidy<rows)
{
dst[out_addr] = res.x;
dst[out_addr+1] = res.y;
dst[out_addr+2] = res.z;
dst[out_addr+3] = res.w;
}
else if(gidx+2<cols && gidy<rows)
{
dst[out_addr] = res.x;
dst[out_addr+1] = res.y;
dst[out_addr+2] = res.z;
}
else if(gidx+1<cols && gidy<rows)
{
dst[out_addr] = res.x;
dst[out_addr+1] = res.y;
}
else if(gidx<cols && gidy<rows)
{
dst[out_addr] = res.x;
}
}
LDS_DAT[point1] = temp0;
LDS_DAT[point2] = temp1;
barrier(CLK_LOCAL_MEM_FENCE);
uchar4 res = (uchar4)VAL;
for(int i=0;i<2*RADIUSY+1;i++)
for(int j=0;j<2*RADIUSX+1;j++)
{
res =mat_kernel[i*(2*RADIUSX+1)+j]? MORPH_OP(res,vload4(0,(__local uchar*)&LDS_DAT[mad24((l_y+i),width,l_x)]+offset+j)):res;
}
int gidx = get_global_id(0)<<2;
int gidy = get_global_id(1);
int out_addr = mad24(gidy,dst_step_in_pixel,gidx+dst_offset_in_pixel);
if(gidx+3<cols && gidy<rows && (dst_offset_in_pixel&3==0))
{
*(__global uchar4*)&dst[out_addr] = res;
}
else
{
if(gidx+3<cols && gidy<rows)
{
dst[out_addr] = res.x;
dst[out_addr+1] = res.y;
dst[out_addr+2] = res.z;
dst[out_addr+3] = res.w;
}
else if(gidx+2<cols && gidy<rows)
{
dst[out_addr] = res.x;
dst[out_addr+1] = res.y;
dst[out_addr+2] = res.z;
}
else if(gidx+1<cols && gidy<rows)
{
dst[out_addr] = res.x;
dst[out_addr+1] = res.y;
}
else if(gidx<cols && gidy<rows)
{
dst[out_addr] = res.x;
}
}
}
#else
__kernel void morph(__global const GENTYPE * restrict src,
__global GENTYPE *dst,
int src_offset_x, int src_offset_y,
int cols, int rows,
int src_step_in_pixel, int dst_step_in_pixel,
__constant uchar * mat_kernel,
int src_whole_cols, int src_whole_rows,
int dst_offset_in_pixel)
__global GENTYPE *dst,
int src_offset_x, int src_offset_y,
int cols, int rows,
int src_step_in_pixel, int dst_step_in_pixel,
__constant uchar * mat_kernel,
int src_whole_cols, int src_whole_rows,
int dst_offset_in_pixel)
{
int l_x = get_local_id(0);
int l_y = get_local_id(1);
int x = get_group_id(0)*LSIZE0;
int y = get_group_id(1)*LSIZE1;
int start_x = x+src_offset_x-RADIUSX;
int end_x = x + src_offset_x+LSIZE0+RADIUSX;
int width = end_x -start_x+1;
int start_y = y+src_offset_y-RADIUSY;
int point1 = mad24(l_y,LSIZE0,l_x);
int point2 = point1 + LSIZE0*LSIZE1;
int tl_x = point1 % width;
int tl_y = point1 / width;
int tl_x2 = point2 % width;
int tl_y2 = point2 / width;
int cur_x = start_x + tl_x;
int cur_y = start_y + tl_y;
int cur_x2 = start_x + tl_x2;
int cur_y2 = start_y + tl_y2;
int start_addr = mad24(cur_y,src_step_in_pixel,cur_x);
int start_addr2 = mad24(cur_y2,src_step_in_pixel,cur_x2);
GENTYPE temp0,temp1;
__local GENTYPE LDS_DAT[2*LSIZE1*LSIZE0];
int l_x = get_local_id(0);
int l_y = get_local_id(1);
int x = get_group_id(0)*LSIZE0;
int y = get_group_id(1)*LSIZE1;
int start_x = x+src_offset_x-RADIUSX;
int end_x = x + src_offset_x+LSIZE0+RADIUSX;
int width = end_x -start_x+1;
int start_y = y+src_offset_y-RADIUSY;
int point1 = mad24(l_y,LSIZE0,l_x);
int point2 = point1 + LSIZE0*LSIZE1;
int tl_x = point1 % width;
int tl_y = point1 / width;
int tl_x2 = point2 % width;
int tl_y2 = point2 / width;
int cur_x = start_x + tl_x;
int cur_y = start_y + tl_y;
int cur_x2 = start_x + tl_x2;
int cur_y2 = start_y + tl_y2;
int start_addr = mad24(cur_y,src_step_in_pixel,cur_x);
int start_addr2 = mad24(cur_y2,src_step_in_pixel,cur_x2);
GENTYPE temp0,temp1;
__local GENTYPE LDS_DAT[2*LSIZE1*LSIZE0];
int end_addr = mad24(src_whole_rows - 1,src_step_in_pixel,src_whole_cols);
//read pixels from src
start_addr = ((start_addr < end_addr) && (start_addr > 0)) ? start_addr : 0;
start_addr2 = ((start_addr2 < end_addr) && (start_addr2 > 0)) ? start_addr2 : 0;
temp0 = src[start_addr];
temp1 = src[start_addr2];
//judge if read out of boundary
temp0= ELEM(cur_x,0,src_whole_cols,(GENTYPE)VAL,temp0);
temp0= ELEM(cur_y,0,src_whole_rows,(GENTYPE)VAL,temp0);
int end_addr = mad24(src_whole_rows - 1,src_step_in_pixel,src_whole_cols);
//read pixels from src
start_addr = ((start_addr < end_addr) && (start_addr > 0)) ? start_addr : 0;
start_addr2 = ((start_addr2 < end_addr) && (start_addr2 > 0)) ? start_addr2 : 0;
temp0 = src[start_addr];
temp1 = src[start_addr2];
//judge if read out of boundary
temp0= ELEM(cur_x,0,src_whole_cols,(GENTYPE)VAL,temp0);
temp0= ELEM(cur_y,0,src_whole_rows,(GENTYPE)VAL,temp0);
temp1= ELEM(cur_x2,0,src_whole_cols,(GENTYPE)VAL,temp1);
temp1= ELEM(cur_y2,0,src_whole_rows,(GENTYPE)VAL,temp1);
temp1= ELEM(cur_x2,0,src_whole_cols,(GENTYPE)VAL,temp1);
temp1= ELEM(cur_y2,0,src_whole_rows,(GENTYPE)VAL,temp1);
LDS_DAT[point1] = temp0;
LDS_DAT[point2] = temp1;
barrier(CLK_LOCAL_MEM_FENCE);
GENTYPE res = (GENTYPE)VAL;
for(int i=0;i<2*RADIUSY+1;i++)
for(int j=0;j<2*RADIUSX+1;j++)
{
res =mat_kernel[i*(2*RADIUSX+1)+j]? MORPH_OP(res,LDS_DAT[mad24(l_y+i,width,l_x+j)]):res;
}
int gidx = get_global_id(0);
int gidy = get_global_id(1);
int out_addr = mad24(gidy,dst_step_in_pixel,gidx+dst_offset_in_pixel);
if(gidx<cols && gidy<rows)
{
dst[out_addr] = res;
}
LDS_DAT[point1] = temp0;
LDS_DAT[point2] = temp1;
barrier(CLK_LOCAL_MEM_FENCE);
GENTYPE res = (GENTYPE)VAL;
for(int i=0;i<2*RADIUSY+1;i++)
for(int j=0;j<2*RADIUSX+1;j++)
{
res =mat_kernel[i*(2*RADIUSX+1)+j]? MORPH_OP(res,LDS_DAT[mad24(l_y+i,width,l_x+j)]):res;
}
int gidx = get_global_id(0);
int gidy = get_global_id(1);
int out_addr = mad24(gidy,dst_step_in_pixel,gidx+dst_offset_in_pixel);
if(gidx<cols && gidy<rows)
{
dst[out_addr] = res;
}
}
#endif

View File

@@ -46,365 +46,365 @@ typedef float sqsumtype;
typedef struct __attribute__((aligned (128))) GpuHidHaarFeature
{
struct __attribute__((aligned (32)))
{
int p0 __attribute__((aligned (4)));
int p1 __attribute__((aligned (4)));
int p2 __attribute__((aligned (4)));
int p3 __attribute__((aligned (4)));
float weight __attribute__((aligned (4)));
}
rect[CV_HAAR_FEATURE_MAX] __attribute__((aligned (32)));
struct __attribute__((aligned (32)))
{
int p0 __attribute__((aligned (4)));
int p1 __attribute__((aligned (4)));
int p2 __attribute__((aligned (4)));
int p3 __attribute__((aligned (4)));
float weight __attribute__((aligned (4)));
}
rect[CV_HAAR_FEATURE_MAX] __attribute__((aligned (32)));
}
GpuHidHaarFeature;
typedef struct __attribute__((aligned (128) )) GpuHidHaarTreeNode
{
int p[CV_HAAR_FEATURE_MAX][4] __attribute__((aligned (64)));
float weight[CV_HAAR_FEATURE_MAX] /*__attribute__((aligned (16)))*/;
float threshold /*__attribute__((aligned (4)))*/;
float alpha[2] __attribute__((aligned (8)));
int left __attribute__((aligned (4)));
int right __attribute__((aligned (4)));
int p[CV_HAAR_FEATURE_MAX][4] __attribute__((aligned (64)));
float weight[CV_HAAR_FEATURE_MAX] /*__attribute__((aligned (16)))*/;
float threshold /*__attribute__((aligned (4)))*/;
float alpha[2] __attribute__((aligned (8)));
int left __attribute__((aligned (4)));
int right __attribute__((aligned (4)));
}
GpuHidHaarTreeNode;
typedef struct __attribute__((aligned (32))) GpuHidHaarClassifier
{
int count __attribute__((aligned (4)));
GpuHidHaarTreeNode* node __attribute__((aligned (8)));
float* alpha __attribute__((aligned (8)));
int count __attribute__((aligned (4)));
GpuHidHaarTreeNode* node __attribute__((aligned (8)));
float* alpha __attribute__((aligned (8)));
}
GpuHidHaarClassifier;
typedef struct __attribute__((aligned (64))) GpuHidHaarStageClassifier
{
int count __attribute__((aligned (4)));
float threshold __attribute__((aligned (4)));
int two_rects __attribute__((aligned (4)));
int reserved0 __attribute__((aligned (8)));
int reserved1 __attribute__((aligned (8)));
int reserved2 __attribute__((aligned (8)));
int reserved3 __attribute__((aligned (8)));
int count __attribute__((aligned (4)));
float threshold __attribute__((aligned (4)));
int two_rects __attribute__((aligned (4)));
int reserved0 __attribute__((aligned (8)));
int reserved1 __attribute__((aligned (8)));
int reserved2 __attribute__((aligned (8)));
int reserved3 __attribute__((aligned (8)));
}
GpuHidHaarStageClassifier;
typedef struct __attribute__((aligned (64))) GpuHidHaarClassifierCascade
{
int count __attribute__((aligned (4)));
int is_stump_based __attribute__((aligned (4)));
int has_tilted_features __attribute__((aligned (4)));
int is_tree __attribute__((aligned (4)));
int pq0 __attribute__((aligned (4)));
int pq1 __attribute__((aligned (4)));
int pq2 __attribute__((aligned (4)));
int pq3 __attribute__((aligned (4)));
int p0 __attribute__((aligned (4)));
int p1 __attribute__((aligned (4)));
int p2 __attribute__((aligned (4)));
int p3 __attribute__((aligned (4)));
float inv_window_area __attribute__((aligned (4)));
int count __attribute__((aligned (4)));
int is_stump_based __attribute__((aligned (4)));
int has_tilted_features __attribute__((aligned (4)));
int is_tree __attribute__((aligned (4)));
int pq0 __attribute__((aligned (4)));
int pq1 __attribute__((aligned (4)));
int pq2 __attribute__((aligned (4)));
int pq3 __attribute__((aligned (4)));
int p0 __attribute__((aligned (4)));
int p1 __attribute__((aligned (4)));
int p2 __attribute__((aligned (4)));
int p3 __attribute__((aligned (4)));
float inv_window_area __attribute__((aligned (4)));
}GpuHidHaarClassifierCascade;
__kernel void __attribute__((reqd_work_group_size(8,8,1)))gpuRunHaarClassifierCascade(//constant GpuHidHaarClassifierCascade * cascade,
global GpuHidHaarStageClassifier * stagecascadeptr,
global int4 * info,
global GpuHidHaarTreeNode * nodeptr,
global const int * restrict sum1,
global const float * restrict sqsum1,
global int4 * candidate,
const int pixelstep,
const int loopcount,
const int start_stage,
const int split_stage,
const int end_stage,
const int startnode,
const int splitnode,
const int4 p,
const int4 pq,
const float correction
//const int width,
//const int height,
//const int grpnumperline,
//const int totalgrp
)
global GpuHidHaarStageClassifier * stagecascadeptr,
global int4 * info,
global GpuHidHaarTreeNode * nodeptr,
global const int * restrict sum1,
global const float * restrict sqsum1,
global int4 * candidate,
const int pixelstep,
const int loopcount,
const int start_stage,
const int split_stage,
const int end_stage,
const int startnode,
const int splitnode,
const int4 p,
const int4 pq,
const float correction
//const int width,
//const int height,
//const int grpnumperline,
//const int totalgrp
)
{
int grpszx = get_local_size(0);
int grpszy = get_local_size(1);
int grpnumx = get_num_groups(0);
int grpidx = get_group_id(0);
int lclidx = get_local_id(0);
int lclidy = get_local_id(1);
int grpszx = get_local_size(0);
int grpszy = get_local_size(1);
int grpnumx = get_num_groups(0);
int grpidx = get_group_id(0);
int lclidx = get_local_id(0);
int lclidy = get_local_id(1);
int lcl_sz = mul24(grpszx,grpszy);
int lcl_id = mad24(lclidy,grpszx,lclidx);
int lcl_sz = mul24(grpszx,grpszy);
int lcl_id = mad24(lclidy,grpszx,lclidx);
//assume lcl_sz == 256 or 128 or 64
//int lcl_sz_shift = (lcl_sz == 256) ? 8 : 7;
//lcl_sz_shift = (lcl_sz == 64) ? 6 : lcl_sz_shift;
__local int lclshare[1024];
//assume lcl_sz == 256 or 128 or 64
//int lcl_sz_shift = (lcl_sz == 256) ? 8 : 7;
//lcl_sz_shift = (lcl_sz == 64) ? 6 : lcl_sz_shift;
__local int lclshare[1024];
#define OFF 0
__local int* lcldata = lclshare + OFF;//for save win data
__local int* glboutindex = lcldata + 28*28;//for save global out index
__local int* lclcount = glboutindex + 1;//for save the numuber of temp pass pixel
__local int* lcloutindex = lclcount + 1;//for save info of temp pass pixel
__local float* partialsum = (__local float*)(lcloutindex + (lcl_sz<<1));
glboutindex[0]=0;
int outputoff = mul24(grpidx,256);
__local int* lcldata = lclshare + OFF;//for save win data
__local int* glboutindex = lcldata + 28*28;//for save global out index
__local int* lclcount = glboutindex + 1;//for save the numuber of temp pass pixel
__local int* lcloutindex = lclcount + 1;//for save info of temp pass pixel
__local float* partialsum = (__local float*)(lcloutindex + (lcl_sz<<1));
glboutindex[0]=0;
int outputoff = mul24(grpidx,256);
//assume window size is 20X20
//assume window size is 20X20
#define WINDOWSIZE 20+1
//make sure readwidth is the multiple of 4
//ystep =1, from host code
int readwidth = ((grpszx-1 + WINDOWSIZE+3)>>2)<<2;
int readheight = grpszy-1+WINDOWSIZE;
int read_horiz_cnt = readwidth >> 2;//each read int4
int total_read = mul24(read_horiz_cnt,readheight);
int read_loop = (total_read + lcl_sz - 1) >> 6;
candidate[outputoff+(lcl_id<<2)] = (int4)0;
candidate[outputoff+(lcl_id<<2)+1] = (int4)0;
candidate[outputoff+(lcl_id<<2)+2] = (int4)0;
candidate[outputoff+(lcl_id<<2)+3] = (int4)0;
for(int scalei = 0; scalei <loopcount; scalei++)
{
int4 scaleinfo1= info[scalei];
int width = (scaleinfo1.x & 0xffff0000) >> 16;
int height = scaleinfo1.x & 0xffff;
int grpnumperline =(scaleinfo1.y & 0xffff0000) >> 16;
int totalgrp = scaleinfo1.y & 0xffff;
int imgoff = scaleinfo1.z;
float factor = as_float(scaleinfo1.w);
//int ystep =1;// factor > 2.0 ? 1 : 2;
//make sure readwidth is the multiple of 4
//ystep =1, from host code
int readwidth = ((grpszx-1 + WINDOWSIZE+3)>>2)<<2;
int readheight = grpszy-1+WINDOWSIZE;
int read_horiz_cnt = readwidth >> 2;//each read int4
int total_read = mul24(read_horiz_cnt,readheight);
int read_loop = (total_read + lcl_sz - 1) >> 6;
candidate[outputoff+(lcl_id<<2)] = (int4)0;
candidate[outputoff+(lcl_id<<2)+1] = (int4)0;
candidate[outputoff+(lcl_id<<2)+2] = (int4)0;
candidate[outputoff+(lcl_id<<2)+3] = (int4)0;
for(int scalei = 0; scalei <loopcount; scalei++)
{
int4 scaleinfo1= info[scalei];
int width = (scaleinfo1.x & 0xffff0000) >> 16;
int height = scaleinfo1.x & 0xffff;
int grpnumperline =(scaleinfo1.y & 0xffff0000) >> 16;
int totalgrp = scaleinfo1.y & 0xffff;
int imgoff = scaleinfo1.z;
float factor = as_float(scaleinfo1.w);
//int ystep =1;// factor > 2.0 ? 1 : 2;
__global const int * sum = sum1 + imgoff;
__global const float * sqsum = sqsum1 + imgoff;
for(int grploop=grpidx;grploop<totalgrp;grploop+=grpnumx)
{
int grpidy = grploop / grpnumperline;
int grpidx = grploop - mul24(grpidy, grpnumperline);
int x = mad24(grpidx,grpszx,lclidx);
int y = mad24(grpidy,grpszy,lclidy);
//candidate_result.x = convert_int_rtn(x*factor);
//candidate_result.y = convert_int_rtn(y*factor);
int grpoffx = x-lclidx;
int grpoffy = y-lclidy;
__global const int * sum = sum1 + imgoff;
__global const float * sqsum = sqsum1 + imgoff;
for(int grploop=grpidx;grploop<totalgrp;grploop+=grpnumx)
{
int grpidy = grploop / grpnumperline;
int grpidx = grploop - mul24(grpidy, grpnumperline);
int x = mad24(grpidx,grpszx,lclidx);
int y = mad24(grpidy,grpszy,lclidy);
//candidate_result.x = convert_int_rtn(x*factor);
//candidate_result.y = convert_int_rtn(y*factor);
int grpoffx = x-lclidx;
int grpoffy = y-lclidy;
for(int i=0;i<read_loop;i++)
{
int pos_id = mad24(i,lcl_sz,lcl_id);
pos_id = pos_id < total_read ? pos_id : 0;
for(int i=0;i<read_loop;i++)
{
int pos_id = mad24(i,lcl_sz,lcl_id);
pos_id = pos_id < total_read ? pos_id : 0;
int lcl_y = pos_id / read_horiz_cnt;
int lcl_x = pos_id - mul24(lcl_y, read_horiz_cnt);
int lcl_y = pos_id / read_horiz_cnt;
int lcl_x = pos_id - mul24(lcl_y, read_horiz_cnt);
int glb_x = grpoffx + (lcl_x<<2);
int glb_y = grpoffy + lcl_y;
int glb_x = grpoffx + (lcl_x<<2);
int glb_y = grpoffy + lcl_y;
int glb_off = mad24(glb_y,pixelstep,glb_x);
int4 data = *(__global int4*)&sum[glb_off];
int lcl_off = mad24(lcl_y, readwidth, lcl_x<<2);
int glb_off = mad24(glb_y,pixelstep,glb_x);
int4 data = *(__global int4*)&sum[glb_off];
int lcl_off = mad24(lcl_y, readwidth, lcl_x<<2);
lcldata[lcl_off] = data.x;
lcldata[lcl_off+1] = data.y;
lcldata[lcl_off+2] = data.z;
lcldata[lcl_off+3] = data.w;
}
lcldata[lcl_off] = data.x;
lcldata[lcl_off+1] = data.y;
lcldata[lcl_off+2] = data.z;
lcldata[lcl_off+3] = data.w;
}
lcloutindex[lcl_id] = 0;
lclcount[0] = 0;
int result = 1;
int nodecounter= startnode;
float mean, variance_norm_factor;
barrier(CLK_LOCAL_MEM_FENCE);
lcloutindex[lcl_id] = 0;
lclcount[0] = 0;
int result = 1;
int nodecounter= startnode;
float mean, variance_norm_factor;
barrier(CLK_LOCAL_MEM_FENCE);
int lcl_off = mad24(lclidy,readwidth,lclidx);
int4 cascadeinfo1, cascadeinfo2;
cascadeinfo1 = p;
cascadeinfo2 = pq;// + mad24(y, pixelstep, x);
int lcl_off = mad24(lclidy,readwidth,lclidx);
int4 cascadeinfo1, cascadeinfo2;
cascadeinfo1 = p;
cascadeinfo2 = pq;// + mad24(y, pixelstep, x);
//if((x < width) && (y < height))
{
cascadeinfo1.x +=lcl_off;
cascadeinfo1.z +=lcl_off;
mean = (lcldata[mad24(cascadeinfo1.y,readwidth,cascadeinfo1.x)] - lcldata[mad24(cascadeinfo1.y,readwidth,cascadeinfo1.z)] -
lcldata[mad24(cascadeinfo1.w,readwidth,cascadeinfo1.x)] + lcldata[mad24(cascadeinfo1.w,readwidth,cascadeinfo1.z)])
*correction;
//if((x < width) && (y < height))
{
cascadeinfo1.x +=lcl_off;
cascadeinfo1.z +=lcl_off;
mean = (lcldata[mad24(cascadeinfo1.y,readwidth,cascadeinfo1.x)] - lcldata[mad24(cascadeinfo1.y,readwidth,cascadeinfo1.z)] -
lcldata[mad24(cascadeinfo1.w,readwidth,cascadeinfo1.x)] + lcldata[mad24(cascadeinfo1.w,readwidth,cascadeinfo1.z)])
*correction;
int p_offset = mad24(y, pixelstep, x);
int p_offset = mad24(y, pixelstep, x);
cascadeinfo2.x +=p_offset;
cascadeinfo2.z +=p_offset;
variance_norm_factor =sqsum[mad24(cascadeinfo2.y, pixelstep, cascadeinfo2.x)] - sqsum[mad24(cascadeinfo2.y, pixelstep, cascadeinfo2.z)] -
sqsum[mad24(cascadeinfo2.w, pixelstep, cascadeinfo2.x)] + sqsum[mad24(cascadeinfo2.w, pixelstep, cascadeinfo2.z)];
cascadeinfo2.x +=p_offset;
cascadeinfo2.z +=p_offset;
variance_norm_factor =sqsum[mad24(cascadeinfo2.y, pixelstep, cascadeinfo2.x)] - sqsum[mad24(cascadeinfo2.y, pixelstep, cascadeinfo2.z)] -
sqsum[mad24(cascadeinfo2.w, pixelstep, cascadeinfo2.x)] + sqsum[mad24(cascadeinfo2.w, pixelstep, cascadeinfo2.z)];
variance_norm_factor = variance_norm_factor * correction - mean * mean;
variance_norm_factor = variance_norm_factor >=0.f ? sqrt(variance_norm_factor) : 1.f;
//if( cascade->is_stump_based )
//{
for(int stageloop = start_stage; (stageloop < split_stage) && result; stageloop++ )
{
float stage_sum = 0.f;
int2 stageinfo = *(global int2*)(stagecascadeptr+stageloop);
float stagethreshold = as_float(stageinfo.y);
for(int nodeloop = 0; nodeloop < stageinfo.x; nodeloop++ )
{
__global GpuHidHaarTreeNode* currentnodeptr = (nodeptr + nodecounter);
variance_norm_factor = variance_norm_factor * correction - mean * mean;
variance_norm_factor = variance_norm_factor >=0.f ? sqrt(variance_norm_factor) : 1.f;
//if( cascade->is_stump_based )
//{
for(int stageloop = start_stage; (stageloop < split_stage) && result; stageloop++ )
{
float stage_sum = 0.f;
int2 stageinfo = *(global int2*)(stagecascadeptr+stageloop);
float stagethreshold = as_float(stageinfo.y);
for(int nodeloop = 0; nodeloop < stageinfo.x; nodeloop++ )
{
__global GpuHidHaarTreeNode* currentnodeptr = (nodeptr + nodecounter);
int4 info1 = *(__global int4*)(&(currentnodeptr->p[0][0]));
int4 info2 = *(__global int4*)(&(currentnodeptr->p[1][0]));
int4 info3 = *(__global int4*)(&(currentnodeptr->p[2][0]));
float4 w = *(__global float4*)(&(currentnodeptr->weight[0]));
float2 alpha2 = *(__global float2*)(&(currentnodeptr->alpha[0]));
float nodethreshold = w.w * variance_norm_factor;
int4 info1 = *(__global int4*)(&(currentnodeptr->p[0][0]));
int4 info2 = *(__global int4*)(&(currentnodeptr->p[1][0]));
int4 info3 = *(__global int4*)(&(currentnodeptr->p[2][0]));
float4 w = *(__global float4*)(&(currentnodeptr->weight[0]));
float2 alpha2 = *(__global float2*)(&(currentnodeptr->alpha[0]));
float nodethreshold = w.w * variance_norm_factor;
info1.x +=lcl_off;
info1.z +=lcl_off;
info2.x +=lcl_off;
info2.z +=lcl_off;
info1.x +=lcl_off;
info1.z +=lcl_off;
info2.x +=lcl_off;
info2.z +=lcl_off;
float classsum = (lcldata[mad24(info1.y,readwidth,info1.x)] - lcldata[mad24(info1.y,readwidth,info1.z)] -
lcldata[mad24(info1.w,readwidth,info1.x)] + lcldata[mad24(info1.w,readwidth,info1.z)]) * w.x;
float classsum = (lcldata[mad24(info1.y,readwidth,info1.x)] - lcldata[mad24(info1.y,readwidth,info1.z)] -
lcldata[mad24(info1.w,readwidth,info1.x)] + lcldata[mad24(info1.w,readwidth,info1.z)]) * w.x;
classsum += (lcldata[mad24(info2.y,readwidth,info2.x)] - lcldata[mad24(info2.y,readwidth,info2.z)] -
lcldata[mad24(info2.w,readwidth,info2.x)] + lcldata[mad24(info2.w,readwidth,info2.z)]) * w.y;
classsum += (lcldata[mad24(info2.y,readwidth,info2.x)] - lcldata[mad24(info2.y,readwidth,info2.z)] -
lcldata[mad24(info2.w,readwidth,info2.x)] + lcldata[mad24(info2.w,readwidth,info2.z)]) * w.y;
//if((info3.z - info3.x) && (!stageinfo.z))
//{
info3.x +=lcl_off;
info3.z +=lcl_off;
classsum += (lcldata[mad24(info3.y,readwidth,info3.x)] - lcldata[mad24(info3.y,readwidth,info3.z)] -
lcldata[mad24(info3.w,readwidth,info3.x)] + lcldata[mad24(info3.w,readwidth,info3.z)]) * w.z;
//}
stage_sum += classsum >= nodethreshold ? alpha2.y : alpha2.x;
nodecounter++;
}
//if((info3.z - info3.x) && (!stageinfo.z))
//{
info3.x +=lcl_off;
info3.z +=lcl_off;
classsum += (lcldata[mad24(info3.y,readwidth,info3.x)] - lcldata[mad24(info3.y,readwidth,info3.z)] -
lcldata[mad24(info3.w,readwidth,info3.x)] + lcldata[mad24(info3.w,readwidth,info3.z)]) * w.z;
//}
stage_sum += classsum >= nodethreshold ? alpha2.y : alpha2.x;
nodecounter++;
}
result = (stage_sum >= stagethreshold);
}
result = (stage_sum >= stagethreshold);
}
if(result && (x < width) && (y < height))
{
int queueindex = atomic_inc(lclcount);
lcloutindex[queueindex<<1] = (lclidy << 16) | lclidx;
lcloutindex[(queueindex<<1)+1] = as_int(variance_norm_factor);
}
barrier(CLK_LOCAL_MEM_FENCE);
int queuecount = lclcount[0];
nodecounter = splitnode;
for(int stageloop = split_stage; stageloop< end_stage && queuecount>0;stageloop++)
{
//barrier(CLK_LOCAL_MEM_FENCE);
//if(lcl_id == 0)
if(result && (x < width) && (y < height))
{
int queueindex = atomic_inc(lclcount);
lcloutindex[queueindex<<1] = (lclidy << 16) | lclidx;
lcloutindex[(queueindex<<1)+1] = as_int(variance_norm_factor);
}
barrier(CLK_LOCAL_MEM_FENCE);
int queuecount = lclcount[0];
nodecounter = splitnode;
for(int stageloop = split_stage; stageloop< end_stage && queuecount>0;stageloop++)
{
//barrier(CLK_LOCAL_MEM_FENCE);
//if(lcl_id == 0)
lclcount[0]=0;
barrier(CLK_LOCAL_MEM_FENCE);
barrier(CLK_LOCAL_MEM_FENCE);
int2 stageinfo = *(global int2*)(stagecascadeptr+stageloop);
float stagethreshold = as_float(stageinfo.y);
int2 stageinfo = *(global int2*)(stagecascadeptr+stageloop);
float stagethreshold = as_float(stageinfo.y);
int perfscale = queuecount > 4 ? 3 : 2;
int queuecount_loop = (queuecount + (1<<perfscale)-1) >> perfscale;
int lcl_compute_win = lcl_sz >> perfscale;
int lcl_compute_win_id = (lcl_id >>(6-perfscale));
int lcl_loops = (stageinfo.x + lcl_compute_win -1) >> (6-perfscale);
int lcl_compute_id = lcl_id - (lcl_compute_win_id << (6-perfscale));
for(int queueloop=0;queueloop<queuecount_loop/* && lcl_compute_win_id < queuecount*/;queueloop++)
{
float stage_sum = 0.f;
int temp_coord = lcloutindex[lcl_compute_win_id<<1];
float variance_norm_factor = as_float(lcloutindex[(lcl_compute_win_id<<1)+1]);
int queue_pixel = mad24(((temp_coord & (int)0xffff0000)>>16),readwidth,temp_coord & 0xffff);
int perfscale = queuecount > 4 ? 3 : 2;
int queuecount_loop = (queuecount + (1<<perfscale)-1) >> perfscale;
int lcl_compute_win = lcl_sz >> perfscale;
int lcl_compute_win_id = (lcl_id >>(6-perfscale));
int lcl_loops = (stageinfo.x + lcl_compute_win -1) >> (6-perfscale);
int lcl_compute_id = lcl_id - (lcl_compute_win_id << (6-perfscale));
for(int queueloop=0;queueloop<queuecount_loop/* && lcl_compute_win_id < queuecount*/;queueloop++)
{
float stage_sum = 0.f;
int temp_coord = lcloutindex[lcl_compute_win_id<<1];
float variance_norm_factor = as_float(lcloutindex[(lcl_compute_win_id<<1)+1]);
int queue_pixel = mad24(((temp_coord & (int)0xffff0000)>>16),readwidth,temp_coord & 0xffff);
//barrier(CLK_LOCAL_MEM_FENCE);
//barrier(CLK_LOCAL_MEM_FENCE);
if(lcl_compute_win_id < queuecount) {
int tempnodecounter = lcl_compute_id;
float part_sum = 0.f;
for(int lcl_loop=0;lcl_loop<lcl_loops && tempnodecounter<stageinfo.x;lcl_loop++)
{
__global GpuHidHaarTreeNode* currentnodeptr = (nodeptr + nodecounter + tempnodecounter);
float part_sum = 0.f;
for(int lcl_loop=0;lcl_loop<lcl_loops && tempnodecounter<stageinfo.x;lcl_loop++)
{
__global GpuHidHaarTreeNode* currentnodeptr = (nodeptr + nodecounter + tempnodecounter);
int4 info1 = *(__global int4*)(&(currentnodeptr->p[0][0]));
int4 info2 = *(__global int4*)(&(currentnodeptr->p[1][0]));
int4 info3 = *(__global int4*)(&(currentnodeptr->p[2][0]));
float4 w = *(__global float4*)(&(currentnodeptr->weight[0]));
float2 alpha2 = *(__global float2*)(&(currentnodeptr->alpha[0]));
float nodethreshold = w.w * variance_norm_factor;
int4 info1 = *(__global int4*)(&(currentnodeptr->p[0][0]));
int4 info2 = *(__global int4*)(&(currentnodeptr->p[1][0]));
int4 info3 = *(__global int4*)(&(currentnodeptr->p[2][0]));
float4 w = *(__global float4*)(&(currentnodeptr->weight[0]));
float2 alpha2 = *(__global float2*)(&(currentnodeptr->alpha[0]));
float nodethreshold = w.w * variance_norm_factor;
info1.x +=queue_pixel;
info1.z +=queue_pixel;
info2.x +=queue_pixel;
info2.z +=queue_pixel;
info1.x +=queue_pixel;
info1.z +=queue_pixel;
info2.x +=queue_pixel;
info2.z +=queue_pixel;
float classsum = (lcldata[mad24(info1.y,readwidth,info1.x)] - lcldata[mad24(info1.y,readwidth,info1.z)] -
lcldata[mad24(info1.w,readwidth,info1.x)] + lcldata[mad24(info1.w,readwidth,info1.z)]) * w.x;
float classsum = (lcldata[mad24(info1.y,readwidth,info1.x)] - lcldata[mad24(info1.y,readwidth,info1.z)] -
lcldata[mad24(info1.w,readwidth,info1.x)] + lcldata[mad24(info1.w,readwidth,info1.z)]) * w.x;
classsum += (lcldata[mad24(info2.y,readwidth,info2.x)] - lcldata[mad24(info2.y,readwidth,info2.z)] -
lcldata[mad24(info2.w,readwidth,info2.x)] + lcldata[mad24(info2.w,readwidth,info2.z)]) * w.y;
//if((info3.z - info3.x) && (!stageinfo.z))
//{
info3.x +=queue_pixel;
info3.z +=queue_pixel;
classsum += (lcldata[mad24(info3.y,readwidth,info3.x)] - lcldata[mad24(info3.y,readwidth,info3.z)] -
lcldata[mad24(info3.w,readwidth,info3.x)] + lcldata[mad24(info3.w,readwidth,info3.z)]) * w.z;
//}
part_sum += classsum >= nodethreshold ? alpha2.y : alpha2.x;
tempnodecounter +=lcl_compute_win;
}//end for(int lcl_loop=0;lcl_loop<lcl_loops;lcl_loop++)
partialsum[lcl_id]=part_sum;
classsum += (lcldata[mad24(info2.y,readwidth,info2.x)] - lcldata[mad24(info2.y,readwidth,info2.z)] -
lcldata[mad24(info2.w,readwidth,info2.x)] + lcldata[mad24(info2.w,readwidth,info2.z)]) * w.y;
//if((info3.z - info3.x) && (!stageinfo.z))
//{
info3.x +=queue_pixel;
info3.z +=queue_pixel;
classsum += (lcldata[mad24(info3.y,readwidth,info3.x)] - lcldata[mad24(info3.y,readwidth,info3.z)] -
lcldata[mad24(info3.w,readwidth,info3.x)] + lcldata[mad24(info3.w,readwidth,info3.z)]) * w.z;
//}
part_sum += classsum >= nodethreshold ? alpha2.y : alpha2.x;
tempnodecounter +=lcl_compute_win;
}//end for(int lcl_loop=0;lcl_loop<lcl_loops;lcl_loop++)
partialsum[lcl_id]=part_sum;
}
barrier(CLK_LOCAL_MEM_FENCE);
barrier(CLK_LOCAL_MEM_FENCE);
if(lcl_compute_win_id < queuecount) {
for(int i=0;i<lcl_compute_win && (lcl_compute_id==0);i++)
{
stage_sum += partialsum[lcl_id+i];
}
if(stage_sum >= stagethreshold && (lcl_compute_id==0))
{
int queueindex = atomic_inc(lclcount);
lcloutindex[queueindex<<1] = temp_coord;
lcloutindex[(queueindex<<1)+1] = as_int(variance_norm_factor);
}
lcl_compute_win_id +=(1<<perfscale);
for(int i=0;i<lcl_compute_win && (lcl_compute_id==0);i++)
{
stage_sum += partialsum[lcl_id+i];
}
if(stage_sum >= stagethreshold && (lcl_compute_id==0))
{
int queueindex = atomic_inc(lclcount);
lcloutindex[queueindex<<1] = temp_coord;
lcloutindex[(queueindex<<1)+1] = as_int(variance_norm_factor);
}
lcl_compute_win_id +=(1<<perfscale);
}
barrier(CLK_LOCAL_MEM_FENCE);
}//end for(int queueloop=0;queueloop<queuecount_loop;queueloop++)
barrier(CLK_LOCAL_MEM_FENCE);
queuecount = lclcount[0];
nodecounter += stageinfo.x;
}//end for(int stageloop = splitstage; stageloop< endstage && queuecount>0;stageloop++)
//barrier(CLK_LOCAL_MEM_FENCE);
if(lcl_id<queuecount)
{
int temp = lcloutindex[lcl_id<<1];
int x = mad24(grpidx,grpszx,temp & 0xffff);
int y = mad24(grpidy,grpszy,((temp & (int)0xffff0000) >> 16));
temp = glboutindex[0];
int4 candidate_result;
candidate_result.zw = (int2)convert_int_rtn(factor*20.f);
candidate_result.x = convert_int_rtn(x*factor);
candidate_result.y = convert_int_rtn(y*factor);
atomic_inc(glboutindex);
candidate[outputoff+temp+lcl_id] = candidate_result;
}
barrier(CLK_LOCAL_MEM_FENCE);
}//end if((x < width) && (y < height))
}//end for(int grploop=grpidx;grploop<totalgrp;grploop+=grpnumx)
//outputoff +=mul24(width,height);
}//end for(int scalei = 0; scalei <loopcount; scalei++)
barrier(CLK_LOCAL_MEM_FENCE);
}//end for(int queueloop=0;queueloop<queuecount_loop;queueloop++)
barrier(CLK_LOCAL_MEM_FENCE);
queuecount = lclcount[0];
nodecounter += stageinfo.x;
}//end for(int stageloop = splitstage; stageloop< endstage && queuecount>0;stageloop++)
//barrier(CLK_LOCAL_MEM_FENCE);
if(lcl_id<queuecount)
{
int temp = lcloutindex[lcl_id<<1];
int x = mad24(grpidx,grpszx,temp & 0xffff);
int y = mad24(grpidy,grpszy,((temp & (int)0xffff0000) >> 16));
temp = glboutindex[0];
int4 candidate_result;
candidate_result.zw = (int2)convert_int_rtn(factor*20.f);
candidate_result.x = convert_int_rtn(x*factor);
candidate_result.y = convert_int_rtn(y*factor);
atomic_inc(glboutindex);
candidate[outputoff+temp+lcl_id] = candidate_result;
}
barrier(CLK_LOCAL_MEM_FENCE);
}//end if((x < width) && (y < height))
}//end for(int grploop=grpidx;grploop<totalgrp;grploop+=grpnumx)
//outputoff +=mul24(width,height);
}//end for(int scalei = 0; scalei <loopcount; scalei++)
}
@@ -421,7 +421,7 @@ __kernel void __attribute__((reqd_work_group_size(8,8,1)))gpuRunHaarClassifierCa
/*
if(stagecascade->two_rects)
if(stagecascade->two_rects)
{
#pragma unroll
for( n = 0; n < stagecascade->count; n++ )
@@ -429,10 +429,10 @@ __kernel void __attribute__((reqd_work_group_size(8,8,1)))gpuRunHaarClassifierCa
t1 = *(node + counter);
t = t1.threshold * variance_norm_factor;
classsum = calc_sum1(t1,p_offset,0) * t1.weight[0];
classsum += calc_sum1(t1, p_offset,1) * t1.weight[1];
stage_sum += classsum >= t ? t1.alpha[1]:t1.alpha[0];
counter++;
}
}
@@ -444,75 +444,75 @@ __kernel void __attribute__((reqd_work_group_size(8,8,1)))gpuRunHaarClassifierCa
t = node[counter].threshold*variance_norm_factor;
classsum = calc_sum1(node[counter],p_offset,0) * node[counter].weight[0];
classsum += calc_sum1(node[counter],p_offset,1) * node[counter].weight[1];
if( node[counter].p0[2] )
classsum += calc_sum1(node[counter],p_offset,2) * node[counter].weight[2];
stage_sum += classsum >= t ? node[counter].alpha[1]:node[counter].alpha[0];// modify
counter++;
}
}
*/
/*
/*
__kernel void gpuRunHaarClassifierCascade_ScaleWindow(
constant GpuHidHaarClassifierCascade * _cascade,
global GpuHidHaarStageClassifier * stagecascadeptr,
//global GpuHidHaarClassifier * classifierptr,
global GpuHidHaarTreeNode * nodeptr,
global int * sum,
global float * sqsum,
global int * _candidate,
constant GpuHidHaarClassifierCascade * _cascade,
global GpuHidHaarStageClassifier * stagecascadeptr,
//global GpuHidHaarClassifier * classifierptr,
global GpuHidHaarTreeNode * nodeptr,
global int * sum,
global float * sqsum,
global int * _candidate,
int pixel_step,
int cols,
int rows,
int start_stage,
int end_stage,
int cols,
int rows,
int start_stage,
int end_stage,
//int counts,
int nodenum,
int ystep,
int detect_width,
//int detect_height,
int loopcount,
int outputstep)
//float scalefactor)
int nodenum,
int ystep,
int detect_width,
//int detect_height,
int loopcount,
int outputstep)
//float scalefactor)
{
unsigned int x1 = get_global_id(0);
unsigned int y1 = get_global_id(1);
int p_offset;
int m, n;
int result;
int counter;
float mean, variance_norm_factor;
for(int i=0;i<loopcount;i++)
{
constant GpuHidHaarClassifierCascade * cascade = _cascade + i;
global int * candidate = _candidate + i*outputstep;
int window_width = cascade->p1 - cascade->p0;
int window_height = window_width;
result = 1;
counter = 0;
unsigned int x = mul24(x1,ystep);
unsigned int y = mul24(y1,ystep);
if((x < cols - window_width - 1) && (y < rows - window_height -1))
{
global GpuHidHaarStageClassifier *stagecascade = stagecascadeptr +cascade->count*i+ start_stage;
//global GpuHidHaarClassifier *classifier = classifierptr;
global GpuHidHaarTreeNode *node = nodeptr + nodenum*i;
unsigned int x1 = get_global_id(0);
unsigned int y1 = get_global_id(1);
int p_offset;
int m, n;
int result;
int counter;
float mean, variance_norm_factor;
for(int i=0;i<loopcount;i++)
{
constant GpuHidHaarClassifierCascade * cascade = _cascade + i;
global int * candidate = _candidate + i*outputstep;
int window_width = cascade->p1 - cascade->p0;
int window_height = window_width;
result = 1;
counter = 0;
unsigned int x = mul24(x1,ystep);
unsigned int y = mul24(y1,ystep);
if((x < cols - window_width - 1) && (y < rows - window_height -1))
{
global GpuHidHaarStageClassifier *stagecascade = stagecascadeptr +cascade->count*i+ start_stage;
//global GpuHidHaarClassifier *classifier = classifierptr;
global GpuHidHaarTreeNode *node = nodeptr + nodenum*i;
p_offset = mad24(y, pixel_step, x);// modify
p_offset = mad24(y, pixel_step, x);// modify
mean = (*(sum + p_offset + (int)cascade->p0) - *(sum + p_offset + (int)cascade->p1) -
*(sum + p_offset + (int)cascade->p2) + *(sum + p_offset + (int)cascade->p3))
*cascade->inv_window_area;
mean = (*(sum + p_offset + (int)cascade->p0) - *(sum + p_offset + (int)cascade->p1) -
*(sum + p_offset + (int)cascade->p2) + *(sum + p_offset + (int)cascade->p3))
*cascade->inv_window_area;
variance_norm_factor = *(sqsum + p_offset + cascade->p0) - *(sqsum + cascade->p1 + p_offset) -
*(sqsum + p_offset + cascade->p2) + *(sqsum + cascade->p3 + p_offset);
variance_norm_factor = variance_norm_factor * cascade->inv_window_area - mean * mean;
variance_norm_factor = variance_norm_factor >=0.f ? sqrt(variance_norm_factor) : 1;//modify
variance_norm_factor = *(sqsum + p_offset + cascade->p0) - *(sqsum + cascade->p1 + p_offset) -
*(sqsum + p_offset + cascade->p2) + *(sqsum + cascade->p3 + p_offset);
variance_norm_factor = variance_norm_factor * cascade->inv_window_area - mean * mean;
variance_norm_factor = variance_norm_factor >=0.f ? sqrt(variance_norm_factor) : 1;//modify
// if( cascade->is_stump_based )
//{
// if( cascade->is_stump_based )
//{
for( m = start_stage; m < end_stage; m++ )
{
float stage_sum = 0.f;
@@ -532,29 +532,29 @@ __kernel void gpuRunHaarClassifierCascade_ScaleWindow(
stage_sum += classsum >= t ? t1.alpha[1] : t1.alpha[0];// modify
counter++;
}
if (stage_sum < stagecascade->threshold)
{
result = 0;
break;
result = 0;
break;
}
stagecascade++;
}
if(result)
{
candidate[4 * (y1 * detect_width + x1)] = x;
candidate[4 * (y1 * detect_width + x1) + 1] = y;
candidate[4 * (y1 * detect_width + x1)+2] = window_width;
candidate[4 * (y1 * detect_width + x1) + 3] = window_height;
}
//}
}
}
if(result)
{
candidate[4 * (y1 * detect_width + x1)] = x;
candidate[4 * (y1 * detect_width + x1) + 1] = y;
candidate[4 * (y1 * detect_width + x1)+2] = window_width;
candidate[4 * (y1 * detect_width + x1) + 3] = window_height;
}
//}
}
}
}
*/

View File

@@ -50,89 +50,89 @@ typedef int sumtype;
typedef float sqsumtype;
typedef struct __attribute__((aligned (128))) GpuHidHaarFeature
{
struct __attribute__((aligned (32)))
{
int p0 __attribute__((aligned (4)));
int p1 __attribute__((aligned (4)));
int p2 __attribute__((aligned (4)));
int p3 __attribute__((aligned (4)));
float weight __attribute__((aligned (4)));
}
rect[CV_HAAR_FEATURE_MAX] __attribute__((aligned (32)));
struct __attribute__((aligned (32)))
{
int p0 __attribute__((aligned (4)));
int p1 __attribute__((aligned (4)));
int p2 __attribute__((aligned (4)));
int p3 __attribute__((aligned (4)));
float weight __attribute__((aligned (4)));
}
rect[CV_HAAR_FEATURE_MAX] __attribute__((aligned (32)));
}
GpuHidHaarFeature;
typedef struct __attribute__((aligned (128) )) GpuHidHaarTreeNode
{
int p[CV_HAAR_FEATURE_MAX][4] __attribute__((aligned (64)));
float weight[CV_HAAR_FEATURE_MAX] /*__attribute__((aligned (16)))*/;
float threshold /*__attribute__((aligned (4)))*/;
float alpha[2] __attribute__((aligned (8)));
int left __attribute__((aligned (4)));
int right __attribute__((aligned (4)));
int p[CV_HAAR_FEATURE_MAX][4] __attribute__((aligned (64)));
float weight[CV_HAAR_FEATURE_MAX] /*__attribute__((aligned (16)))*/;
float threshold /*__attribute__((aligned (4)))*/;
float alpha[2] __attribute__((aligned (8)));
int left __attribute__((aligned (4)));
int right __attribute__((aligned (4)));
}
GpuHidHaarTreeNode;
typedef struct __attribute__((aligned (32))) GpuHidHaarClassifier
{
int count __attribute__((aligned (4)));
GpuHidHaarTreeNode* node __attribute__((aligned (8)));
float* alpha __attribute__((aligned (8)));
int count __attribute__((aligned (4)));
GpuHidHaarTreeNode* node __attribute__((aligned (8)));
float* alpha __attribute__((aligned (8)));
}
GpuHidHaarClassifier;
typedef struct __attribute__((aligned (64))) GpuHidHaarStageClassifier
{
int count __attribute__((aligned (4)));
float threshold __attribute__((aligned (4)));
int two_rects __attribute__((aligned (4)));
int reserved0 __attribute__((aligned (8)));
int reserved1 __attribute__((aligned (8)));
int reserved2 __attribute__((aligned (8)));
int reserved3 __attribute__((aligned (8)));
int count __attribute__((aligned (4)));
float threshold __attribute__((aligned (4)));
int two_rects __attribute__((aligned (4)));
int reserved0 __attribute__((aligned (8)));
int reserved1 __attribute__((aligned (8)));
int reserved2 __attribute__((aligned (8)));
int reserved3 __attribute__((aligned (8)));
}
GpuHidHaarStageClassifier;
typedef struct __attribute__((aligned (64))) GpuHidHaarClassifierCascade
{
int count __attribute__((aligned (4)));
int is_stump_based __attribute__((aligned (4)));
int has_tilted_features __attribute__((aligned (4)));
int is_tree __attribute__((aligned (4)));
int pq0 __attribute__((aligned (4)));
int pq1 __attribute__((aligned (4)));
int pq2 __attribute__((aligned (4)));
int pq3 __attribute__((aligned (4)));
int p0 __attribute__((aligned (4)));
int p1 __attribute__((aligned (4)));
int p2 __attribute__((aligned (4)));
int p3 __attribute__((aligned (4)));
float inv_window_area __attribute__((aligned (4)));
int count __attribute__((aligned (4)));
int is_stump_based __attribute__((aligned (4)));
int has_tilted_features __attribute__((aligned (4)));
int is_tree __attribute__((aligned (4)));
int pq0 __attribute__((aligned (4)));
int pq1 __attribute__((aligned (4)));
int pq2 __attribute__((aligned (4)));
int pq3 __attribute__((aligned (4)));
int p0 __attribute__((aligned (4)));
int p1 __attribute__((aligned (4)));
int p2 __attribute__((aligned (4)));
int p3 __attribute__((aligned (4)));
float inv_window_area __attribute__((aligned (4)));
}GpuHidHaarClassifierCascade;
__kernel void gpuRunHaarClassifierCascade_scaled2(
global GpuHidHaarStageClassifier * stagecascadeptr,
global int4 * info,
global GpuHidHaarTreeNode * nodeptr,
global const int * restrict sum,
global const float * restrict sqsum,
global int4 * candidate,
const int step,
const int loopcount,
const int start_stage,
global GpuHidHaarStageClassifier * stagecascadeptr,
global int4 * info,
global GpuHidHaarTreeNode * nodeptr,
global const int * restrict sum,
global const float * restrict sqsum,
global int4 * candidate,
const int step,
const int loopcount,
const int start_stage,
const int split_stage,
const int end_stage,
const int startnode,
const int end_stage,
const int startnode,
const int splitnode,
global int4 * p,
//const int4 * pq,
global float * correction,
//const int4 * pq,
global float * correction,
const int nodecount)
{
int grpszx = get_local_size(0);
int grpszy = get_local_size(1);
int grpnumx = get_num_groups(0);
int grpszx = get_local_size(0);
int grpszy = get_local_size(1);
int grpnumx = get_num_groups(0);
int grpidx=get_group_id(0);
int lclidx = get_local_id(0);
int lclidy = get_local_id(1);
int lcl_sz = mul24(grpszx,grpszy);
int lcl_id = mad24(lclidy,grpszx,lclidx);
int lclidx = get_local_id(0);
int lclidy = get_local_id(1);
int lcl_sz = mul24(grpszx,grpszy);
int lcl_id = mad24(lclidy,grpszx,lclidx);
__local int lclshare[1024];
__local int* glboutindex=lclshare+0;
__local int* lclcount=glboutindex+1;
@@ -140,85 +140,85 @@ __kernel void gpuRunHaarClassifierCascade_scaled2(
__local float* partialsum=(__local float*)(lcloutindex+(lcl_sz<<1));
glboutindex[0]=0;
int outputoff = mul24(grpidx,256);
candidate[outputoff+(lcl_id<<2)] = (int4)0;
candidate[outputoff+(lcl_id<<2)+1] = (int4)0;
candidate[outputoff+(lcl_id<<2)+2] = (int4)0;
candidate[outputoff+(lcl_id<<2)+3] = (int4)0;
for(int scalei = 0; scalei <loopcount; scalei++)
{
int4 scaleinfo1;
scaleinfo1 = info[scalei];
int width = (scaleinfo1.x & 0xffff0000) >> 16;
int height = scaleinfo1.x & 0xffff;
int grpnumperline =(scaleinfo1.y & 0xffff0000) >> 16;
int totalgrp = scaleinfo1.y & 0xffff;
float factor = as_float(scaleinfo1.w);
float correction_t=correction[scalei];
int ystep=(int)(max(2.0f,factor)+0.5f);
for(int grploop=get_group_id(0);grploop<totalgrp;grploop+=grpnumx){
int4 cascadeinfo=p[scalei];
int grpidy = grploop / grpnumperline;
int grpidx = grploop - mul24(grpidy, grpnumperline);
int ix = mad24(grpidx,grpszx,lclidx);
int iy = mad24(grpidy,grpszy,lclidy);
candidate[outputoff+(lcl_id<<2)] = (int4)0;
candidate[outputoff+(lcl_id<<2)+1] = (int4)0;
candidate[outputoff+(lcl_id<<2)+2] = (int4)0;
candidate[outputoff+(lcl_id<<2)+3] = (int4)0;
for(int scalei = 0; scalei <loopcount; scalei++)
{
int4 scaleinfo1;
scaleinfo1 = info[scalei];
int width = (scaleinfo1.x & 0xffff0000) >> 16;
int height = scaleinfo1.x & 0xffff;
int grpnumperline =(scaleinfo1.y & 0xffff0000) >> 16;
int totalgrp = scaleinfo1.y & 0xffff;
float factor = as_float(scaleinfo1.w);
float correction_t=correction[scalei];
int ystep=(int)(max(2.0f,factor)+0.5f);
for(int grploop=get_group_id(0);grploop<totalgrp;grploop+=grpnumx){
int4 cascadeinfo=p[scalei];
int grpidy = grploop / grpnumperline;
int grpidx = grploop - mul24(grpidy, grpnumperline);
int ix = mad24(grpidx,grpszx,lclidx);
int iy = mad24(grpidy,grpszy,lclidy);
int x=ix*ystep;
int y=iy*ystep;
lcloutindex[lcl_id]=0;
lclcount[0]=0;
int result=1,nodecounter;
float mean,variance_norm_factor;
//if((ix < width) && (iy < height))
int result=1,nodecounter;
float mean,variance_norm_factor;
//if((ix < width) && (iy < height))
{
const int p_offset = mad24(y, step, x);
cascadeinfo.x +=p_offset;
cascadeinfo.z +=p_offset;
mean = (sum[mad24(cascadeinfo.y,step,cascadeinfo.x)] - sum[mad24(cascadeinfo.y,step,cascadeinfo.z)] -
sum[mad24(cascadeinfo.w,step,cascadeinfo.x)] + sum[mad24(cascadeinfo.w,step,cascadeinfo.z)])
*correction_t;
variance_norm_factor =sqsum[mad24(cascadeinfo.y,step, cascadeinfo.x)] - sqsum[mad24(cascadeinfo.y, step, cascadeinfo.z)] -
sqsum[mad24(cascadeinfo.w, step, cascadeinfo.x)] + sqsum[mad24(cascadeinfo.w, step, cascadeinfo.z)];
variance_norm_factor = variance_norm_factor * correction_t - mean * mean;
variance_norm_factor = variance_norm_factor >=0.f ? sqrt(variance_norm_factor) : 1.f;
result = 1;
nodecounter = startnode+nodecount*scalei;
for(int stageloop = start_stage; stageloop < split_stage&&result; stageloop++ )
{
float stage_sum = 0.f;
int4 stageinfo = *(global int4*)(stagecascadeptr+stageloop);
float stagethreshold = as_float(stageinfo.y);
for(int nodeloop = 0; nodeloop < stageinfo.x; nodeloop++ )
{
__global GpuHidHaarTreeNode* currentnodeptr = (nodeptr + nodecounter);
int4 info1 = *(__global int4*)(&(currentnodeptr->p[0][0]));
int4 info2 = *(__global int4*)(&(currentnodeptr->p[1][0]));
int4 info3 = *(__global int4*)(&(currentnodeptr->p[2][0]));
float4 w = *(__global float4*)(&(currentnodeptr->weight[0]));
float2 alpha2 = *(__global float2*)(&(currentnodeptr->alpha[0]));
const int p_offset = mad24(y, step, x);
cascadeinfo.x +=p_offset;
cascadeinfo.z +=p_offset;
mean = (sum[mad24(cascadeinfo.y,step,cascadeinfo.x)] - sum[mad24(cascadeinfo.y,step,cascadeinfo.z)] -
sum[mad24(cascadeinfo.w,step,cascadeinfo.x)] + sum[mad24(cascadeinfo.w,step,cascadeinfo.z)])
*correction_t;
variance_norm_factor =sqsum[mad24(cascadeinfo.y,step, cascadeinfo.x)] - sqsum[mad24(cascadeinfo.y, step, cascadeinfo.z)] -
sqsum[mad24(cascadeinfo.w, step, cascadeinfo.x)] + sqsum[mad24(cascadeinfo.w, step, cascadeinfo.z)];
variance_norm_factor = variance_norm_factor * correction_t - mean * mean;
variance_norm_factor = variance_norm_factor >=0.f ? sqrt(variance_norm_factor) : 1.f;
result = 1;
nodecounter = startnode+nodecount*scalei;
for(int stageloop = start_stage; stageloop < split_stage&&result; stageloop++ )
{
float stage_sum = 0.f;
int4 stageinfo = *(global int4*)(stagecascadeptr+stageloop);
float stagethreshold = as_float(stageinfo.y);
for(int nodeloop = 0; nodeloop < stageinfo.x; nodeloop++ )
{
__global GpuHidHaarTreeNode* currentnodeptr = (nodeptr + nodecounter);
int4 info1 = *(__global int4*)(&(currentnodeptr->p[0][0]));
int4 info2 = *(__global int4*)(&(currentnodeptr->p[1][0]));
int4 info3 = *(__global int4*)(&(currentnodeptr->p[2][0]));
float4 w = *(__global float4*)(&(currentnodeptr->weight[0]));
float2 alpha2 = *(__global float2*)(&(currentnodeptr->alpha[0]));
float nodethreshold = w.w * variance_norm_factor;
info1.x +=p_offset;
info1.z +=p_offset;
info2.x +=p_offset;
info2.z +=p_offset;
float classsum = (sum[mad24(info1.y,step,info1.x)] - sum[mad24(info1.y,step,info1.z)] -
sum[mad24(info1.w,step,info1.x)] + sum[mad24(info1.w,step,info1.z)]) * w.x;
classsum += (sum[mad24(info2.y,step,info2.x)] - sum[mad24(info2.y,step,info2.z)] -
sum[mad24(info2.w,step,info2.x)] + sum[mad24(info2.w,step,info2.z)]) * w.y;
info3.x +=p_offset;
info3.z +=p_offset;
classsum += (sum[mad24(info3.y,step,info3.x)] - sum[mad24(info3.y,step,info3.z)] -
sum[mad24(info3.w,step,info3.x)] + sum[mad24(info3.w,step,info3.z)]) * w.z;
stage_sum += classsum >= nodethreshold ? alpha2.y : alpha2.x;
nodecounter++;
}
result=(stage_sum>=stagethreshold);
}
if(result&&(ix<width)&&(iy<height))
info1.x +=p_offset;
info1.z +=p_offset;
info2.x +=p_offset;
info2.z +=p_offset;
float classsum = (sum[mad24(info1.y,step,info1.x)] - sum[mad24(info1.y,step,info1.z)] -
sum[mad24(info1.w,step,info1.x)] + sum[mad24(info1.w,step,info1.z)]) * w.x;
classsum += (sum[mad24(info2.y,step,info2.x)] - sum[mad24(info2.y,step,info2.z)] -
sum[mad24(info2.w,step,info2.x)] + sum[mad24(info2.w,step,info2.z)]) * w.y;
info3.x +=p_offset;
info3.z +=p_offset;
classsum += (sum[mad24(info3.y,step,info3.x)] - sum[mad24(info3.y,step,info3.z)] -
sum[mad24(info3.w,step,info3.x)] + sum[mad24(info3.w,step,info3.z)]) * w.z;
stage_sum += classsum >= nodethreshold ? alpha2.y : alpha2.x;
nodecounter++;
}
result=(stage_sum>=stagethreshold);
}
if(result&&(ix<width)&&(iy<height))
{
int queueindex=atomic_inc(lclcount);
lcloutindex[queueindex<<1]=(y<<16)|x;
lcloutindex[(queueindex<<1)+1]=as_int(variance_norm_factor);
}
barrier(CLK_LOCAL_MEM_FENCE);
barrier(CLK_LOCAL_MEM_FENCE);
int queuecount=lclcount[0];
nodecounter=splitnode+nodecount*scalei;
for(int stageloop=split_stage;stageloop<end_stage&&queuecount>0;stageloop++)
@@ -244,34 +244,34 @@ __kernel void gpuRunHaarClassifierCascade_scaled2(
for(int lcl_loop=0;lcl_loop<lcl_loops&&tempnodecounter<stageinfo.x;lcl_loop++)
{
__global GpuHidHaarTreeNode* currentnodeptr = (nodeptr + nodecounter + tempnodecounter);
int4 info1 = *(__global int4*)(&(currentnodeptr->p[0][0]));
int4 info2 = *(__global int4*)(&(currentnodeptr->p[1][0]));
int4 info3 = *(__global int4*)(&(currentnodeptr->p[2][0]));
float4 w = *(__global float4*)(&(currentnodeptr->weight[0]));
float2 alpha2 = *(__global float2*)(&(currentnodeptr->alpha[0]));
float nodethreshold = w.w * variance_norm_factor;
info1.x +=queue_offset;
info1.z +=queue_offset;
info2.x +=queue_offset;
info2.z +=queue_offset;
float classsum = (sum[mad24(info1.y,step,info1.x)] - sum[mad24(info1.y,step,info1.z)] -
sum[mad24(info1.w,step,info1.x)] + sum[mad24(info1.w,step,info1.z)]) * w.x;
classsum += (sum[mad24(info2.y,step,info2.x)] - sum[mad24(info2.y,step,info2.z)] -
sum[mad24(info2.w,step,info2.x)] + sum[mad24(info2.w,step,info2.z)]) * w.y;
info3.x +=queue_offset;
info3.z +=queue_offset;
classsum += (sum[mad24(info3.y,step,info3.x)] - sum[mad24(info3.y,step,info3.z)] -
sum[mad24(info3.w,step,info3.x)] + sum[mad24(info3.w,step,info3.z)]) * w.z;
part_sum += classsum >= nodethreshold ? alpha2.y : alpha2.x;
tempnodecounter+=lcl_compute_win;
int4 info1 = *(__global int4*)(&(currentnodeptr->p[0][0]));
int4 info2 = *(__global int4*)(&(currentnodeptr->p[1][0]));
int4 info3 = *(__global int4*)(&(currentnodeptr->p[2][0]));
float4 w = *(__global float4*)(&(currentnodeptr->weight[0]));
float2 alpha2 = *(__global float2*)(&(currentnodeptr->alpha[0]));
float nodethreshold = w.w * variance_norm_factor;
info1.x +=queue_offset;
info1.z +=queue_offset;
info2.x +=queue_offset;
info2.z +=queue_offset;
float classsum = (sum[mad24(info1.y,step,info1.x)] - sum[mad24(info1.y,step,info1.z)] -
sum[mad24(info1.w,step,info1.x)] + sum[mad24(info1.w,step,info1.z)]) * w.x;
classsum += (sum[mad24(info2.y,step,info2.x)] - sum[mad24(info2.y,step,info2.z)] -
sum[mad24(info2.w,step,info2.x)] + sum[mad24(info2.w,step,info2.z)]) * w.y;
info3.x +=queue_offset;
info3.z +=queue_offset;
classsum += (sum[mad24(info3.y,step,info3.x)] - sum[mad24(info3.y,step,info3.z)] -
sum[mad24(info3.w,step,info3.x)] + sum[mad24(info3.w,step,info3.z)]) * w.z;
part_sum += classsum >= nodethreshold ? alpha2.y : alpha2.x;
tempnodecounter+=lcl_compute_win;
}
partialsum[lcl_id]=part_sum;
barrier(CLK_LOCAL_MEM_FENCE);
barrier(CLK_LOCAL_MEM_FENCE);
for(int i=0;i<lcl_compute_win&&(lcl_compute_id==0);i++)
{
stage_sum+=partialsum[lcl_id+i];
}
}
if(stage_sum>=stagethreshold&&(lcl_compute_id==0))
{
int queueindex=atomic_inc(lclcount);
@@ -298,8 +298,8 @@ __kernel void gpuRunHaarClassifierCascade_scaled2(
candidate[outputoff+temp+lcl_id]=candidate_result;
}
barrier(CLK_LOCAL_MEM_FENCE);
}
}
}
}
}
}
__kernel void gpuscaleclassifier(global GpuHidHaarTreeNode * orinode, global GpuHidHaarTreeNode * newnode,float scale,float weight_scale,int nodenum)

View File

@@ -33,106 +33,106 @@
// the use of this software, even if advised of the possibility of such damage.
__kernel void bilateral_C1_D0(__global uchar *dst,
__global const uchar *src,
const int dst_rows,
const int dst_cols,
const int maxk,
const int radius,
const int dst_step,
const int dst_offset,
const int src_step,
const int src_rows,
const int src_cols,
__constant float *color_weight,
__constant float *space_weight,
__constant int *space_ofs)
{
int gidx = get_global_id(0);
int gidy = get_global_id(1);
if((gidy<dst_rows) && (gidx<dst_cols))
{
int src_addr = mad24(gidy+radius,src_step,gidx+radius);
int dst_addr = mad24(gidy,dst_step,gidx+dst_offset);
float sum = 0.f, wsum = 0.f;
__global const uchar *src,
const int dst_rows,
const int dst_cols,
const int maxk,
const int radius,
const int dst_step,
const int dst_offset,
const int src_step,
const int src_rows,
const int src_cols,
__constant float *color_weight,
__constant float *space_weight,
__constant int *space_ofs)
{
int gidx = get_global_id(0);
int gidy = get_global_id(1);
if((gidy<dst_rows) && (gidx<dst_cols))
{
int src_addr = mad24(gidy+radius,src_step,gidx+radius);
int dst_addr = mad24(gidy,dst_step,gidx+dst_offset);
float sum = 0.f, wsum = 0.f;
int val0 = (int)src[src_addr];
for(int k = 0; k < maxk; k++ )
{
int val = (int)src[src_addr + space_ofs[k]];
float w = space_weight[k]*color_weight[abs(val - val0)];
sum += (float)(val)*w;
wsum += w;
}
dst[dst_addr] = convert_uchar_rtz(sum/wsum+0.5f);
}
int val0 = (int)src[src_addr];
for(int k = 0; k < maxk; k++ )
{
int val = (int)src[src_addr + space_ofs[k]];
float w = space_weight[k]*color_weight[abs(val - val0)];
sum += (float)(val)*w;
wsum += w;
}
dst[dst_addr] = convert_uchar_rtz(sum/wsum+0.5f);
}
}
// Bilateral filter, 8-bit single-channel image, vectorized variant:
// each work-item produces 4 consecutive output pixels (gidx = global id * 4).
//
// dst, dst_step, dst_offset   destination image and its row stride / start offset
// src, src_step               source image; indexed with a +radius shift in both
//                             axes (assumes the caller supplies a border-padded
//                             source — NOTE(review): confirm against host code)
// dst_rows, dst_cols          destination ROI size; also bounds the work-items
// maxk                        number of sample points in the filter window
// radius                      filter radius used for the padded-source offset
// color_weight                LUT indexed by absolute intensity difference
// space_weight                spatial weight of each window sample
// space_ofs                   linear source offset of each window sample
__kernel void bilateral2_C1_D0(__global uchar *dst,
                               __global const uchar *src,
                               const int dst_rows,
                               const int dst_cols,
                               const int maxk,
                               const int radius,
                               const int dst_step,
                               const int dst_offset,
                               const int src_step,
                               const int src_rows,
                               const int src_cols,
                               __constant float *color_weight,
                               __constant float *space_weight,
                               __constant int *space_ofs)
{
    int gidx = get_global_id(0)<<2;   // 4 pixels per work-item
    int gidy = get_global_id(1);
    if((gidy<dst_rows) && (gidx<dst_cols))
    {
        int src_addr = mad24(gidy+radius,src_step,gidx+radius);
        int dst_addr = mad24(gidy,dst_step,gidx+dst_offset);
        float4 sum = (float4)(0.f), wsum = (float4)(0.f);

        int4 val0 = convert_int4(vload4(0,src+src_addr));
        for(int k = 0; k < maxk; k++ )
        {
            int4 val = convert_int4(vload4(0,src+src_addr + space_ofs[k]));
            // per-lane weight = spatial weight * intensity-difference weight
            float4 w = (float4)(space_weight[k])*(float4)(color_weight[abs(val.x - val0.x)],color_weight[abs(val.y - val0.y)],color_weight[abs(val.z - val0.z)],color_weight[abs(val.w - val0.w)]);
            sum += convert_float4(val)*w;
            wsum += w;
        }
        // normalize, round (+0.5 then truncate) and store all 4 lanes at once
        *(__global uchar4*)(dst+dst_addr) = convert_uchar4_rtz(sum/wsum+0.5f);
    }
}
// Bilateral filter, 8-bit 4-channel image: one output pixel per work-item.
// The color weight is looked up with the sum of per-channel absolute
// differences of the first three channels (L1 distance over RGB; the 4th
// channel does not influence the weight but is filtered along).
//
// dst, dst_step, dst_offset   destination image and its row stride / start offset
// src, src_step               source image, indexed with a +radius shift in both
//                             axes (assumes a border-padded source — NOTE(review):
//                             confirm against host code)
// dst_rows, dst_cols          destination ROI size; also bounds the work-items
// maxk                        number of sample points in the filter window
// radius                      filter radius used for the padded-source offset
// color_weight                LUT indexed by summed absolute channel difference
// space_weight                spatial weight of each window sample
// space_ofs                   linear source offset (in uchar4 units) per sample
__kernel void bilateral_C4_D0(__global uchar4 *dst,
                              __global const uchar4 *src,
                              const int dst_rows,
                              const int dst_cols,
                              const int maxk,
                              const int radius,
                              const int dst_step,
                              const int dst_offset,
                              const int src_step,
                              const int src_rows,
                              const int src_cols,
                              __constant float *color_weight,
                              __constant float *space_weight,
                              __constant int *space_ofs)
{
    int gidx = get_global_id(0);
    int gidy = get_global_id(1);
    if((gidy<dst_rows) && (gidx<dst_cols))
    {
        int src_addr = mad24(gidy+radius,src_step,gidx+radius);
        int dst_addr = mad24(gidy,dst_step,gidx+dst_offset);
        float4 sum = (float4)0.f;
        float wsum = 0.f;

        int4 val0 = convert_int4(src[src_addr]);
        for(int k = 0; k < maxk; k++ )
        {
            int4 val = convert_int4(src[src_addr + space_ofs[k]]);
            // scalar weight shared by all channels of this sample
            float w = space_weight[k]*color_weight[abs(val.x - val0.x)+abs(val.y - val0.y)+abs(val.z - val0.z)];
            sum += convert_float4(val)*(float4)w;
            wsum += w;
        }
        // multiply by the reciprocal instead of dividing four times
        wsum=1.f/wsum;
        dst[dst_addr] = convert_uchar4_rtz(sum*(float4)wsum+(float4)0.5f);
    }
}

View File

@@ -53,8 +53,8 @@
//BORDER_REPLICATE: aaaaaa|abcdefgh|hhhhhhh
#define ADDR_L(i, l_edge, r_edge) ((i) < (l_edge) ? (l_edge) : (i))
#define ADDR_R(i, r_edge, addr) ((i) >= (r_edge) ? (r_edge)-1 : (addr))
#define ADDR_H(i, t_edge, b_edge) ((i) < (t_edge) ? (t_edge) :(i))
#define ADDR_B(i, b_edge, addr) ((i) >= (b_edge) ? (b_edge)-1 :(addr))
#define ADDR_H(i, t_edge, b_edge) ((i) < (t_edge) ? (t_edge) :(i))
#define ADDR_B(i, b_edge, addr) ((i) >= (b_edge) ? (b_edge)-1 :(addr))
#endif
#ifdef BORDER_REFLECT
@@ -120,10 +120,10 @@ __kernel void calcHarris(__global const float *Dx,__global const float *Dy, __gl
for(int i=0; i < ksY+1; i++)
{
dx_con = dx_startX+col >= 0 && dx_startX+col < dx_whole_cols && dx_startY+i >= 0 && dx_startY+i < dx_whole_rows;
dx_s = Dx[(dx_startY+i)*(dx_step>>2)+(dx_startX+col)];
dx_s = Dx[(dx_startY+i)*(dx_step>>2)+(dx_startX+col)];
dx_data[i] = dx_con ? dx_s : 0.0;
dy_con = dy_startX+col >= 0 && dy_startX+col < dy_whole_cols && dy_startY+i >= 0 && dy_startY+i < dy_whole_rows;
dy_s = Dy[(dy_startY+i)*(dy_step>>2)+(dy_startX+col)];
dy_s = Dy[(dy_startY+i)*(dy_step>>2)+(dy_startX+col)];
dy_data[i] = dy_con ? dy_s : 0.0;
data[0][i] = dx_data[i] * dx_data[i];
data[1][i] = dx_data[i] * dy_data[i];
@@ -139,7 +139,7 @@ __kernel void calcHarris(__global const float *Dx,__global const float *Dy, __gl
dx_selected_col = ADDR_L(dx_startX+col, 0, dx_whole_cols);
dx_selected_col = ADDR_R(dx_startX+col, dx_whole_cols, dx_selected_col);
dx_data[i] = Dx[dx_selected_row * (dx_step>>2) + dx_selected_col];
int dy_selected_row;
int dy_selected_col;
dy_selected_row = ADDR_H(dy_startY+i, 0, dy_whole_rows);
@@ -147,7 +147,7 @@ __kernel void calcHarris(__global const float *Dx,__global const float *Dy, __gl
dy_selected_col = ADDR_L(dy_startX+col, 0, dy_whole_cols);
dy_selected_col = ADDR_R(dy_startX+col, dy_whole_cols, dy_selected_col);
dy_data[i] = Dy[dy_selected_row * (dy_step>>2) + dy_selected_col];
data[0][i] = dx_data[i] * dx_data[i];
data[1][i] = dx_data[i] * dy_data[i];
data[2][i] = dy_data[i] * dy_data[i];
@@ -189,12 +189,12 @@ __kernel void calcHarris(__global const float *Dx,__global const float *Dy, __gl
if(posX < dst_cols && (posY) < dst_rows)
{
dst[(dst_startY+0) * (dst_step>>2)+ dst_startX + col - anX] =
dst[(dst_startY+0) * (dst_step>>2)+ dst_startX + col - anX] =
tmp_sum[0] * tmp_sum[4] - tmp_sum[2] * tmp_sum[2] - k * (tmp_sum[0] + tmp_sum[4]) * (tmp_sum[0] + tmp_sum[4]);
}
if(posX < dst_cols && (posY + 1) < dst_rows)
{
dst[(dst_startY+1) * (dst_step>>2)+ dst_startX + col - anX] =
dst[(dst_startY+1) * (dst_step>>2)+ dst_startX + col - anX] =
tmp_sum[1] * tmp_sum[5] - tmp_sum[3] * tmp_sum[3] - k * (tmp_sum[1] + tmp_sum[5]) * (tmp_sum[1] + tmp_sum[5]);
}
}

View File

@@ -53,8 +53,8 @@
//BORDER_REPLICATE: aaaaaa|abcdefgh|hhhhhhh
#define ADDR_L(i, l_edge, r_edge) ((i) < (l_edge) ? (l_edge) : (i))
#define ADDR_R(i, r_edge, addr) ((i) >= (r_edge) ? (r_edge)-1 : (addr))
#define ADDR_H(i, t_edge, b_edge) ((i) < (t_edge) ? (t_edge) :(i))
#define ADDR_B(i, b_edge, addr) ((i) >= (b_edge) ? (b_edge)-1 :(addr))
#define ADDR_H(i, t_edge, b_edge) ((i) < (t_edge) ? (t_edge) :(i))
#define ADDR_B(i, b_edge, addr) ((i) >= (b_edge) ? (b_edge)-1 :(addr))
#endif
#ifdef BORDER_REFLECT
@@ -120,10 +120,10 @@ __kernel void calcMinEigenVal(__global const float *Dx,__global const float *Dy,
for(int i=0; i < ksY+1; i++)
{
dx_con = dx_startX+col >= 0 && dx_startX+col < dx_whole_cols && dx_startY+i >= 0 && dx_startY+i < dx_whole_rows;
dx_s = Dx[(dx_startY+i)*(dx_step>>2)+(dx_startX+col)];
dx_s = Dx[(dx_startY+i)*(dx_step>>2)+(dx_startX+col)];
dx_data[i] = dx_con ? dx_s : 0.0;
dy_con = dy_startX+col >= 0 && dy_startX+col < dy_whole_cols && dy_startY+i >= 0 && dy_startY+i < dy_whole_rows;
dy_s = Dy[(dy_startY+i)*(dy_step>>2)+(dy_startX+col)];
dy_s = Dy[(dy_startY+i)*(dy_step>>2)+(dy_startX+col)];
dy_data[i] = dy_con ? dy_s : 0.0;
data[0][i] = dx_data[i] * dx_data[i];
data[1][i] = dx_data[i] * dy_data[i];
@@ -139,7 +139,7 @@ __kernel void calcMinEigenVal(__global const float *Dx,__global const float *Dy,
dx_selected_col = ADDR_L(dx_startX+col, 0, dx_whole_cols);
dx_selected_col = ADDR_R(dx_startX+col, dx_whole_cols, dx_selected_col);
dx_data[i] = Dx[dx_selected_row * (dx_step>>2) + dx_selected_col];
int dy_selected_row;
int dy_selected_col;
dy_selected_row = ADDR_H(dy_startY+i, 0, dy_whole_rows);
@@ -147,7 +147,7 @@ __kernel void calcMinEigenVal(__global const float *Dx,__global const float *Dy,
dy_selected_col = ADDR_L(dy_startX+col, 0, dy_whole_cols);
dy_selected_col = ADDR_R(dy_startX+col, dy_whole_cols, dy_selected_col);
dy_data[i] = Dy[dy_selected_row * (dy_step>>2) + dy_selected_col];
data[0][i] = dx_data[i] * dx_data[i];
data[1][i] = dx_data[i] * dy_data[i];
data[2][i] = dy_data[i] * dy_data[i];

View File

@@ -56,19 +56,19 @@ inline float calc(int x, int y)
{
return (float)abs(x) + abs(y);
}
#endif //
#endif //
// Smoothing perpendicular to the derivative direction with a triangle filter
// only support 3x3 Sobel kernel
// only support 3x3 Sobel kernel
// h (-1) = 1, h (0) = 2, h (1) = 1
// h'(-1) = -1, h'(0) = 0, h'(1) = 1
// thus sobel 2D operator can be calculated as:
// h'(x, y) = h'(x)h(y) for x direction
//
//
// src input 8bit single channel image data
// dx_buf output dx buffer
// dy_buf output dy buffer
__kernel
__kernel
void calcSobelRowPass
(
__global const uchar * src,
@@ -99,11 +99,11 @@ __kernel
__local int smem[16][18];
smem[lidy][lidx + 1] = src[gidx + gidy * src_step + src_offset];
smem[lidy][lidx + 1] = src[gidx + gidy * src_step + src_offset];
if(lidx == 0)
{
smem[lidy][0] = src[max(gidx - 1, 0) + gidy * src_step + src_offset];
smem[lidy][17] = src[min(gidx + 16, cols - 1) + gidy * src_step + src_offset];
smem[lidy][17] = src[min(gidx + 16, cols - 1) + gidy * src_step + src_offset];
}
barrier(CLK_LOCAL_MEM_FENCE);
@@ -122,7 +122,7 @@ __kernel
// calculate the magnitude of the filter pass combining both x and y directions
// This is the buffered version(3x3 sobel)
//
//
// dx_buf dx buffer, calculated from calcSobelRowPass
// dy_buf dy buffer, calculated from calcSobelRowPass
// dx direvitive in x direction output
@@ -169,7 +169,7 @@ __kernel
__local int sdx[18][16];
__local int sdy[18][16];
sdx[lidy + 1][lidx] = dx_buf[gidx + gidy * dx_buf_step + dx_buf_offset];
sdy[lidy + 1][lidx] = dy_buf[gidx + gidy * dy_buf_step + dy_buf_offset];
if(lidy == 0)
@@ -199,7 +199,7 @@ __kernel
// calculate the magnitude of the filter pass combining both x and y directions
// This is the non-buffered version(non-3x3 sobel)
//
//
// dx_buf dx buffer, calculated from calcSobelRowPass
// dy_buf dy buffer, calculated from calcSobelRowPass
// dx direvitive in x direction output
@@ -233,9 +233,9 @@ __kernel
if(gidy < rows && gidx < cols)
{
mag[(gidx + 1) + (gidy + 1) * mag_step + mag_offset] =
mag[(gidx + 1) + (gidy + 1) * mag_step + mag_offset] =
calc(
dx[gidx + gidy * dx_step + dx_offset],
dx[gidx + gidy * dx_step + dx_offset],
dy[gidx + gidy * dy_step + dy_offset]
);
}
@@ -251,7 +251,7 @@ __kernel
// 0 - below low thres, not an edge
// 1 - maybe an edge
// 2 - is an edge, either magnitude is greater than high thres, or
// Given estimates of the image gradients, a search is then carried out
// Given estimates of the image gradients, a search is then carried out
// to determine if the gradient magnitude assumes a local maximum in the gradient direction.
// if the rounded gradient angle is zero degrees (i.e. the edge is in the north-south direction) the point will be considered to be on the edge if its gradient magnitude is greater than the magnitudes in the west and east directions,
// if the rounded gradient angle is 90 degrees (i.e. the edge is in the east-west direction) the point will be considered to be on the edge if its gradient magnitude is greater than the magnitudes in the north and south directions,
@@ -265,7 +265,7 @@ __kernel
void calcMap
(
__global const int * dx,
__global const int * dy,
__global const int * dy,
__global const float * mag,
__global int * map,
int rows,
@@ -362,10 +362,10 @@ __kernel
// non local memory version
__kernel
void calcMap_2
void calcMap_2
(
__global const int * dx,
__global const int * dy,
__global const int * dy,
__global const float * mag,
__global int * map,
int rows,
@@ -444,7 +444,7 @@ __kernel
void calcMap_3
(
__global const int * dx,
__global const int * dy,
__global const int * dy,
__global const float * mag,
__global int * map,
int rows,
@@ -550,9 +550,9 @@ __kernel
//
// If candidate pixel (edge type is 1) has a neighbour pixel (in 3x3 area) with type 2, it is believed to be part of an edge and
// marked as edge. Each thread will iterate for 16 times to connect local edges.
// Candidate pixel being identified as edge will then be tested if there is nearby potiential edge points. If there is, counter will
// Candidate pixel being identified as edge will then be tested if there is nearby potiential edge points. If there is, counter will
// be incremented by 1 and the point location is stored. These potiential candidates will be processed further in next kernel.
//
//
// map raw edge type results calculated from calcMap.
// st the potiential edge points found in this kernel call
// counter the number of potiential edge points
@@ -560,7 +560,7 @@ __kernel
void edgesHysteresisLocal
(
__global int * map,
__global ushort2 * st,
__global ushort2 * st,
volatile __global unsigned int * counter,
int rows,
int cols,
@@ -657,8 +657,8 @@ __kernel
void edgesHysteresisGlobal
(
__global int * map,
__global ushort2 * st1,
__global ushort2 * st2,
__global ushort2 * st1,
__global ushort2 * st2,
volatile __global int * counter,
int rows,
int cols,

View File

@@ -57,24 +57,24 @@
/// CV_32FC1
// Column-wise running (prefix) sum for a CV_32FC1 image: for every column x,
// dst[y][x] = src[0][x] + src[1][x] + ... + src[y][x].
// One work-item walks one full column top to bottom.
//
// src, dst            source / destination float images
// srcCols, srcRows    image size; work-items with x >= srcCols do nothing
// srcStep, dstStep    row strides in BYTES; converted to float elements below
__kernel void columnSum_C1_D5(__global float* src,__global float* dst,int srcCols,int srcRows,int srcStep,int dstStep)
{
    const int x = get_global_id(0);

    // steps arrive in bytes; >>2 converts them to float-element strides
    srcStep >>= 2;
    dstStep >>= 2;

    if (x < srcCols)
    {
        int srcIdx = x;
        int dstIdx = x;

        float sum = 0;

        for (int y = 0; y < srcRows; ++y)
        {
            sum += src[srcIdx];
            dst[dstIdx] = sum;
            srcIdx += srcStep;
            dstIdx += dstStep;
        }
    }
}

View File

@@ -53,7 +53,7 @@ __kernel void convolve_D5 (__global float *src, __global float *temp1, __global
int rows, int cols, int src_step, int dst_step,int k_step, int kWidth, int kHeight)
{
__local float smem[16 + 2 * 8][16 + 2 * 8];
int x = get_local_id(0);
int y = get_local_id(1);
int gx = get_global_id(0);
@@ -92,7 +92,7 @@ __kernel void convolve_D5 (__global float *src, __global float *temp1, __global
smem[y + 16][x + 16] = src[min(gy + 8, rows - 1)*(src_step >> 2) + min(gx + 8, cols - 1)];
barrier(CLK_LOCAL_MEM_FENCE);
if (gx < cols && gy < rows)
{
float res = 0;

View File

@@ -65,136 +65,136 @@
#endif
// Generic copyMakeBorder kernel (GENTYPE is defined at build time by the host).
// Copies the source ROI into the destination shifted by (left, top) and fills
// the border area either with the constant `val` (BORDER_CONSTANT) or by
// clamping/reflecting coordinates via the ADDR_L/ADDR_R macros selected by the
// border-type compile flag.
//
// src/dst *_step_in_pixel, *_offset_in_pixel   strides and start offsets,
//                                              expressed in GENTYPE elements
// dst_cols/rows, src_cols/rows                 ROI sizes
// top, left                                    border widths above/left of src
// val                                          fill value for BORDER_CONSTANT
__kernel void copymakeborder
                        (__global const GENTYPE *src,
                         __global GENTYPE *dst,
                         const int dst_cols,
                         const int dst_rows,
                         const int src_cols,
                         const int src_rows,
                         const int src_step_in_pixel,
                         const int src_offset_in_pixel,
                         const int dst_step_in_pixel,
                         const int dst_offset_in_pixel,
                         const int top,
                         const int left,
                         const GENTYPE val
                         )
{
    int x = get_global_id(0);
    int y = get_global_id(1);
    int src_x = x-left;
    int src_y = y-top;
    int src_addr = mad24(src_y,src_step_in_pixel,src_x+src_offset_in_pixel);
    int dst_addr = mad24(y,dst_step_in_pixel,x+dst_offset_in_pixel);
    // inside the source ROI: straight copy
    int con = (src_x >= 0) && (src_x < src_cols) && (src_y >= 0) && (src_y < src_rows);
    if(con)
    {
        dst[dst_addr] = src[src_addr];
    }
    else
    {
#ifdef BORDER_CONSTANT
        //write the result to dst
        if((x<dst_cols) && (y<dst_rows))
        {
            dst[dst_addr] = val;
        }
#else
        int s_x,s_y;
        //judge if read out of boundary
        s_x= ADDR_L(src_x,0,src_cols,src_x);
        s_x= ADDR_R(src_x,src_cols,s_x);
        s_y= ADDR_L(src_y,0,src_rows,src_y);
        s_y= ADDR_R(src_y,src_rows,s_y);
        src_addr=mad24(s_y,src_step_in_pixel,s_x+src_offset_in_pixel);
        //write the result to dst
        if((x<dst_cols) && (y<dst_rows))
        {
            dst[dst_addr] = src[src_addr];
        }
#endif
    }
}
// copyMakeBorder specialization for 8-bit single-channel images, vectorized:
// each work-item handles 4 consecutive pixels (x = global id * 4) with uchar4
// loads/stores. Fully-interior quads take the fast vload4/store path; quads
// that straddle the source boundary are assembled lane by lane.
//
// Parameters mirror the generic kernel; steps/offsets are in uchar elements,
// `val` is the BORDER_CONSTANT fill value.
__kernel void copymakeborder_C1_D0
                        (__global const uchar *src,
                         __global uchar *dst,
                         const int dst_cols,
                         const int dst_rows,
                         const int src_cols,
                         const int src_rows,
                         const int src_step_in_pixel,
                         const int src_offset_in_pixel,
                         const int dst_step_in_pixel,
                         const int dst_offset_in_pixel,
                         const int top,
                         const int left,
                         const uchar val
                         )
{
    int x = get_global_id(0)<<2;   // 4 pixels per work-item
    int y = get_global_id(1);
    int src_x = x-left;
    int src_y = y-top;
    int src_addr = mad24(src_y,src_step_in_pixel,src_x+src_offset_in_pixel);
    int dst_addr = mad24(y,dst_step_in_pixel,x+dst_offset_in_pixel);
    // whole 4-pixel quad inside the source ROI: vectorized copy
    int con = (src_x >= 0) && (src_x+3 < src_cols) && (src_y >= 0) && (src_y < src_rows);
    if(con)
    {
        uchar4 tmp = vload4(0,src+src_addr);
        *(__global uchar4*)(dst+dst_addr) = tmp;
    }
    else
    {
#ifdef BORDER_CONSTANT
        // quad straddles the left or right source edge on a valid row:
        // fetch valid lanes from src, replace out-of-range lanes with val
        if((((src_x<0) && (src_x+3>=0))||(src_x < src_cols) && (src_x+3 >= src_cols)) && (src_y >= 0) && (src_y < src_rows))
        {
            int4 addr;
            uchar4 tmp;
            // out-of-range lanes read address 0 (value discarded below)
            addr.x = ((src_x < 0) || (src_x>= src_cols)) ? 0 : src_addr;
            addr.y = ((src_x+1 < 0) || (src_x+1>= src_cols)) ? 0 : (src_addr+1);
            addr.z = ((src_x+2 < 0) || (src_x+2>= src_cols)) ? 0 : (src_addr+2);
            addr.w = ((src_x+3 < 0) || (src_x+3>= src_cols)) ? 0 : (src_addr+3);
            tmp.x = src[addr.x];
            tmp.y = src[addr.y];
            tmp.z = src[addr.z];
            tmp.w = src[addr.w];
            tmp.x = (src_x >=0)&&(src_x < src_cols) ? tmp.x : val;
            tmp.y = (src_x+1 >=0)&&(src_x +1 < src_cols) ? tmp.y : val;
            tmp.z = (src_x+2 >=0)&&(src_x +2 < src_cols) ? tmp.z : val;
            tmp.w = (src_x+3 >=0)&&(src_x +3 < src_cols) ? tmp.w : val;
            *(__global uchar4*)(dst+dst_addr) = tmp;
        }
        else if((x<dst_cols) && (y<dst_rows))
        {
            // quad entirely outside the source: constant fill
            *(__global uchar4*)(dst+dst_addr) = (uchar4)val;
        }
#else
        int4 s_x;
        int s_y;
        //judge if read out of boundary
        s_x.x= ADDR_L(src_x,0,src_cols,src_x);
        s_x.y= ADDR_L(src_x+1,0,src_cols,src_x+1);
        s_x.z= ADDR_L(src_x+2,0,src_cols,src_x+2);
        s_x.w= ADDR_L(src_x+3,0,src_cols,src_x+3);
        s_x.x= ADDR_R(src_x,src_cols,s_x.x);
        s_x.y= ADDR_R(src_x+1,src_cols,s_x.y);
        s_x.z= ADDR_R(src_x+2,src_cols,s_x.z);
        s_x.w= ADDR_R(src_x+3,src_cols,s_x.w);
        s_y= ADDR_L(src_y,0,src_rows,src_y);
        s_y= ADDR_R(src_y,src_rows,s_y);
        int4 src_addr4=mad24((int4)s_y,(int4)src_step_in_pixel,s_x+(int4)src_offset_in_pixel);
        //write the result to dst
        if((x<dst_cols) && (y<dst_rows))
        {
            uchar4 tmp;
            tmp.x = src[src_addr4.x];
            tmp.y = src[src_addr4.y];
            tmp.z = src[src_addr4.z];
            tmp.w = src[src_addr4.w];
            *(__global uchar4*)(dst+dst_addr) = tmp;
        }
#endif
    }
}

View File

@@ -34,7 +34,7 @@
// the use of this software, even if advised of the possibility of such damage.
//
//
#define PARTIAL_HISTOGRAM256_COUNT (256)
#define PARTIAL_HISTOGRAM256_COUNT (256)
#define HISTOGRAM256_BIN_COUNT (256)
#define HISTOGRAM256_WORK_GROUP_SIZE (256)
@@ -45,12 +45,12 @@
__kernel __attribute__((reqd_work_group_size(HISTOGRAM256_BIN_COUNT,1,1)))void calc_sub_hist_D0(
__global const uint4* src,
int src_step, int src_offset,
__global const uint4* src,
int src_step, int src_offset,
__global int* globalHist,
int dataCount, int cols,
int inc_x, int inc_y,
int hist_step)
int dataCount, int cols,
int inc_x, int inc_y,
int hist_step)
{
__local int subhist[(HISTOGRAM256_BIN_COUNT << NBANKS_BIT)]; // NBINS*NBANKS
int gid = get_global_id(0);
@@ -63,7 +63,7 @@ __kernel __attribute__((reqd_work_group_size(HISTOGRAM256_BIN_COUNT,1,1)))void c
int offset = (lid & (NBANKS-1));// lid % NBANKS
uint4 data, temp1, temp2, temp3, temp4;
src += src_offset;
//clear LDS
for(int i=0, idx=lid; i<(NBANKS >> 2); i++, idx += lsize)
{
@@ -73,7 +73,7 @@ __kernel __attribute__((reqd_work_group_size(HISTOGRAM256_BIN_COUNT,1,1)))void c
subhist[idx+=lsize] = 0;
}
barrier(CLK_LOCAL_MEM_FENCE);
//read and scatter
int y = gid/cols;
int x = gid - mul24(y, cols);
@@ -87,35 +87,35 @@ __kernel __attribute__((reqd_work_group_size(HISTOGRAM256_BIN_COUNT,1,1)))void c
temp3 = ((data & mask) << NBANKS_BIT) + offset;
data >>= shift;
temp4 = ((data & mask) << NBANKS_BIT) + offset;
atomic_inc(subhist + temp1.x);
atomic_inc(subhist + temp1.y);
atomic_inc(subhist + temp1.z);
atomic_inc(subhist + temp1.w);
atomic_inc(subhist + temp2.x);
atomic_inc(subhist + temp2.y);
atomic_inc(subhist + temp2.z);
atomic_inc(subhist + temp2.w);
atomic_inc(subhist + temp3.x);
atomic_inc(subhist + temp3.y);
atomic_inc(subhist + temp3.z);
atomic_inc(subhist + temp3.w);
atomic_inc(subhist + temp4.x);
atomic_inc(subhist + temp4.y);
atomic_inc(subhist + temp4.z);
atomic_inc(subhist + temp1.x);
atomic_inc(subhist + temp1.y);
atomic_inc(subhist + temp1.z);
atomic_inc(subhist + temp1.w);
atomic_inc(subhist + temp2.x);
atomic_inc(subhist + temp2.y);
atomic_inc(subhist + temp2.z);
atomic_inc(subhist + temp2.w);
atomic_inc(subhist + temp3.x);
atomic_inc(subhist + temp3.y);
atomic_inc(subhist + temp3.z);
atomic_inc(subhist + temp3.w);
atomic_inc(subhist + temp4.x);
atomic_inc(subhist + temp4.y);
atomic_inc(subhist + temp4.z);
atomic_inc(subhist + temp4.w);
x += inc_x;
int off = ((x>=cols) ? -1 : 0);
x = mad24(off, cols, x);
y += inc_y - off;
}
barrier(CLK_LOCAL_MEM_FENCE);
//reduce local banks to single histogram per workgroup
//reduce local banks to single histogram per workgroup
int bin1=0, bin2=0, bin3=0, bin4=0;
for(int i=0; i<NBANKS; i+=4)
{
@@ -124,19 +124,19 @@ __kernel __attribute__((reqd_work_group_size(HISTOGRAM256_BIN_COUNT,1,1)))void c
bin3 += subhist[(lid << NBANKS_BIT) + i+2];
bin4 += subhist[(lid << NBANKS_BIT) + i+3];
}
globalHist[mad24(gx, hist_step, lid)] = bin1+bin2+bin3+bin4;
}
__kernel void __attribute__((reqd_work_group_size(1,HISTOGRAM256_BIN_COUNT,1)))calc_sub_hist_border_D0(
__global const uchar* src,
int src_step, int src_offset,
__global const uchar* src,
int src_step, int src_offset,
__global int* globalHist,
int left_col, int cols,
int rows, int hist_step)
int rows, int hist_step)
{
int gidx = get_global_id(0);
int gidy = get_global_id(1);
int gidx = get_global_id(0);
int gidy = get_global_id(1);
int lidy = get_local_id(1);
int gx = get_group_id(0);
int gy = get_group_id(1);
@@ -160,9 +160,9 @@ __kernel void __attribute__((reqd_work_group_size(1,HISTOGRAM256_BIN_COUNT,1)))c
globalHist[mad24(rowIndex, hist_step, lidy)] += subhist[lidy];
}
__kernel __attribute__((reqd_work_group_size(256,1,1)))void merge_hist(__global int* buf,
__global int* hist,
int src_step)
__kernel __attribute__((reqd_work_group_size(256,1,1)))void merge_hist(__global int* buf,
__global int* hist,
int src_step)
{
int lx = get_local_id(0);
int gx = get_group_id(0);
@@ -183,83 +183,83 @@ __kernel __attribute__((reqd_work_group_size(256,1,1)))void merge_hist(__global
}
if(lx == 0)
hist[gx] = data[0];
hist[gx] = data[0];
}
// Build the histogram-equalization LUT from a 256-bin histogram:
// computes the cumulative histogram in local memory (serially, by lane 0),
// then writes dst[i] = sat_cast(cdf[i] * scale) with dst[0] forced to 0.
// Must be launched with exactly one 256-wide work-group (see reqd_work_group_size).
//
// dst    256-entry output LUT (uchar)
// hist   256-bin input histogram
// scale  normalization factor applied to the CDF
__kernel __attribute__((reqd_work_group_size(256,1,1)))void calLUT(
        __global uchar * dst,
        __constant int * hist,
        float scale)
{
    int lid = get_local_id(0);
    __local int sumhist[HISTOGRAM256_BIN_COUNT];

    sumhist[lid]=hist[lid];
    barrier(CLK_LOCAL_MEM_FENCE);
    // lane 0 turns the histogram into its inclusive prefix sum (CDF)
    if(lid==0)
    {
        int sum = 0;
        for(int i=0;i<HISTOGRAM256_BIN_COUNT;i++)
        {
            sum+=sumhist[i];
            sumhist[i]=sum;
        }
    }
    barrier(CLK_LOCAL_MEM_FENCE);
    dst[lid]= lid == 0 ? 0 : convert_uchar_sat(convert_float(sumhist[lid])*scale);
}
/*
///////////////////////////////equalizeHist//////////////////////////////////////////////////
__kernel __attribute__((reqd_work_group_size(256,1,1)))void equalizeHist(
__global uchar * src,
__global uchar * dst,
__constant int * hist,
int srcstep,
int srcoffset,
int dststep,
int dstoffset,
int width,
int height,
float scale,
int inc_x,
int inc_y)
__global uchar * src,
__global uchar * dst,
__constant int * hist,
int srcstep,
int srcoffset,
int dststep,
int dstoffset,
int width,
int height,
float scale,
int inc_x,
int inc_y)
{
int gidx = get_global_id(0);
int lid = get_local_id(0);
int glb_size = get_global_size(0);
src+=srcoffset;
dst+=dstoffset;
__local int sumhist[HISTOGRAM256_BIN_COUNT];
__local uchar lut[HISTOGRAM256_BIN_COUNT+1];
int gidx = get_global_id(0);
int lid = get_local_id(0);
int glb_size = get_global_size(0);
src+=srcoffset;
dst+=dstoffset;
__local int sumhist[HISTOGRAM256_BIN_COUNT];
__local uchar lut[HISTOGRAM256_BIN_COUNT+1];
sumhist[lid]=hist[lid];
barrier(CLK_LOCAL_MEM_FENCE);
if(lid==0)
{
int sum = 0;
for(int i=0;i<HISTOGRAM256_BIN_COUNT;i++)
{
sum+=sumhist[i];
sumhist[i]=sum;
}
}
barrier(CLK_LOCAL_MEM_FENCE);
lut[lid]= convert_uchar_sat(convert_float(sumhist[lid])*scale);
lut[0]=0;
sumhist[lid]=hist[lid];
barrier(CLK_LOCAL_MEM_FENCE);
if(lid==0)
{
int sum = 0;
for(int i=0;i<HISTOGRAM256_BIN_COUNT;i++)
{
sum+=sumhist[i];
sumhist[i]=sum;
}
}
barrier(CLK_LOCAL_MEM_FENCE);
lut[lid]= convert_uchar_sat(convert_float(sumhist[lid])*scale);
lut[0]=0;
int pos_y = gidx / width;
int pos_x = gidx - mul24(pos_y, width);
for(int pos = gidx; pos < mul24(width,height); pos += glb_size)
{
int inaddr = mad24(pos_y,srcstep,pos_x);
int outaddr = mad24(pos_y,dststep,pos_x);
dst[outaddr] = lut[src[inaddr]];
pos_x +=inc_x;
int off = (pos_x >= width ? -1 : 0);
pos_x = mad24(off,width,pos_x);
pos_y += inc_y - off;
}
{
int inaddr = mad24(pos_y,srcstep,pos_x);
int outaddr = mad24(pos_y,dststep,pos_x);
dst[outaddr] = lut[src[inaddr]];
pos_x +=inc_x;
int off = (pos_x >= width ? -1 : 0);
pos_x = mad24(off,width,pos_x);
pos_y += inc_y - off;
}
}
*/

View File

@@ -73,27 +73,27 @@ kernel void integral_cols(__global uchar4 *src,__global int *sum ,__global float
{
src_t[0] = (i + lid < rows ? convert_int4(src[src_offset + (lid+i) * src_step + gid]) : 0);
src_t[1] = (i + lid < rows ? convert_int4(src[src_offset + (lid+i) * src_step + gid + 1]) : 0);
sum_t[0] = (i == 0 ? 0 : lm_sum[0][LSIZE_2 + LOG_LSIZE]);
sqsum_t[0] = (i == 0 ? 0 : lm_sqsum[0][LSIZE_2 + LOG_LSIZE]);
sum_t[1] = (i == 0 ? 0 : lm_sum[1][LSIZE_2 + LOG_LSIZE]);
sqsum_t[1] = (i == 0 ? 0 : lm_sqsum[1][LSIZE_2 + LOG_LSIZE]);
barrier(CLK_LOCAL_MEM_FENCE);
int bf_loc = lid + GET_CONFLICT_OFFSET(lid);
lm_sum[0][bf_loc] = src_t[0];
lm_sqsum[0][bf_loc] = convert_float4(src_t[0] * src_t[0]);
lm_sum[1][bf_loc] = src_t[1];
lm_sqsum[1][bf_loc] = convert_float4(src_t[1] * src_t[1]);
int offset = 1;
for(int d = LSIZE >> 1 ; d > 0; d>>=1)
{
barrier(CLK_LOCAL_MEM_FENCE);
int ai = offset * (((lid & 127)<<1) +1) - 1,bi = ai + offset;
ai += GET_CONFLICT_OFFSET(ai);
bi += GET_CONFLICT_OFFSET(bi);
ai += GET_CONFLICT_OFFSET(ai);
bi += GET_CONFLICT_OFFSET(bi);
if((lid & 127) < d)
{
@@ -102,7 +102,7 @@ kernel void integral_cols(__global uchar4 *src,__global int *sum ,__global float
}
offset <<= 1;
}
barrier(CLK_LOCAL_MEM_FENCE);
barrier(CLK_LOCAL_MEM_FENCE);
if(lid < 2)
{
lm_sum[lid][LSIZE_2 + LOG_LSIZE] = 0;
@@ -113,23 +113,23 @@ kernel void integral_cols(__global uchar4 *src,__global int *sum ,__global float
barrier(CLK_LOCAL_MEM_FENCE);
offset >>= 1;
int ai = offset * (((lid & 127)<<1) +1) - 1,bi = ai + offset;
ai += GET_CONFLICT_OFFSET(ai);
bi += GET_CONFLICT_OFFSET(bi);
ai += GET_CONFLICT_OFFSET(ai);
bi += GET_CONFLICT_OFFSET(bi);
if((lid & 127) < d)
{
lm_sum[lid >> 7][bi] += lm_sum[lid >> 7][ai];
lm_sum[lid >> 7][ai] = lm_sum[lid >> 7][bi] - lm_sum[lid >> 7][ai];
lm_sqsum[lid >> 7][bi] += lm_sqsum[lid >> 7][ai];
lm_sqsum[lid >> 7][ai] = lm_sqsum[lid >> 7][bi] - lm_sqsum[lid >> 7][ai];
}
}
barrier(CLK_LOCAL_MEM_FENCE);
barrier(CLK_LOCAL_MEM_FENCE);
int loc_s0 = gid * dst_step + i + lid - 1 - pre_invalid * dst_step / 4, loc_s1 = loc_s0 + dst_step ;
if(lid > 0 && (i+lid) <= rows){
lm_sum[0][bf_loc] += sum_t[0];
lm_sum[1][bf_loc] += sum_t[1];
lm_sum[0][bf_loc] += sum_t[0];
lm_sum[1][bf_loc] += sum_t[1];
lm_sqsum[0][bf_loc] += sqsum_t[0];
lm_sqsum[1][bf_loc] += sqsum_t[1];
sum_p = (__local int*)(&(lm_sum[0][bf_loc]));
@@ -139,7 +139,7 @@ kernel void integral_cols(__global uchar4 *src,__global int *sum ,__global float
if(gid * 4 + k >= cols + pre_invalid || gid * 4 + k < pre_invalid) continue;
sum[loc_s0 + k * dst_step / 4] = sum_p[k];
sqsum[loc_s0 + k * dst_step / 4] = sqsum_p[k];
}
}
sum_p = (__local int*)(&(lm_sum[1][bf_loc]));
sqsum_p = (__local float*)(&(lm_sqsum[1][bf_loc]));
for(int k = 0; k < 4; k++)
@@ -147,7 +147,7 @@ kernel void integral_cols(__global uchar4 *src,__global int *sum ,__global float
if(gid * 4 + k + 4 >= cols + pre_invalid) break;
sum[loc_s1 + k * dst_step / 4] = sum_p[k];
sqsum[loc_s1 + k * dst_step / 4] = sqsum_p[k];
}
}
}
barrier(CLK_LOCAL_MEM_FENCE);
}
@@ -173,27 +173,27 @@ kernel void integral_rows(__global int4 *srcsum,__global float4 * srcsqsum,__glo
sqsrc_t[0] = i + lid < rows ? srcsqsum[(lid+i) * src_step + gid * 2] : 0;
src_t[1] = i + lid < rows ? srcsum[(lid+i) * src_step + gid * 2 + 1] : 0;
sqsrc_t[1] = i + lid < rows ? srcsqsum[(lid+i) * src_step + gid * 2 + 1] : 0;
sum_t[0] = (i == 0 ? 0 : lm_sum[0][LSIZE_2 + LOG_LSIZE]);
sqsum_t[0] = (i == 0 ? 0 : lm_sqsum[0][LSIZE_2 + LOG_LSIZE]);
sum_t[1] = (i == 0 ? 0 : lm_sum[1][LSIZE_2 + LOG_LSIZE]);
sqsum_t[1] = (i == 0 ? 0 : lm_sqsum[1][LSIZE_2 + LOG_LSIZE]);
barrier(CLK_LOCAL_MEM_FENCE);
int bf_loc = lid + GET_CONFLICT_OFFSET(lid);
lm_sum[0][bf_loc] = src_t[0];
lm_sqsum[0][bf_loc] = sqsrc_t[0];
lm_sum[1][bf_loc] = src_t[1];
lm_sqsum[1][bf_loc] = sqsrc_t[1];
int offset = 1;
for(int d = LSIZE >> 1 ; d > 0; d>>=1)
{
barrier(CLK_LOCAL_MEM_FENCE);
int ai = offset * (((lid & 127)<<1) +1) - 1,bi = ai + offset;
ai += GET_CONFLICT_OFFSET(ai);
bi += GET_CONFLICT_OFFSET(bi);
ai += GET_CONFLICT_OFFSET(ai);
bi += GET_CONFLICT_OFFSET(bi);
if((lid & 127) < d)
{
@@ -202,7 +202,7 @@ kernel void integral_rows(__global int4 *srcsum,__global float4 * srcsqsum,__glo
}
offset <<= 1;
}
barrier(CLK_LOCAL_MEM_FENCE);
barrier(CLK_LOCAL_MEM_FENCE);
if(lid < 2)
{
lm_sum[lid][LSIZE_2 + LOG_LSIZE] = 0;
@@ -213,14 +213,14 @@ kernel void integral_rows(__global int4 *srcsum,__global float4 * srcsqsum,__glo
barrier(CLK_LOCAL_MEM_FENCE);
offset >>= 1;
int ai = offset * (((lid & 127)<<1) +1) - 1,bi = ai + offset;
ai += GET_CONFLICT_OFFSET(ai);
bi += GET_CONFLICT_OFFSET(bi);
ai += GET_CONFLICT_OFFSET(ai);
bi += GET_CONFLICT_OFFSET(bi);
if((lid & 127) < d)
{
lm_sum[lid >> 7][bi] += lm_sum[lid >> 7][ai];
lm_sum[lid >> 7][ai] = lm_sum[lid >> 7][bi] - lm_sum[lid >> 7][ai];
lm_sqsum[lid >> 7][bi] += lm_sqsum[lid >> 7][ai];
lm_sqsum[lid >> 7][ai] = lm_sqsum[lid >> 7][bi] - lm_sqsum[lid >> 7][ai];
}
@@ -235,7 +235,7 @@ kernel void integral_rows(__global int4 *srcsum,__global float4 * srcsqsum,__glo
{
int loc0 = gid * 2 * sum_step;
int loc1 = gid * 2 * sqsum_step;
for(int k = 1;k <= 8;k++)
for(int k = 1;k <= 8;k++)
{
if(gid * 8 + k > cols) break;
sum[sum_offset + loc0 + k * sum_step / 4] = 0;
@@ -245,8 +245,8 @@ kernel void integral_rows(__global int4 *srcsum,__global float4 * srcsqsum,__glo
int loc_s0 = sum_offset + gid * 2 * sum_step + sum_step / 4 + i + lid, loc_s1 = loc_s0 + sum_step ;
int loc_sq0 = sqsum_offset + gid * 2 * sqsum_step + sqsum_step / 4 + i + lid, loc_sq1 = loc_sq0 + sqsum_step ;
if(lid > 0 && (i+lid) <= rows){
lm_sum[0][bf_loc] += sum_t[0];
lm_sum[1][bf_loc] += sum_t[1];
lm_sum[0][bf_loc] += sum_t[0];
lm_sum[1][bf_loc] += sum_t[1];
lm_sqsum[0][bf_loc] += sqsum_t[0];
lm_sqsum[1][bf_loc] += sqsum_t[1];
sum_p = (__local int*)(&(lm_sum[0][bf_loc]));
@@ -256,7 +256,7 @@ kernel void integral_rows(__global int4 *srcsum,__global float4 * srcsqsum,__glo
if(gid * 8 + k >= cols) break;
sum[loc_s0 + k * sum_step / 4] = sum_p[k];
sqsum[loc_sq0 + k * sqsum_step / 4] = sqsum_p[k];
}
}
sum_p = (__local int*)(&(lm_sum[1][bf_loc]));
sqsum_p = (__local float*)(&(lm_sqsum[1][bf_loc]));
for(int k = 0; k < 4; k++)
@@ -264,7 +264,7 @@ kernel void integral_rows(__global int4 *srcsum,__global float4 * srcsqsum,__glo
if(gid * 8 + 4 + k >= cols) break;
sum[loc_s1 + k * sum_step / 4] = sum_p[k];
sqsum[loc_sq1 + k * sqsum_step / 4] = sqsum_p[k];
}
}
}
barrier(CLK_LOCAL_MEM_FENCE);
}

View File

@@ -70,23 +70,23 @@ kernel void integral_sum_cols(__global uchar4 *src,__global int *sum ,
{
src_t[0] = (i + lid < rows ? convert_int4(src[src_offset + (lid+i) * src_step + gid]) : 0);
src_t[1] = (i + lid < rows ? convert_int4(src[src_offset + (lid+i) * src_step + gid + 1]) : 0);
sum_t[0] = (i == 0 ? 0 : lm_sum[0][LSIZE_2 + LOG_LSIZE]);
sum_t[1] = (i == 0 ? 0 : lm_sum[1][LSIZE_2 + LOG_LSIZE]);
barrier(CLK_LOCAL_MEM_FENCE);
int bf_loc = lid + GET_CONFLICT_OFFSET(lid);
lm_sum[0][bf_loc] = src_t[0];
lm_sum[1][bf_loc] = src_t[1];
int offset = 1;
for(int d = LSIZE >> 1 ; d > 0; d>>=1)
{
barrier(CLK_LOCAL_MEM_FENCE);
int ai = offset * (((lid & 127)<<1) +1) - 1,bi = ai + offset;
ai += GET_CONFLICT_OFFSET(ai);
bi += GET_CONFLICT_OFFSET(bi);
ai += GET_CONFLICT_OFFSET(ai);
bi += GET_CONFLICT_OFFSET(bi);
if((lid & 127) < d)
{
@@ -94,7 +94,7 @@ kernel void integral_sum_cols(__global uchar4 *src,__global int *sum ,
}
offset <<= 1;
}
barrier(CLK_LOCAL_MEM_FENCE);
barrier(CLK_LOCAL_MEM_FENCE);
if(lid < 2)
{
lm_sum[lid][LSIZE_2 + LOG_LSIZE] = 0;
@@ -104,32 +104,32 @@ kernel void integral_sum_cols(__global uchar4 *src,__global int *sum ,
barrier(CLK_LOCAL_MEM_FENCE);
offset >>= 1;
int ai = offset * (((lid & 127)<<1) +1) - 1,bi = ai + offset;
ai += GET_CONFLICT_OFFSET(ai);
bi += GET_CONFLICT_OFFSET(bi);
ai += GET_CONFLICT_OFFSET(ai);
bi += GET_CONFLICT_OFFSET(bi);
if((lid & 127) < d)
{
lm_sum[lid >> 7][bi] += lm_sum[lid >> 7][ai];
lm_sum[lid >> 7][ai] = lm_sum[lid >> 7][bi] - lm_sum[lid >> 7][ai];
}
}
barrier(CLK_LOCAL_MEM_FENCE);
barrier(CLK_LOCAL_MEM_FENCE);
if(lid > 0 && (i+lid) <= rows){
int loc_s0 = gid * dst_step + i + lid - 1 - pre_invalid * dst_step / 4, loc_s1 = loc_s0 + dst_step ;
lm_sum[0][bf_loc] += sum_t[0];
lm_sum[1][bf_loc] += sum_t[1];
lm_sum[0][bf_loc] += sum_t[0];
lm_sum[1][bf_loc] += sum_t[1];
sum_p = (__local int*)(&(lm_sum[0][bf_loc]));
for(int k = 0; k < 4; k++)
{
if(gid * 4 + k >= cols + pre_invalid || gid * 4 + k < pre_invalid) continue;
sum[loc_s0 + k * dst_step / 4] = sum_p[k];
}
}
sum_p = (__local int*)(&(lm_sum[1][bf_loc]));
for(int k = 0; k < 4; k++)
{
if(gid * 4 + k + 4 >= cols + pre_invalid) break;
sum[loc_s1 + k * dst_step / 4] = sum_p[k];
}
}
}
barrier(CLK_LOCAL_MEM_FENCE);
}
@@ -150,23 +150,23 @@ kernel void integral_sum_rows(__global int4 *srcsum,__global int *sum ,
{
src_t[0] = i + lid < rows ? srcsum[(lid+i) * src_step + gid * 2] : 0;
src_t[1] = i + lid < rows ? srcsum[(lid+i) * src_step + gid * 2 + 1] : 0;
sum_t[0] = (i == 0 ? 0 : lm_sum[0][LSIZE_2 + LOG_LSIZE]);
sum_t[1] = (i == 0 ? 0 : lm_sum[1][LSIZE_2 + LOG_LSIZE]);
barrier(CLK_LOCAL_MEM_FENCE);
int bf_loc = lid + GET_CONFLICT_OFFSET(lid);
lm_sum[0][bf_loc] = src_t[0];
lm_sum[1][bf_loc] = src_t[1];
int offset = 1;
for(int d = LSIZE >> 1 ; d > 0; d>>=1)
{
barrier(CLK_LOCAL_MEM_FENCE);
int ai = offset * (((lid & 127)<<1) +1) - 1,bi = ai + offset;
ai += GET_CONFLICT_OFFSET(ai);
bi += GET_CONFLICT_OFFSET(bi);
ai += GET_CONFLICT_OFFSET(ai);
bi += GET_CONFLICT_OFFSET(bi);
if((lid & 127) < d)
{
@@ -174,7 +174,7 @@ kernel void integral_sum_rows(__global int4 *srcsum,__global int *sum ,
}
offset <<= 1;
}
barrier(CLK_LOCAL_MEM_FENCE);
barrier(CLK_LOCAL_MEM_FENCE);
if(lid < 2)
{
lm_sum[lid][LSIZE_2 + LOG_LSIZE] = 0;
@@ -184,9 +184,9 @@ kernel void integral_sum_rows(__global int4 *srcsum,__global int *sum ,
barrier(CLK_LOCAL_MEM_FENCE);
offset >>= 1;
int ai = offset * (((lid & 127)<<1) +1) - 1,bi = ai + offset;
ai += GET_CONFLICT_OFFSET(ai);
bi += GET_CONFLICT_OFFSET(bi);
ai += GET_CONFLICT_OFFSET(ai);
bi += GET_CONFLICT_OFFSET(bi);
if((lid & 127) < d)
{
lm_sum[lid >> 7][bi] += lm_sum[lid >> 7][ai];
@@ -201,13 +201,13 @@ kernel void integral_sum_rows(__global int4 *srcsum,__global int *sum ,
if(i + lid == 0)
{
int loc0 = gid * 2 * sum_step;
for(int k = 1;k <= 8;k++)
for(int k = 1;k <= 8;k++)
{
if(gid * 8 + k > cols) break;
sum[sum_offset + loc0 + k * sum_step / 4] = 0;
}
}
if(lid > 0 && (i+lid) <= rows){
int loc_s0 = sum_offset + gid * 2 * sum_step + sum_step / 4 + i + lid, loc_s1 = loc_s0 + sum_step ;
lm_sum[0][bf_loc] += sum_t[0];
@@ -223,7 +223,7 @@ kernel void integral_sum_rows(__global int4 *srcsum,__global int *sum ,
{
if(gid * 8 + 4 + k >= cols) break;
sum[loc_s1 + k * sum_step / 4] = sum_p[k];
}
}
}
barrier(CLK_LOCAL_MEM_FENCE);
}

View File

@@ -39,75 +39,75 @@
__kernel void medianFilter_C1(__global uchar * src, __global uchar * dst, int srcOffset, int dstOffset, int cols,
int rows, int srcStep, int dstStep, int m)
{
int dx = get_global_id(0)-(m>>1);
int dx = get_global_id(0)-(m>>1);
int dy = get_global_id(1)-(m>>1);
short histom[256];
for(int i=0;i<256;++i)
histom[i]=0;
for(int i=0;i<m;++i)
{
__global uchar * data = src + srcOffset + mul24(srcStep,clamp(dy + (i), 0, rows-1));
for(int j=dx;j<dx+m;++j)
{
histom[data[clamp(j, 0, cols-1)]]++;
}
}
short histom[256];
for(int i=0;i<256;++i)
histom[i]=0;
int now=0;
int goal=(m*m+1)>>1;
int v;
for(int i=0;i<256;++i)
{
v=(now<goal?i:v);
now+=histom[i];
}
if(dy<rows && dx<cols)
dst[dstOffset + get_global_id(1)*dstStep + get_global_id(0)]=v;
for(int i=0;i<m;++i)
{
__global uchar * data = src + srcOffset + mul24(srcStep,clamp(dy + (i), 0, rows-1));
for(int j=dx;j<dx+m;++j)
{
histom[data[clamp(j, 0, cols-1)]]++;
}
}
int now=0;
int goal=(m*m+1)>>1;
int v;
for(int i=0;i<256;++i)
{
v=(now<goal?i:v);
now+=histom[i];
}
if(dy<rows && dx<cols)
dst[dstOffset + get_global_id(1)*dstStep + get_global_id(0)]=v;
}
*/
#define op(a,b) {mid=a; a=min(a,b); b=max(mid,b);}
__kernel void medianFilter3_C4_D0(__global uchar4 * src, __global uchar4 * dst, int srcOffset, int dstOffset, int cols,
int rows, int srcStep, int dstStep)
{
__local uchar4 data[18][18];
__global uchar4* source=src + srcOffset;
int dx = get_global_id(0) - get_local_id(0) -1;
__local uchar4 data[18][18];
__global uchar4* source=src + srcOffset;
int dx = get_global_id(0) - get_local_id(0) -1;
int dy = get_global_id(1) - get_local_id(1) -1;
const int id = min((int)(get_local_id(0)*16+get_local_id(1)), 9*18-1);
int dr=id/18;
int dc=id%18;
int r=clamp(dy+dr, 0, rows-1);
int c=clamp(dx+dc, 0, cols-1);
const int id = min((int)(get_local_id(0)*16+get_local_id(1)), 9*18-1);
data[dr][dc] = source[r*srcStep + c];
r=clamp(dy+dr+9, 0, rows-1);
data[dr+9][dc] = source[r*srcStep + c];
int dr=id/18;
int dc=id%18;
int r=clamp(dy+dr, 0, rows-1);
int c=clamp(dx+dc, 0, cols-1);
barrier(CLK_LOCAL_MEM_FENCE);
data[dr][dc] = source[r*srcStep + c];
r=clamp(dy+dr+9, 0, rows-1);
data[dr+9][dc] = source[r*srcStep + c];
int x =get_local_id(0);
int y =get_local_id(1);
uchar4 p0=data[y][x], p1=data[y][x+1], p2=data[y][x+2];
uchar4 p3=data[y+1][x], p4=data[y+1][x+1], p5=data[y+1][x+2];
uchar4 p6=data[y+2][x], p7=data[y+2][x+1], p8=data[y+2][x+2];
uchar4 mid;
barrier(CLK_LOCAL_MEM_FENCE);
op(p1, p2); op(p4, p5); op(p7, p8); op(p0, p1);
int x =get_local_id(0);
int y =get_local_id(1);
uchar4 p0=data[y][x], p1=data[y][x+1], p2=data[y][x+2];
uchar4 p3=data[y+1][x], p4=data[y+1][x+1], p5=data[y+1][x+2];
uchar4 p6=data[y+2][x], p7=data[y+2][x+1], p8=data[y+2][x+2];
uchar4 mid;
op(p1, p2); op(p4, p5); op(p7, p8); op(p0, p1);
op(p3, p4); op(p6, p7); op(p1, p2); op(p4, p5);
op(p7, p8); op(p0, p3); op(p5, p8); op(p4, p7);
op(p3, p6); op(p1, p4); op(p2, p5); op(p4, p7);
op(p4, p2); op(p6, p4); op(p4, p2);
if(get_global_id(1)<rows && get_global_id(0)<cols)
dst[dstOffset + get_global_id(1)*dstStep + get_global_id(0)]=p4;
if(get_global_id(1)<rows && get_global_id(0)<cols)
dst[dstOffset + get_global_id(1)*dstStep + get_global_id(0)]=p4;
}
#undef op(a,b)
@@ -115,41 +115,41 @@ __kernel void medianFilter3_C4_D0(__global uchar4 * src, __global uchar4 * dst,
__kernel void medianFilter3_C1_D0(__global uchar * src, __global uchar * dst, int srcOffset, int dstOffset, int cols,
int rows, int srcStep, int dstStep)
{
__local uchar data[18][18];
__global uchar* source=src + srcOffset;
int dx = get_global_id(0) - get_local_id(0) -1;
__local uchar data[18][18];
__global uchar* source=src + srcOffset;
int dx = get_global_id(0) - get_local_id(0) -1;
int dy = get_global_id(1) - get_local_id(1) -1;
const int id = min((int)(get_local_id(0)*16+get_local_id(1)), 9*18-1);
int dr=id/18;
int dc=id%18;
int r=clamp(dy+dr, 0, rows-1);
int c=clamp(dx+dc, 0, cols-1);
const int id = min((int)(get_local_id(0)*16+get_local_id(1)), 9*18-1);
data[dr][dc] = source[r*srcStep + c];
r=clamp(dy+dr+9, 0, rows-1);
data[dr+9][dc] = source[r*srcStep + c];
int dr=id/18;
int dc=id%18;
int r=clamp(dy+dr, 0, rows-1);
int c=clamp(dx+dc, 0, cols-1);
barrier(CLK_LOCAL_MEM_FENCE);
data[dr][dc] = source[r*srcStep + c];
r=clamp(dy+dr+9, 0, rows-1);
data[dr+9][dc] = source[r*srcStep + c];
int x =get_local_id(0);
int y =get_local_id(1);
uchar p0=data[y][x], p1=data[y][x+1], p2=data[y][x+2];
uchar p3=data[y+1][x], p4=data[y+1][x+1], p5=data[y+1][x+2];
uchar p6=data[y+2][x], p7=data[y+2][x+1], p8=data[y+2][x+2];
uchar mid;
barrier(CLK_LOCAL_MEM_FENCE);
op(p1, p2); op(p4, p5); op(p7, p8); op(p0, p1);
int x =get_local_id(0);
int y =get_local_id(1);
uchar p0=data[y][x], p1=data[y][x+1], p2=data[y][x+2];
uchar p3=data[y+1][x], p4=data[y+1][x+1], p5=data[y+1][x+2];
uchar p6=data[y+2][x], p7=data[y+2][x+1], p8=data[y+2][x+2];
uchar mid;
op(p1, p2); op(p4, p5); op(p7, p8); op(p0, p1);
op(p3, p4); op(p6, p7); op(p1, p2); op(p4, p5);
op(p7, p8); op(p0, p3); op(p5, p8); op(p4, p7);
op(p3, p6); op(p1, p4); op(p2, p5); op(p4, p7);
op(p4, p2); op(p6, p4); op(p4, p2);
if(get_global_id(1)<rows && get_global_id(0)<cols)
dst[dstOffset + get_global_id(1)*dstStep + get_global_id(0)]=p4;
if(get_global_id(1)<rows && get_global_id(0)<cols)
dst[dstOffset + get_global_id(1)*dstStep + get_global_id(0)]=p4;
}
#undef op(a,b)
@@ -157,41 +157,41 @@ __kernel void medianFilter3_C1_D0(__global uchar * src, __global uchar * dst, i
__kernel void medianFilter3_C1_D5(__global float * src, __global float * dst, int srcOffset, int dstOffset, int cols,
int rows, int srcStep, int dstStep)
{
__local float data[18][18];
__global float* source=src + srcOffset;
int dx = get_global_id(0) - get_local_id(0) -1;
__local float data[18][18];
__global float* source=src + srcOffset;
int dx = get_global_id(0) - get_local_id(0) -1;
int dy = get_global_id(1) - get_local_id(1) -1;
const int id = min((int)(get_local_id(0)*16+get_local_id(1)), 9*18-1);
int dr=id/18;
int dc=id%18;
int r=clamp(dy+dr, 0, rows-1);
int c=clamp(dx+dc, 0, cols-1);
const int id = min((int)(get_local_id(0)*16+get_local_id(1)), 9*18-1);
data[dr][dc] = source[r*srcStep + c];
r=clamp(dy+dr+9, 0, rows-1);
data[dr+9][dc] = source[r*srcStep + c];
int dr=id/18;
int dc=id%18;
int r=clamp(dy+dr, 0, rows-1);
int c=clamp(dx+dc, 0, cols-1);
barrier(CLK_LOCAL_MEM_FENCE);
data[dr][dc] = source[r*srcStep + c];
r=clamp(dy+dr+9, 0, rows-1);
data[dr+9][dc] = source[r*srcStep + c];
int x =get_local_id(0);
int y =get_local_id(1);
float p0=data[y][x], p1=data[y][x+1], p2=data[y][x+2];
float p3=data[y+1][x], p4=data[y+1][x+1], p5=data[y+1][x+2];
float p6=data[y+2][x], p7=data[y+2][x+1], p8=data[y+2][x+2];
float mid;
barrier(CLK_LOCAL_MEM_FENCE);
op(p1, p2); op(p4, p5); op(p7, p8); op(p0, p1);
int x =get_local_id(0);
int y =get_local_id(1);
float p0=data[y][x], p1=data[y][x+1], p2=data[y][x+2];
float p3=data[y+1][x], p4=data[y+1][x+1], p5=data[y+1][x+2];
float p6=data[y+2][x], p7=data[y+2][x+1], p8=data[y+2][x+2];
float mid;
op(p1, p2); op(p4, p5); op(p7, p8); op(p0, p1);
op(p3, p4); op(p6, p7); op(p1, p2); op(p4, p5);
op(p7, p8); op(p0, p3); op(p5, p8); op(p4, p7);
op(p3, p6); op(p1, p4); op(p2, p5); op(p4, p7);
op(p4, p2); op(p6, p4); op(p4, p2);
if(get_global_id(1)<rows && get_global_id(0)<cols)
dst[dstOffset + get_global_id(1)*dstStep + get_global_id(0)]=p4;
if(get_global_id(1)<rows && get_global_id(0)<cols)
dst[dstOffset + get_global_id(1)*dstStep + get_global_id(0)]=p4;
}
#undef op(a,b)
@@ -199,41 +199,41 @@ __kernel void medianFilter3_C1_D5(__global float * src, __global float * dst, i
__kernel void medianFilter3_C4_D5(__global float4 * src, __global float4 * dst, int srcOffset, int dstOffset, int cols,
int rows, int srcStep, int dstStep)
{
__local float4 data[18][18];
__global float4* source=src + srcOffset;
int dx = get_global_id(0) - get_local_id(0) -1;
__local float4 data[18][18];
__global float4* source=src + srcOffset;
int dx = get_global_id(0) - get_local_id(0) -1;
int dy = get_global_id(1) - get_local_id(1) -1;
const int id = min((int)(get_local_id(0)*16+get_local_id(1)), 9*18-1);
int dr=id/18;
int dc=id%18;
int r=clamp(dy+dr, 0, rows-1);
int c=clamp(dx+dc, 0, cols-1);
const int id = min((int)(get_local_id(0)*16+get_local_id(1)), 9*18-1);
data[dr][dc] = source[r*srcStep + c];
r=clamp(dy+dr+9, 0, rows-1);
data[dr+9][dc] = source[r*srcStep + c];
int dr=id/18;
int dc=id%18;
int r=clamp(dy+dr, 0, rows-1);
int c=clamp(dx+dc, 0, cols-1);
barrier(CLK_LOCAL_MEM_FENCE);
data[dr][dc] = source[r*srcStep + c];
r=clamp(dy+dr+9, 0, rows-1);
data[dr+9][dc] = source[r*srcStep + c];
int x =get_local_id(0);
int y =get_local_id(1);
float4 p0=data[y][x], p1=data[y][x+1], p2=data[y][x+2];
float4 p3=data[y+1][x], p4=data[y+1][x+1], p5=data[y+1][x+2];
float4 p6=data[y+2][x], p7=data[y+2][x+1], p8=data[y+2][x+2];
float4 mid;
barrier(CLK_LOCAL_MEM_FENCE);
op(p1, p2); op(p4, p5); op(p7, p8); op(p0, p1);
int x =get_local_id(0);
int y =get_local_id(1);
float4 p0=data[y][x], p1=data[y][x+1], p2=data[y][x+2];
float4 p3=data[y+1][x], p4=data[y+1][x+1], p5=data[y+1][x+2];
float4 p6=data[y+2][x], p7=data[y+2][x+1], p8=data[y+2][x+2];
float4 mid;
op(p1, p2); op(p4, p5); op(p7, p8); op(p0, p1);
op(p3, p4); op(p6, p7); op(p1, p2); op(p4, p5);
op(p7, p8); op(p0, p3); op(p5, p8); op(p4, p7);
op(p3, p6); op(p1, p4); op(p2, p5); op(p4, p7);
op(p4, p2); op(p6, p4); op(p4, p2);
if(get_global_id(1)<rows && get_global_id(0)<cols)
dst[dstOffset + get_global_id(1)*dstStep + get_global_id(0)]=p4;
if(get_global_id(1)<rows && get_global_id(0)<cols)
dst[dstOffset + get_global_id(1)*dstStep + get_global_id(0)]=p4;
}
#undef op(a,b)
@@ -241,36 +241,36 @@ __kernel void medianFilter3_C4_D5(__global float4 * src, __global float4 * dst,
__kernel void medianFilter5_C4_D0(__global uchar4 * src, __global uchar4 * dst, int srcOffset, int dstOffset, int cols,
int rows, int srcStep, int dstStep)
{
__local uchar4 data[20][20];
__global uchar4* source=src + srcOffset;
int dx = get_global_id(0) - get_local_id(0) -2;
__local uchar4 data[20][20];
__global uchar4* source=src + srcOffset;
int dx = get_global_id(0) - get_local_id(0) -2;
int dy = get_global_id(1) - get_local_id(1) -2;
const int id = min((int)(get_local_id(0)*16+get_local_id(1)), 10*20-1);
int dr=id/20;
int dc=id%20;
int r=clamp(dy+dr, 0, rows-1);
int c=clamp(dx+dc, 0, cols-1);
const int id = min((int)(get_local_id(0)*16+get_local_id(1)), 10*20-1);
data[dr][dc] = source[r*srcStep + c];
r=clamp(dy+dr+10, 0, rows-1);
data[dr+10][dc] = source[r*srcStep + c];
int dr=id/20;
int dc=id%20;
int r=clamp(dy+dr, 0, rows-1);
int c=clamp(dx+dc, 0, cols-1);
barrier(CLK_LOCAL_MEM_FENCE);
data[dr][dc] = source[r*srcStep + c];
r=clamp(dy+dr+10, 0, rows-1);
data[dr+10][dc] = source[r*srcStep + c];
int x =get_local_id(0);
int y =get_local_id(1);
uchar4 p0=data[y][x], p1=data[y][x+1], p2=data[y][x+2], p3=data[y][x+3], p4=data[y][x+4];
uchar4 p5=data[y+1][x], p6=data[y+1][x+1], p7=data[y+1][x+2], p8=data[y+1][x+3], p9=data[y+1][x+4];
uchar4 p10=data[y+2][x], p11=data[y+2][x+1], p12=data[y+2][x+2], p13=data[y+2][x+3], p14=data[y+2][x+4];
uchar4 p15=data[y+3][x], p16=data[y+3][x+1], p17=data[y+3][x+2], p18=data[y+3][x+3], p19=data[y+3][x+4];
uchar4 p20=data[y+4][x], p21=data[y+4][x+1], p22=data[y+4][x+2], p23=data[y+4][x+3], p24=data[y+4][x+4];
uchar4 mid;
barrier(CLK_LOCAL_MEM_FENCE);
op(p1, p2); op(p0, p1); op(p1, p2); op(p4, p5); op(p3, p4);
int x =get_local_id(0);
int y =get_local_id(1);
uchar4 p0=data[y][x], p1=data[y][x+1], p2=data[y][x+2], p3=data[y][x+3], p4=data[y][x+4];
uchar4 p5=data[y+1][x], p6=data[y+1][x+1], p7=data[y+1][x+2], p8=data[y+1][x+3], p9=data[y+1][x+4];
uchar4 p10=data[y+2][x], p11=data[y+2][x+1], p12=data[y+2][x+2], p13=data[y+2][x+3], p14=data[y+2][x+4];
uchar4 p15=data[y+3][x], p16=data[y+3][x+1], p17=data[y+3][x+2], p18=data[y+3][x+3], p19=data[y+3][x+4];
uchar4 p20=data[y+4][x], p21=data[y+4][x+1], p22=data[y+4][x+2], p23=data[y+4][x+3], p24=data[y+4][x+4];
uchar4 mid;
op(p1, p2); op(p0, p1); op(p1, p2); op(p4, p5); op(p3, p4);
op(p4, p5); op(p0, p3); op(p2, p5); op(p2, p3); op(p1, p4);
op(p1, p2); op(p3, p4); op(p7, p8); op(p6, p7); op(p7, p8);
op(p10, p11); op(p9, p10); op(p10, p11); op(p6, p9); op(p8, p11);
@@ -293,9 +293,9 @@ __kernel void medianFilter5_C4_D0(__global uchar4 * src, __global uchar4 * dst,
op(p10, p12); op(p1, p13); op(p9, p21); op(p9, p13); op(p5, p17);
op(p13, p17); op(p3, p15); op(p11, p23); op(p11, p15); op(p7, p19);
op(p7, p11); op(p11, p13); op(p11, p12);
if(get_global_id(1)<rows && get_global_id(0)<cols)
dst[dstOffset + get_global_id(1)*dstStep + get_global_id(0)]=p12;
if(get_global_id(1)<rows && get_global_id(0)<cols)
dst[dstOffset + get_global_id(1)*dstStep + get_global_id(0)]=p12;
}
#undef op(a,b)
@@ -303,36 +303,36 @@ __kernel void medianFilter5_C4_D0(__global uchar4 * src, __global uchar4 * dst,
__kernel void medianFilter5_C1_D0(__global uchar * src, __global uchar * dst, int srcOffset, int dstOffset, int cols,
int rows, int srcStep, int dstStep)
{
__local uchar data[20][20];
__global uchar* source=src + srcOffset;
int dx = get_global_id(0) - get_local_id(0) -2;
__local uchar data[20][20];
__global uchar* source=src + srcOffset;
int dx = get_global_id(0) - get_local_id(0) -2;
int dy = get_global_id(1) - get_local_id(1) -2;
const int id = min((int)(get_local_id(0)*16+get_local_id(1)), 10*20-1);
int dr=id/20;
int dc=id%20;
int r=clamp(dy+dr, 0, rows-1);
int c=clamp(dx+dc, 0, cols-1);
const int id = min((int)(get_local_id(0)*16+get_local_id(1)), 10*20-1);
data[dr][dc] = source[r*srcStep + c];
r=clamp(dy+dr+10, 0, rows-1);
data[dr+10][dc] = source[r*srcStep + c];
int dr=id/20;
int dc=id%20;
int r=clamp(dy+dr, 0, rows-1);
int c=clamp(dx+dc, 0, cols-1);
barrier(CLK_LOCAL_MEM_FENCE);
data[dr][dc] = source[r*srcStep + c];
r=clamp(dy+dr+10, 0, rows-1);
data[dr+10][dc] = source[r*srcStep + c];
int x =get_local_id(0);
int y =get_local_id(1);
uchar p0=data[y][x], p1=data[y][x+1], p2=data[y][x+2], p3=data[y][x+3], p4=data[y][x+4];
uchar p5=data[y+1][x], p6=data[y+1][x+1], p7=data[y+1][x+2], p8=data[y+1][x+3], p9=data[y+1][x+4];
uchar p10=data[y+2][x], p11=data[y+2][x+1], p12=data[y+2][x+2], p13=data[y+2][x+3], p14=data[y+2][x+4];
uchar p15=data[y+3][x], p16=data[y+3][x+1], p17=data[y+3][x+2], p18=data[y+3][x+3], p19=data[y+3][x+4];
uchar p20=data[y+4][x], p21=data[y+4][x+1], p22=data[y+4][x+2], p23=data[y+4][x+3], p24=data[y+4][x+4];
uchar mid;
barrier(CLK_LOCAL_MEM_FENCE);
op(p1, p2); op(p0, p1); op(p1, p2); op(p4, p5); op(p3, p4);
int x =get_local_id(0);
int y =get_local_id(1);
uchar p0=data[y][x], p1=data[y][x+1], p2=data[y][x+2], p3=data[y][x+3], p4=data[y][x+4];
uchar p5=data[y+1][x], p6=data[y+1][x+1], p7=data[y+1][x+2], p8=data[y+1][x+3], p9=data[y+1][x+4];
uchar p10=data[y+2][x], p11=data[y+2][x+1], p12=data[y+2][x+2], p13=data[y+2][x+3], p14=data[y+2][x+4];
uchar p15=data[y+3][x], p16=data[y+3][x+1], p17=data[y+3][x+2], p18=data[y+3][x+3], p19=data[y+3][x+4];
uchar p20=data[y+4][x], p21=data[y+4][x+1], p22=data[y+4][x+2], p23=data[y+4][x+3], p24=data[y+4][x+4];
uchar mid;
op(p1, p2); op(p0, p1); op(p1, p2); op(p4, p5); op(p3, p4);
op(p4, p5); op(p0, p3); op(p2, p5); op(p2, p3); op(p1, p4);
op(p1, p2); op(p3, p4); op(p7, p8); op(p6, p7); op(p7, p8);
op(p10, p11); op(p9, p10); op(p10, p11); op(p6, p9); op(p8, p11);
@@ -355,9 +355,9 @@ __kernel void medianFilter5_C1_D0(__global uchar * src, __global uchar * dst, i
op(p10, p12); op(p1, p13); op(p9, p21); op(p9, p13); op(p5, p17);
op(p13, p17); op(p3, p15); op(p11, p23); op(p11, p15); op(p7, p19);
op(p7, p11); op(p11, p13); op(p11, p12);
if(get_global_id(1)<rows && get_global_id(0)<cols)
dst[dstOffset + get_global_id(1)*dstStep + get_global_id(0)]=p12;
if(get_global_id(1)<rows && get_global_id(0)<cols)
dst[dstOffset + get_global_id(1)*dstStep + get_global_id(0)]=p12;
}
#undef op(a,b)
@@ -365,36 +365,36 @@ __kernel void medianFilter5_C1_D0(__global uchar * src, __global uchar * dst, i
__kernel void medianFilter5_C4_D5(__global float4 * src, __global float4 * dst, int srcOffset, int dstOffset, int cols,
int rows, int srcStep, int dstStep)
{
__local float4 data[20][20];
__global float4* source=src + srcOffset;
int dx = get_global_id(0) - get_local_id(0) -2;
__local float4 data[20][20];
__global float4* source=src + srcOffset;
int dx = get_global_id(0) - get_local_id(0) -2;
int dy = get_global_id(1) - get_local_id(1) -2;
const int id = min((int)(get_local_id(0)*16+get_local_id(1)), 10*20-1);
int dr=id/20;
int dc=id%20;
int r=clamp(dy+dr, 0, rows-1);
int c=clamp(dx+dc, 0, cols-1);
const int id = min((int)(get_local_id(0)*16+get_local_id(1)), 10*20-1);
data[dr][dc] = source[r*srcStep + c];
r=clamp(dy+dr+10, 0, rows-1);
data[dr+10][dc] = source[r*srcStep + c];
int dr=id/20;
int dc=id%20;
int r=clamp(dy+dr, 0, rows-1);
int c=clamp(dx+dc, 0, cols-1);
barrier(CLK_LOCAL_MEM_FENCE);
data[dr][dc] = source[r*srcStep + c];
r=clamp(dy+dr+10, 0, rows-1);
data[dr+10][dc] = source[r*srcStep + c];
int x =get_local_id(0);
int y =get_local_id(1);
float4 p0=data[y][x], p1=data[y][x+1], p2=data[y][x+2], p3=data[y][x+3], p4=data[y][x+4];
float4 p5=data[y+1][x], p6=data[y+1][x+1], p7=data[y+1][x+2], p8=data[y+1][x+3], p9=data[y+1][x+4];
float4 p10=data[y+2][x], p11=data[y+2][x+1], p12=data[y+2][x+2], p13=data[y+2][x+3], p14=data[y+2][x+4];
float4 p15=data[y+3][x], p16=data[y+3][x+1], p17=data[y+3][x+2], p18=data[y+3][x+3], p19=data[y+3][x+4];
float4 p20=data[y+4][x], p21=data[y+4][x+1], p22=data[y+4][x+2], p23=data[y+4][x+3], p24=data[y+4][x+4];
float4 mid;
barrier(CLK_LOCAL_MEM_FENCE);
op(p1, p2); op(p0, p1); op(p1, p2); op(p4, p5); op(p3, p4);
int x =get_local_id(0);
int y =get_local_id(1);
float4 p0=data[y][x], p1=data[y][x+1], p2=data[y][x+2], p3=data[y][x+3], p4=data[y][x+4];
float4 p5=data[y+1][x], p6=data[y+1][x+1], p7=data[y+1][x+2], p8=data[y+1][x+3], p9=data[y+1][x+4];
float4 p10=data[y+2][x], p11=data[y+2][x+1], p12=data[y+2][x+2], p13=data[y+2][x+3], p14=data[y+2][x+4];
float4 p15=data[y+3][x], p16=data[y+3][x+1], p17=data[y+3][x+2], p18=data[y+3][x+3], p19=data[y+3][x+4];
float4 p20=data[y+4][x], p21=data[y+4][x+1], p22=data[y+4][x+2], p23=data[y+4][x+3], p24=data[y+4][x+4];
float4 mid;
op(p1, p2); op(p0, p1); op(p1, p2); op(p4, p5); op(p3, p4);
op(p4, p5); op(p0, p3); op(p2, p5); op(p2, p3); op(p1, p4);
op(p1, p2); op(p3, p4); op(p7, p8); op(p6, p7); op(p7, p8);
op(p10, p11); op(p9, p10); op(p10, p11); op(p6, p9); op(p8, p11);
@@ -417,9 +417,9 @@ __kernel void medianFilter5_C4_D5(__global float4 * src, __global float4 * dst,
op(p10, p12); op(p1, p13); op(p9, p21); op(p9, p13); op(p5, p17);
op(p13, p17); op(p3, p15); op(p11, p23); op(p11, p15); op(p7, p19);
op(p7, p11); op(p11, p13); op(p11, p12);
if(get_global_id(1)<rows && get_global_id(0)<cols)
dst[dstOffset + get_global_id(1)*dstStep + get_global_id(0)]=p12;
if(get_global_id(1)<rows && get_global_id(0)<cols)
dst[dstOffset + get_global_id(1)*dstStep + get_global_id(0)]=p12;
}
#undef op(a,b)
@@ -427,36 +427,36 @@ __kernel void medianFilter5_C4_D5(__global float4 * src, __global float4 * dst,
__kernel void medianFilter5_C1_D5(__global float * src, __global float * dst, int srcOffset, int dstOffset, int cols,
int rows, int srcStep, int dstStep)
{
__local float data[20][20];
__global float* source=src + srcOffset;
int dx = get_global_id(0) - get_local_id(0) -2;
__local float data[20][20];
__global float* source=src + srcOffset;
int dx = get_global_id(0) - get_local_id(0) -2;
int dy = get_global_id(1) - get_local_id(1) -2;
const int id = min((int)(get_local_id(0)*16+get_local_id(1)), 10*20-1);
int dr=id/20;
int dc=id%20;
int r=clamp(dy+dr, 0, rows-1);
int c=clamp(dx+dc, 0, cols-1);
const int id = min((int)(get_local_id(0)*16+get_local_id(1)), 10*20-1);
data[dr][dc] = source[r*srcStep + c];
r=clamp(dy+dr+10, 0, rows-1);
data[dr+10][dc] = source[r*srcStep + c];
int dr=id/20;
int dc=id%20;
int r=clamp(dy+dr, 0, rows-1);
int c=clamp(dx+dc, 0, cols-1);
barrier(CLK_LOCAL_MEM_FENCE);
data[dr][dc] = source[r*srcStep + c];
r=clamp(dy+dr+10, 0, rows-1);
data[dr+10][dc] = source[r*srcStep + c];
int x =get_local_id(0);
int y =get_local_id(1);
float p0=data[y][x], p1=data[y][x+1], p2=data[y][x+2], p3=data[y][x+3], p4=data[y][x+4];
float p5=data[y+1][x], p6=data[y+1][x+1], p7=data[y+1][x+2], p8=data[y+1][x+3], p9=data[y+1][x+4];
float p10=data[y+2][x], p11=data[y+2][x+1], p12=data[y+2][x+2], p13=data[y+2][x+3], p14=data[y+2][x+4];
float p15=data[y+3][x], p16=data[y+3][x+1], p17=data[y+3][x+2], p18=data[y+3][x+3], p19=data[y+3][x+4];
float p20=data[y+4][x], p21=data[y+4][x+1], p22=data[y+4][x+2], p23=data[y+4][x+3], p24=data[y+4][x+4];
float mid;
barrier(CLK_LOCAL_MEM_FENCE);
op(p1, p2); op(p0, p1); op(p1, p2); op(p4, p5); op(p3, p4);
int x =get_local_id(0);
int y =get_local_id(1);
float p0=data[y][x], p1=data[y][x+1], p2=data[y][x+2], p3=data[y][x+3], p4=data[y][x+4];
float p5=data[y+1][x], p6=data[y+1][x+1], p7=data[y+1][x+2], p8=data[y+1][x+3], p9=data[y+1][x+4];
float p10=data[y+2][x], p11=data[y+2][x+1], p12=data[y+2][x+2], p13=data[y+2][x+3], p14=data[y+2][x+4];
float p15=data[y+3][x], p16=data[y+3][x+1], p17=data[y+3][x+2], p18=data[y+3][x+3], p19=data[y+3][x+4];
float p20=data[y+4][x], p21=data[y+4][x+1], p22=data[y+4][x+2], p23=data[y+4][x+3], p24=data[y+4][x+4];
float mid;
op(p1, p2); op(p0, p1); op(p1, p2); op(p4, p5); op(p3, p4);
op(p4, p5); op(p0, p3); op(p2, p5); op(p2, p3); op(p1, p4);
op(p1, p2); op(p3, p4); op(p7, p8); op(p6, p7); op(p7, p8);
op(p10, p11); op(p9, p10); op(p10, p11); op(p6, p9); op(p8, p11);
@@ -479,9 +479,9 @@ __kernel void medianFilter5_C1_D5(__global float * src, __global float * dst, i
op(p10, p12); op(p1, p13); op(p9, p21); op(p9, p13); op(p5, p17);
op(p13, p17); op(p3, p15); op(p11, p23); op(p11, p15); op(p7, p19);
op(p7, p11); op(p11, p13); op(p11, p12);
if(get_global_id(1)<rows && get_global_id(0)<cols)
dst[dstOffset + get_global_id(1)*dstStep + get_global_id(0)]=p12;
if(get_global_id(1)<rows && get_global_id(0)<cols)
dst[dstOffset + get_global_id(1)*dstStep + get_global_id(0)]=p12;
}
#undef op(a,b)

View File

@@ -48,7 +48,7 @@
#if defined DOUBLE_SUPPORT
#pragma OPENCL EXTENSION cl_khr_fp64:enable
typedef double4 F4 ;
#else
#else
typedef float4 F4;
#endif
@@ -62,7 +62,7 @@ __kernel void remapNNSConstant_C1_D0(__global unsigned char* dst, __global unsig
{
int x = get_global_id(0);
int y = get_global_id(1);
if(x < threadCols && y < dst_rows)
{
x = x << 2;
@@ -79,7 +79,7 @@ __kernel void remapNNSConstant_C1_D0(__global unsigned char* dst, __global unsig
map1_data = *((__global short8 *)((__global char*)map1 + map1Start));
int4 srcIdx = convert_int4(map1_data.odd) * src_step + convert_int4(map1_data.even) + src_offset;
uchar4 src_data;
src_data.s0 = *(src + srcIdx.s0);
@@ -88,10 +88,10 @@ __kernel void remapNNSConstant_C1_D0(__global unsigned char* dst, __global unsig
src_data.s3 = *(src + srcIdx.s3);
uchar4 dst_data;
dst_data = convert_uchar4((convert_int4(map1_data.even) >= (int4)(src_cols) || convert_int4(map1_data.odd) >= (int4)(src_rows)))? (uchar4)(val) : src_data;
__global uchar4* d = (__global uchar4 *)(dst + dstStart);
uchar4 dVal = *d;
uchar4 dVal = *d;
int4 con = (Gx >= 0 && Gx < dst_cols && y >= 0 && y < dst_rows);
dst_data = (convert_uchar4(con) != convert_uchar4((int4)(0))) ? dst_data : dVal;
@@ -107,7 +107,7 @@ __kernel void remapNNFConstant_C1_D0(__global unsigned char* dst, __global unsig
{
int x = get_global_id(0);
int y = get_global_id(1);
if(x < threadCols && y < dst_rows)
{
x = x << 2;
@@ -125,7 +125,7 @@ __kernel void remapNNFConstant_C1_D0(__global unsigned char* dst, __global unsig
map1_data = *((__global float8 *)((__global char*)map1 + map1Start));
int8 map1_dataZ = convert_int8_sat_rte(map1_data);
int4 srcIdx = map1_dataZ.odd * src_step + map1_dataZ.even + src_offset;
uchar4 src_data;
src_data.s0 = *(src + srcIdx.s0);
@@ -136,10 +136,10 @@ __kernel void remapNNFConstant_C1_D0(__global unsigned char* dst, __global unsig
dst_data = convert_uchar4(map1_dataZ.even >= (int4)(src_cols) || map1_dataZ.odd >= (int4)(src_rows)) ? (uchar4)(val) : src_data;
__global uchar4* d = (__global uchar4 *)(dst + dstStart);
uchar4 dVal = *d;
uchar4 dVal = *d;
int4 con = (Gx >= 0 && Gx < dst_cols && y >= 0 && y < dst_rows);
dst_data = (convert_uchar4(con) != convert_uchar4((int4)(0))) ? dst_data : dVal;
*d = dst_data;
@@ -152,7 +152,7 @@ __kernel void remapNNF1Constant_C1_D0(__global unsigned char* dst, __global unsi
{
int x = get_global_id(0);
int y = get_global_id(1);
if(x < threadCols && y < dst_rows)
{
x = x << 2;
@@ -173,7 +173,7 @@ __kernel void remapNNF1Constant_C1_D0(__global unsigned char* dst, __global unsi
float8 map_data = (float8)(map1_data.s0, map2_data.s0, map1_data.s1, map2_data.s1, map1_data.s2, map2_data.s2, map1_data.s3, map2_data.s3);
int8 map_dataZ = convert_int8_sat_rte(map_data);
int4 srcIdx = map_dataZ.odd * src_step + map_dataZ.even + src_offset;
uchar4 src_data;
src_data.s0 = *(src + srcIdx.s0);
@@ -184,10 +184,10 @@ __kernel void remapNNF1Constant_C1_D0(__global unsigned char* dst, __global unsi
dst_data = convert_uchar4(map_dataZ.even >= (int4)(src_cols) || map_dataZ.odd >= (int4)(src_rows)) ? (uchar4)(val) : src_data;
__global uchar4* d = (__global uchar4 *)(dst + dstStart);
uchar4 dVal = *d;
uchar4 dVal = *d;
int4 con = (Gx >= 0 && Gx < dst_cols && y >= 0 && y < dst_rows);
dst_data = (convert_uchar4(con) != convert_uchar4((int4)(0))) ? dst_data : dVal;
*d = dst_data;
}
@@ -230,7 +230,7 @@ __kernel void remapNNSConstant_C4_D0(__global unsigned char* dst, __global unsig
dst_data = (uchar16)(dst_a, dst_b, dst_c, dst_d);
__global uchar16* d = (__global uchar16 *)(dst + dstStart);
uchar16 dVal = *d;
uchar16 dVal = *d;
int16 con = (Gx >= 0 && Gx < (dst_cols<<2) && y >= 0 && y < dst_rows);
dst_data = (convert_uchar16(con) != ((uchar16)(0))) ? dst_data : dVal;
@@ -279,7 +279,7 @@ __kernel void remapNNFConstant_C4_D0(__global unsigned char* dst, __global unsig
dst_data = (uchar16)(dst_a, dst_b, dst_c, dst_d);
__global uchar16* d = (__global uchar16 *)(dst + dstStart);
uchar16 dVal = *d;
uchar16 dVal = *d;
int16 con = (Gx >= 0 && Gx < (dst_cols<<2) && y >= 0 && y < dst_rows);
dst_data = (convert_uchar16(con) != ((uchar16)(0))) ? dst_data : dVal;
@@ -333,7 +333,7 @@ __kernel void remapNNF1Constant_C4_D0(__global unsigned char* dst, __global unsi
dst_data = (uchar16)(dst_a, dst_b, dst_c, dst_d);
__global uchar16* d = (__global uchar16 *)(dst + dstStart);
uchar16 dVal = *d;
uchar16 dVal = *d;
int16 con = (Gx >= 0 && Gx < (dst_cols<<2) && y >= 0 && y < dst_rows);
dst_data = (convert_uchar16(con) != ((uchar16)(0))) ? dst_data : dVal;
@@ -351,9 +351,9 @@ __kernel void remapNNSConstant_C1_D5(__global float* dst, __global float const *
{
int x = get_global_id(0);
int y = get_global_id(1);
if(x < threadCols && y < dst_rows)
{
{
x = x << 4;
int gx = x - (dst_offset&15);
@@ -368,25 +368,25 @@ __kernel void remapNNSConstant_C1_D5(__global float* dst, __global float const *
short8 map1_data;
map1_data = *((__global short8 *)((__global char*)map1 + map1Start));
int4 srcIdx = convert_int4(map1_data.odd) * src_step + (convert_int4(map1_data.even) <<((int4)(2))) + src_offset;
float4 src_data;
src_data.s0 = *((__global float *)((__global char*)src + srcIdx.s0));
src_data.s1 = *((__global float *)((__global char*)src + srcIdx.s1));
src_data.s2 = *((__global float *)((__global char*)src + srcIdx.s2));
src_data.s3 = *((__global float *)((__global char*)src + srcIdx.s3));
float4 dst_data;
dst_data.s0 = (map1_data.s0 >= src_cols || map1_data.s1 >= src_rows)? val : src_data.s0;
dst_data.s1 = (map1_data.s2 >= src_cols || map1_data.s3 >= src_rows)? val : src_data.s1;
dst_data.s2 = (map1_data.s4 >= src_cols || map1_data.s5 >= src_rows)? val : src_data.s2;
dst_data.s3 = (map1_data.s6 >= src_cols || map1_data.s7 >= src_rows)? val : src_data.s3;
__global float4* d = (__global float4 *)((__global uchar*)dst + dstStart);
float4 dVal = *d;
float4 dVal = *d;
int4 con = (Gx >= 0 && Gx < (dst_cols<<2) && y >= 0 && y < dst_rows);
dst_data = (convert_float4(con) != (float4)(0)) ? dst_data : dVal;
@@ -402,7 +402,7 @@ __kernel void remapNNFConstant_C1_D5(__global float* dst, __global float const *
{
int x = get_global_id(0);
int y = get_global_id(1);
if(x < threadCols && y < dst_rows)
{
x = x << 4;
@@ -422,23 +422,23 @@ __kernel void remapNNFConstant_C1_D5(__global float* dst, __global float const *
int8 map1_dataZ = convert_int8_sat_rte(map1_data);
int4 srcIdx = convert_int4(map1_dataZ.odd) * src_step + convert_int4(map1_dataZ.even <<(int4)(2)) + src_offset;
float4 src_data;
src_data.s0 = *((__global float *)((__global char*)src + srcIdx.s0));
src_data.s1 = *((__global float *)((__global char*)src + srcIdx.s1));
src_data.s2 = *((__global float *)((__global char*)src + srcIdx.s2));
src_data.s3 = *((__global float *)((__global char*)src + srcIdx.s3));
float4 dst_data;
dst_data.s0 = (map1_dataZ.s0 >= src_cols || map1_dataZ.s1 >= src_rows)? val : src_data.s0;
dst_data.s1 = (map1_dataZ.s2 >= src_cols || map1_dataZ.s3 >= src_rows)? val : src_data.s1;
dst_data.s2 = (map1_dataZ.s4 >= src_cols || map1_dataZ.s5 >= src_rows)? val : src_data.s2;
dst_data.s3 = (map1_dataZ.s6 >= src_cols || map1_dataZ.s7 >= src_rows)? val : src_data.s3;
__global float4* d = (__global float4 *)((__global uchar*)dst + dstStart);
float4 dVal = *d;
float4 dVal = *d;
int4 con = (Gx >= 0 && Gx < (dst_cols<<2) && y >= 0 && y < dst_rows);
dst_data = (convert_float4(con) != (float4)(0)) ? dst_data : dVal;
@@ -455,7 +455,7 @@ __kernel void remapNNF1Constant_C1_D5(__global float* dst, __global float const
{
int x = get_global_id(0);
int y = get_global_id(1);
if(x < threadCols && y < dst_rows)
{
x = x << 4;
@@ -478,23 +478,23 @@ __kernel void remapNNF1Constant_C1_D5(__global float* dst, __global float const
int8 map1_dataZ = convert_int8_sat_rte(map_data);
int4 srcIdx = convert_int4(map1_dataZ.odd) * src_step + convert_int4(map1_dataZ.even <<(int4)(2)) + src_offset;
float4 src_data;
src_data.s0 = *((__global float *)((__global char*)src + srcIdx.s0));
src_data.s1 = *((__global float *)((__global char*)src + srcIdx.s1));
src_data.s2 = *((__global float *)((__global char*)src + srcIdx.s2));
src_data.s3 = *((__global float *)((__global char*)src + srcIdx.s3));
float4 dst_data;
dst_data.s0 = (map1_dataZ.s0 >= src_cols || map1_dataZ.s1 >= src_rows)? val : src_data.s0;
dst_data.s1 = (map1_dataZ.s2 >= src_cols || map1_dataZ.s3 >= src_rows)? val : src_data.s1;
dst_data.s2 = (map1_dataZ.s4 >= src_cols || map1_dataZ.s5 >= src_rows)? val : src_data.s2;
dst_data.s3 = (map1_dataZ.s6 >= src_cols || map1_dataZ.s7 >= src_rows)? val : src_data.s3;
__global float4* d = (__global float4 *)((__global uchar*)dst + dstStart);
float4 dVal = *d;
float4 dVal = *d;
int4 con = (Gx >= 0 && Gx < (dst_cols<<2) && y >= 0 && y < dst_rows);
dst_data = (convert_float4(con) != (float4)(0)) ? dst_data : dVal;
@@ -577,13 +577,13 @@ __kernel void remapLNFConstant_C1_D0(__global unsigned char* dst, __global unsig
int y = get_global_id(1);
if(x < threadCols && y < dst_rows)
{
x = x << 2;
x = x << 2;
int gx = x - (dst_offset&3);
int4 Gx = (int4)(gx, gx+1, gx+2, gx+3);
uchar4 nval =convert_uchar4(nVal);
uchar val = nval.s0;
int dstStart = (y * dst_step + x + dst_offset) - (dst_offset&3);
@@ -607,7 +607,7 @@ __kernel void remapLNFConstant_C1_D0(__global unsigned char* dst, __global unsig
int4 src_StartU = map1_dataDy * src_step + map1_dataDx + src_offset;
int4 src_StartD = src_StartU + src_step;
/*
/*
//not using the vload
int4 src_StartU1 = src_StartU + (int4)(1);
int4 src_StartD1 = src_StartD + (int4)(1);
@@ -617,7 +617,7 @@ __kernel void remapLNFConstant_C1_D0(__global unsigned char* dst, __global unsig
a.y = *(src_StartU.y + src);
a.z = *(src_StartU.z + src);
a.w = *(src_StartU.w + src);
b.x = *(src_StartU1.x + src);
b.y = *(src_StartU1.y + src);
b.z = *(src_StartU1.z + src);
@@ -649,7 +649,7 @@ __kernel void remapLNFConstant_C1_D0(__global unsigned char* dst, __global unsig
b = (uchar4)(aU.y, bU.y, cU.y, dU.y);
c = (uchar4)(aD.x, bD.x, cD.x, dD.x);
d = (uchar4)(aD.y, bD.y, cD.y, dD.y);
int4 ac =(map1_dataDx >= src_cols || map1_dataDy >= src_rows || map1_dataDy< 0 || map1_dataDy < 0);
int4 bc =(map1_dataDx1 >= src_cols || map1_dataDy >= src_rows || map1_dataDx1 < 0 || map1_dataDy < 0);
int4 cc =(map1_dataDx >= src_cols || map1_dataDy1 >= src_rows || map1_dataDy1 < 0 || map1_dataDx < 0);
@@ -660,10 +660,10 @@ __kernel void remapLNFConstant_C1_D0(__global unsigned char* dst, __global unsig
d = (convert_uchar4(dc) == (uchar4)(0))? d : val;
uchar4 dst_data = convert_uchar4_sat_rte((convert_float4(a))* ud * vd +(convert_float4(b))* u * vd + (convert_float4(c))* ud * v + (convert_float4(d)) * u * v );
__global uchar4* D = (__global uchar4 *)(dst + dstStart);
uchar4 dVal = *D;
uchar4 dVal = *D;
int4 con = (Gx >= 0 && Gx < dst_cols && y >= 0 && y < dst_rows);
dst_data = (convert_uchar4(con) != (uchar4)(0)) ? dst_data : dVal;
@@ -680,13 +680,13 @@ __kernel void remapLNF1Constant_C1_D0(__global unsigned char* dst, __global unsi
int y = get_global_id(1);
if(x < threadCols && y < dst_rows)
{
x = x << 2;
x = x << 2;
int gx = x - (dst_offset&3);
int4 Gx = (int4)(gx, gx+1, gx+2, gx+3);
uchar4 nval =convert_uchar4(nVal);
uchar val = nval.s0;
int dstStart = (y * dst_step + x + dst_offset) - (dst_offset&3);
@@ -713,7 +713,7 @@ __kernel void remapLNF1Constant_C1_D0(__global unsigned char* dst, __global unsi
int4 src_StartU = map1_dataDy * src_step + map1_dataDx + src_offset;
int4 src_StartD = src_StartU + src_step;
/*
/*
//not using the vload
int4 src_StartU1 = src_StartU + (int4)(1);
int4 src_StartD1 = src_StartD + (int4)(1);
@@ -723,7 +723,7 @@ __kernel void remapLNF1Constant_C1_D0(__global unsigned char* dst, __global unsi
a.y = *(src_StartU.y + src);
a.z = *(src_StartU.z + src);
a.w = *(src_StartU.w + src);
b.x = *(src_StartU1.x + src);
b.y = *(src_StartU1.y + src);
b.z = *(src_StartU1.z + src);
@@ -755,7 +755,7 @@ __kernel void remapLNF1Constant_C1_D0(__global unsigned char* dst, __global unsi
b = (uchar4)(aU.y, bU.y, cU.y, dU.y);
c = (uchar4)(aD.x, bD.x, cD.x, dD.x);
d = (uchar4)(aD.y, bD.y, cD.y, dD.y);
int4 ac =(map1_dataDx >= src_cols || map1_dataDy >= src_rows || map1_dataDy< 0 || map1_dataDy < 0);
int4 bc =(map1_dataDx1 >= src_cols || map1_dataDy >= src_rows || map1_dataDx1 < 0 || map1_dataDy < 0);
int4 cc =(map1_dataDx >= src_cols || map1_dataDy1 >= src_rows || map1_dataDy1 < 0 || map1_dataDx < 0);
@@ -766,10 +766,10 @@ __kernel void remapLNF1Constant_C1_D0(__global unsigned char* dst, __global unsi
d = (convert_uchar4(dc) == (uchar4)(0))? d : val;
uchar4 dst_data = convert_uchar4_sat_rte((convert_float4(a))* ud * vd +(convert_float4(b))* u * vd + (convert_float4(c))* ud * v + (convert_float4(d)) * u * v );
__global uchar4* D = (__global uchar4 *)(dst + dstStart);
uchar4 dVal = *D;
uchar4 dVal = *D;
int4 con = (Gx >= 0 && Gx < dst_cols && y >= 0 && y < dst_rows);
dst_data = (convert_uchar4(con) != (uchar4)(0)) ? dst_data : dVal;
@@ -784,7 +784,7 @@ __kernel void remapLNSConstant_C1_D0(__global unsigned char* dst, __global unsig
{
int x = get_global_id(0);
int y = get_global_id(1);
if(x < threadCols && y < dst_rows)
{
x = x << 2;
@@ -801,7 +801,7 @@ __kernel void remapLNSConstant_C1_D0(__global unsigned char* dst, __global unsig
map1_data = *((__global short8 *)((__global char*)map1 + map1Start));
int4 srcIdx = convert_int4(map1_data.odd) * src_step + convert_int4(map1_data.even) + src_offset;
uchar4 src_data;
src_data.s0 = *(src + srcIdx.s0);
@@ -810,10 +810,10 @@ __kernel void remapLNSConstant_C1_D0(__global unsigned char* dst, __global unsig
src_data.s3 = *(src + srcIdx.s3);
uchar4 dst_data;
dst_data = convert_uchar4((convert_int4(map1_data.even) >= (int4)(src_cols) || convert_int4(map1_data.odd) >= (int4)(src_rows)))? (uchar4)(val) : src_data;
__global uchar4* d = (__global uchar4 *)(dst + dstStart);
uchar4 dVal = *d;
uchar4 dVal = *d;
int4 con = (Gx >= 0 && Gx < dst_cols && y >= 0 && y < dst_rows);
dst_data = (convert_uchar4(con) != (uchar4)(0)) ? dst_data : dVal;
@@ -835,7 +835,7 @@ __kernel void remapLNFConstant_C4_D0(__global unsigned char* dst, __global unsig
int y = get_global_id(1);
if(x < threadCols && y < dst_rows)
{
x = x << 4;
x = x << 4;
int gx = x - (dst_offset&15);
int16 Gx = (int16)(gx, gx+1, gx+2, gx+3, gx+4, gx+5, gx+6, gx+7, gx+8, gx+9, gx+10, gx+11, gx+12, gx+13, gx+14, gx+15);
@@ -854,7 +854,7 @@ __kernel void remapLNFConstant_C4_D0(__global unsigned char* dst, __global unsig
float4 v = temp.odd;
float4 ud = (float4)(1.0) - u;
float4 vd = (float4)(1.0) - v;
//float8 map1_dataU = map1_dataD + 1;
int4 map1_dataDx = map1_dataD.even;
@@ -888,7 +888,7 @@ __kernel void remapLNFConstant_C4_D0(__global unsigned char* dst, __global unsig
int16 bcc = (int16)((int4)(bc.x), (int4)(bc.y), (int4)(bc.z), (int4)(bc.w));
int16 ccc = (int16)((int4)(cc.x), (int4)(cc.y), (int4)(cc.z), (int4)(cc.w));
int16 dcc = (int16)((int4)(dc.x), (int4)(dc.y), (int4)(dc.z), (int4)(dc.w));
uchar16 val = (uchar16)(nval, nval, nval, nval);
a = (convert_uchar16(acc) == (uchar16)(0))? a : val;
b = (convert_uchar16(bcc) == (uchar16)(0))? b : val;
@@ -901,10 +901,10 @@ __kernel void remapLNFConstant_C4_D0(__global unsigned char* dst, __global unsig
float16 Vd = (float16)((float4)(vd.x), (float4)(vd.y), (float4)(vd.z), (float4)(vd.w));
uchar16 dst_data = convert_uchar16_sat_rte((convert_float16(a))* Ud * Vd +(convert_float16(b))* U * Vd + (convert_float16(c))* Ud * V + (convert_float16(d)) * U * V );
__global uchar16* D = (__global uchar16 *)(dst + dstStart);
uchar16 dVal = *D;
uchar16 dVal = *D;
int16 con = (Gx >= 0 && Gx < (dst_cols<<2) && y >= 0 && y < dst_rows);
dst_data = (convert_uchar16(con) != (uchar16)(0)) ? dst_data : dVal;
@@ -922,7 +922,7 @@ __kernel void remapLNF1Constant_C4_D0(__global unsigned char* dst, __global unsi
int y = get_global_id(1);
if(x < threadCols && y < dst_rows)
{
x = x << 4;
x = x << 4;
int gx = x - (dst_offset&15);
int16 Gx = (int16)(gx, gx+1, gx+2, gx+3, gx+4, gx+5, gx+6, gx+7, gx+8, gx+9, gx+10, gx+11, gx+12, gx+13, gx+14, gx+15);
@@ -944,7 +944,7 @@ __kernel void remapLNF1Constant_C4_D0(__global unsigned char* dst, __global unsi
float4 v = temp.odd;
float4 ud = (float4)(1.0) - u;
float4 vd = (float4)(1.0) - v;
//float8 map1_dataU = map1_dataD + 1;
int4 map1_dataDx = map1_dataD.even;
@@ -978,7 +978,7 @@ __kernel void remapLNF1Constant_C4_D0(__global unsigned char* dst, __global unsi
int16 bcc = (int16)((int4)(bc.x), (int4)(bc.y), (int4)(bc.z), (int4)(bc.w));
int16 ccc = (int16)((int4)(cc.x), (int4)(cc.y), (int4)(cc.z), (int4)(cc.w));
int16 dcc = (int16)((int4)(dc.x), (int4)(dc.y), (int4)(dc.z), (int4)(dc.w));
uchar16 val = (uchar16)(nval, nval, nval, nval);
a = (convert_uchar16(acc) == (uchar16)(0))? a : val;
b = (convert_uchar16(bcc) == (uchar16)(0))? b : val;
@@ -991,10 +991,10 @@ __kernel void remapLNF1Constant_C4_D0(__global unsigned char* dst, __global unsi
float16 Vd = (float16)((float4)(vd.x), (float4)(vd.y), (float4)(vd.z), (float4)(vd.w));
uchar16 dst_data = convert_uchar16_sat_rte((convert_float16(a))* Ud * Vd +(convert_float16(b))* U * Vd + (convert_float16(c))* Ud * V + (convert_float16(d)) * U * V );
__global uchar16* D = (__global uchar16 *)(dst + dstStart);
uchar16 dVal = *D;
uchar16 dVal = *D;
int16 con = (Gx >= 0 && Gx < (dst_cols<<2) && y >= 0 && y < dst_rows);
dst_data = (convert_uchar16(con) != (uchar16)(0)) ? dst_data : dVal;
@@ -1039,7 +1039,7 @@ __kernel void remapLNSConstant_C4_D0(__global unsigned char* dst, __global unsig
dst_data = (uchar16)(dst_a, dst_b, dst_c, dst_d);
__global uchar16* d = (__global uchar16 *)(dst + dstStart);
uchar16 dVal = *d;
uchar16 dVal = *d;
int16 con = (Gx >= 0 && Gx < (dst_cols<<2) && y >= 0 && y < dst_rows);
dst_data = (convert_uchar16(con) != (uchar16)(0)) ? dst_data : dVal;
@@ -1059,13 +1059,13 @@ __kernel void remapLNFConstant_C1_D5(__global float* dst, __global float const *
int y = get_global_id(1);
if(x < threadCols && y < dst_rows)
{
x = x << 4;
x = x << 4;
int gx = x - (dst_offset&15);
int4 Gx = (int4)(gx, gx+4, gx+8, gx+12);
float4 nval =convert_float4(nVal);
float4 val = (float4)(nval.s0);
int dstStart = (y * dst_step + x + dst_offset) - (dst_offset&15);
int map1Start = y * map1_step + (x << 1) + map1_offset - ((dst_offset & 15) << 1);
float8 map1_data;
@@ -1087,7 +1087,7 @@ __kernel void remapLNFConstant_C1_D5(__global float* dst, __global float const *
int4 src_StartU = map1_dataDy * src_step + (map1_dataDx << (int4)(2)) + src_offset;
int4 src_StartD = src_StartU + src_step;
/*
/*
//not using the vload
int4 src_StartU1 = src_StartU + (int4)(1);
int4 src_StartD1 = src_StartD + (int4)(1);
@@ -1097,7 +1097,7 @@ __kernel void remapLNFConstant_C1_D5(__global float* dst, __global float const *
a.y = *(src_StartU.y + src);
a.z = *(src_StartU.z + src);
a.w = *(src_StartU.w + src);
b.x = *(src_StartU1.x + src);
b.y = *(src_StartU1.y + src);
b.z = *(src_StartU1.z + src);
@@ -1129,7 +1129,7 @@ __kernel void remapLNFConstant_C1_D5(__global float* dst, __global float const *
b = (float4)(aU.y, bU.y, cU.y, dU.y);
c = (float4)(aD.x, bD.x, cD.x, dD.x);
d = (float4)(aD.y, bD.y, cD.y, dD.y);
int4 ac =(map1_dataDx >= (int4)(src_cols) || map1_dataDy >= (int4)(src_rows) || map1_dataDy < (int4)(0) || map1_dataDy < (int4)(0));
int4 bc =(map1_dataDx1 >= (int4)(src_cols) || map1_dataDy >= (int4)(src_rows) || map1_dataDx1 < (int4)(0) || map1_dataDy < (int4)(0));
int4 cc =(map1_dataDx >= (int4)(src_cols) || map1_dataDy1 >= (int4)(src_rows) || map1_dataDy1 < (int4)(0) || map1_dataDx < (int4)(0));
@@ -1140,10 +1140,10 @@ __kernel void remapLNFConstant_C1_D5(__global float* dst, __global float const *
d = (convert_float4(dc) == (float4)(0))? d : val;
float4 dst_data = a * ud * vd + b * u * vd + c * ud * v + d * u * v ;
__global float4* D = (__global float4 *)((__global char*)dst + dstStart);
float4 dVal = *D;
float4 dVal = *D;
int4 con = (Gx >= 0 && Gx < (dst_cols << 2) && y >= 0 && y < dst_rows);
dst_data = (convert_float4(con) != (float4)(0)) ? dst_data : dVal;
@@ -1160,13 +1160,13 @@ __kernel void remapLNF1Constant_C1_D5(__global float* dst, __global float const
int y = get_global_id(1);
if(x < threadCols && y < dst_rows)
{
x = x << 4;
x = x << 4;
int gx = x - (dst_offset&15);
int4 Gx = (int4)(gx, gx+4, gx+8, gx+12);
float4 nval =convert_float4(nVal);
float4 val = (float4)(nval.s0);
int dstStart = y * dst_step + x + dst_offset - (dst_offset & 15);
int map1Start = y * map1_step + x + map1_offset - (dst_offset & 15);
float4 map1_data;
@@ -1191,7 +1191,7 @@ __kernel void remapLNF1Constant_C1_D5(__global float* dst, __global float const
int4 src_StartU = map1_dataDy * src_step + (map1_dataDx << (int4)(2)) + src_offset;
int4 src_StartD = src_StartU + src_step;
/*
/*
//not using the vload
int4 src_StartU1 = src_StartU + (int4)(1);
int4 src_StartD1 = src_StartD + (int4)(1);
@@ -1201,7 +1201,7 @@ __kernel void remapLNF1Constant_C1_D5(__global float* dst, __global float const
a.y = *(src_StartU.y + src);
a.z = *(src_StartU.z + src);
a.w = *(src_StartU.w + src);
b.x = *(src_StartU1.x + src);
b.y = *(src_StartU1.y + src);
b.z = *(src_StartU1.z + src);
@@ -1233,7 +1233,7 @@ __kernel void remapLNF1Constant_C1_D5(__global float* dst, __global float const
b = (float4)(aU.y, bU.y, cU.y, dU.y);
c = (float4)(aD.x, bD.x, cD.x, dD.x);
d = (float4)(aD.y, bD.y, cD.y, dD.y);
int4 ac =(map1_dataDx >= (int4)(src_cols) || map1_dataDy >= (int4)(src_rows) || map1_dataDy < (int4)(0) || map1_dataDy < (int4)(0));
int4 bc =(map1_dataDx1 >= (int4)(src_cols) || map1_dataDy >= (int4)(src_rows) || map1_dataDx1 < (int4)(0) || map1_dataDy < (int4)(0));
int4 cc =(map1_dataDx >= (int4)(src_cols) || map1_dataDy1 >= (int4)(src_rows) || map1_dataDy1 < (int4)(0) || map1_dataDx < (int4)(0));
@@ -1244,10 +1244,10 @@ __kernel void remapLNF1Constant_C1_D5(__global float* dst, __global float const
d = (convert_float4(dc) == (float4)(0))? d : val;
float4 dst_data = a * ud * vd + b * u * vd + c * ud * v + d * u * v ;
__global float4* D = (__global float4 *)((__global char*)dst + dstStart);
float4 dVal = *D;
float4 dVal = *D;
int4 con = (Gx >= 0 && Gx < (dst_cols << 2) && y >= 0 && y < dst_rows);
dst_data = (convert_float4(con) != (float4)(0)) ? dst_data : dVal;
@@ -1261,9 +1261,9 @@ __kernel void remapLNSConstant_C1_D5(__global float* dst, __global float const *
{
int x = get_global_id(0);
int y = get_global_id(1);
if(x < threadCols && y < dst_rows)
{
{
x = x << 4;
int gx = x - (dst_offset&15);
@@ -1278,25 +1278,25 @@ __kernel void remapLNSConstant_C1_D5(__global float* dst, __global float const *
short8 map1_data;
map1_data = *((__global short8 *)((__global char*)map1 + map1Start));
int4 srcIdx = convert_int4(map1_data.odd) * src_step + (convert_int4(map1_data.even) << (int4)(2)) + src_offset;
float4 src_data;
src_data.s0 = *((__global float *)((__global char*)src + srcIdx.s0));
src_data.s1 = *((__global float *)((__global char*)src + srcIdx.s1));
src_data.s2 = *((__global float *)((__global char*)src + srcIdx.s2));
src_data.s3 = *((__global float *)((__global char*)src + srcIdx.s3));
float4 dst_data;
dst_data.s0 = (map1_data.s0 >= src_cols || map1_data.s1 >= src_rows)? val : src_data.s0;
dst_data.s1 = (map1_data.s2 >= src_cols || map1_data.s3 >= src_rows)? val : src_data.s1;
dst_data.s2 = (map1_data.s4 >= src_cols || map1_data.s5 >= src_rows)? val : src_data.s2;
dst_data.s3 = (map1_data.s6 >= src_cols || map1_data.s7 >= src_rows)? val : src_data.s3;
__global float4* d = (__global float4 *)((__global uchar*)dst + dstStart);
float4 dVal = *d;
float4 dVal = *d;
int4 con = (Gx >= 0 && Gx < (dst_cols<<2) && y >= 0 && y < dst_rows);
dst_data = (convert_float4(con) != (float4)(0)) ? dst_data : dVal;
@@ -1348,7 +1348,7 @@ __kernel void remapLNFConstant_C4_D5(__global float * dst, __global float const
c = (mX >= src_cols || mY1 >= src_rows ) ? nval : c;
d = (mX1 >= src_cols || mY1 >= src_rows ) ? nval : d;
float4 dst_data = a * ud * vd + b * u * vd + c * ud * v + d * u * v;
float4 dst_data = a * ud * vd + b * u * vd + c * ud * v + d * u * v;
*((__global float4 *)((__global uchar*)dst + dstIdx)) = a * ud * vd + b * u * vd + c * ud * v + d * u * v ;
}
@@ -1395,7 +1395,7 @@ __kernel void remapLNF1Constant_C4_D5(__global float * dst, __global float const
c = (mX >= src_cols || mY1 >= src_rows ) ? nval : c;
d = (mX1 >= src_cols || mY1 >= src_rows ) ? nval : d;
float4 dst_data = a * ud * vd + b * u * vd + c * ud * v + d * u * v;
float4 dst_data = a * ud * vd + b * u * vd + c * ud * v + d * u * v;
*((__global float4 *)((__global uchar*)dst + dstIdx)) = a * ud * vd + b * u * vd + c * ud * v + d * u * v ;
}
@@ -1430,8 +1430,8 @@ __kernel void remapNNSConstant_C1_D0(__global unsigned char* dst, __read_only im
short8 map1_data;
map1_data = *((__global short8 *)((__global char*)map1 + map1Start));
const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE |
const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE |
CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;
int4 src_data;
@@ -1448,7 +1448,7 @@ __kernel void remapNNSConstant_C1_D0(__global unsigned char* dst, __read_only im
int4 con = (Gx >= 0 && Gx < dst_cols && y >= 0 && y < dst_rows);
dst_data = (convert_uchar4(con) != (uchar4)(0)) ? dst_data : dVal;
*d = dst_data;
*d = dst_data;
}
}
*/

View File

@@ -44,14 +44,14 @@
//M*/
// resize kernel
// resize kernel
// Currently, CV_8UC1 CV_8UC4 CV_32FC1 and CV_32FC4are supported.
// We shall support other types later if necessary.
#if defined DOUBLE_SUPPORT
#pragma OPENCL EXTENSION cl_khr_fp64:enable
#define F double
#else
#else
#define F float
#endif
@@ -63,12 +63,12 @@
#define INC(x,l) ((x+1) >= (l) ? (x):((x)+1))
__kernel void resizeLN_C1_D0(__global uchar * dst, __global uchar const * restrict src,
int dstoffset_in_pixel, int srcoffset_in_pixel,int dststep_in_pixel, int srcstep_in_pixel,
int dstoffset_in_pixel, int srcoffset_in_pixel,int dststep_in_pixel, int srcstep_in_pixel,
int src_cols, int src_rows, int dst_cols, int dst_rows, float ifx, float ify )
{
int gx = get_global_id(0);
int dy = get_global_id(1);
float4 sx, u, xf;
int4 x, DX;
gx = (gx<<2) - (dstoffset_in_pixel&3);
@@ -80,15 +80,15 @@ __kernel void resizeLN_C1_D0(__global uchar * dst, __global uchar const * restri
float sy = ((dy+0.5f) * ify - 0.5f);
int y = floor(sy);
float v = sy - y;
u = x < 0 ? 0 : u;
u = (x >= src_cols) ? 0 : u;
x = x < 0 ? 0 : x;
x = (x >= src_cols) ? src_cols-1 : x;
y<0 ? y=0,v=0 : y;
y>=src_rows ? y=src_rows-1,v=0 : y;
int4 U, U1;
int V, V1;
float4 utmp1, utmp2;
@@ -96,8 +96,8 @@ __kernel void resizeLN_C1_D0(__global uchar * dst, __global uchar const * restri
float4 scale_vec = INTER_RESIZE_COEF_SCALE;
utmp1 = u * scale_vec;
utmp2 = scale_vec - utmp1;
U = convert_int4(rint(utmp1));
U1 = convert_int4(rint(utmp2));
U = convert_int4(rint(utmp1));
U1 = convert_int4(rint(utmp2));
vtmp = v * INTER_RESIZE_COEF_SCALE;
V = rint(vtmp);
V1= rint(INTER_RESIZE_COEF_SCALE - vtmp);
@@ -137,42 +137,42 @@ __kernel void resizeLN_C1_D0(__global uchar * dst, __global uchar const * restri
val1 = mul24(U1 , sdata1) + mul24(U , sdata2);
val2 = mul24(U1 , sdata3) + mul24(U , sdata4);
val = mul24((int4)V1 , val1) + mul24((int4)V , val2);
val = ((val + (1<<(CAST_BITS-1))) >> CAST_BITS);
pos4 = mad24(dy, dststep_in_pixel, gx+dstoffset_in_pixel);
pos4.y++;
pos4.z+=2;
pos4.w+=3;
uchar4 uval = convert_uchar4_sat(val);
pos4 = mad24(dy, dststep_in_pixel, gx+dstoffset_in_pixel);
pos4.y++;
pos4.z+=2;
pos4.w+=3;
uchar4 uval = convert_uchar4_sat(val);
int con = (gx >= 0 && gx+3 < dst_cols && dy >= 0 && dy < dst_rows && (dstoffset_in_pixel&3)==0);
if(con)
{
*(__global uchar4*)(dst + pos4.x)=uval;
}
else
{
if(gx >= 0 && gx < dst_cols && dy >= 0 && dy < dst_rows)
{
dst[pos4.x]=uval.x;
}
if(gx+1 >= 0 && gx+1 < dst_cols && dy >= 0 && dy < dst_rows)
{
dst[pos4.y]=uval.y;
}
if(gx+2 >= 0 && gx+2 < dst_cols && dy >= 0 && dy < dst_rows)
{
dst[pos4.z]=uval.z;
}
if(gx+3 >= 0 && gx+3 < dst_cols && dy >= 0 && dy < dst_rows)
{
dst[pos4.w]=uval.w;
}
}
if(con)
{
*(__global uchar4*)(dst + pos4.x)=uval;
}
else
{
if(gx >= 0 && gx < dst_cols && dy >= 0 && dy < dst_rows)
{
dst[pos4.x]=uval.x;
}
if(gx+1 >= 0 && gx+1 < dst_cols && dy >= 0 && dy < dst_rows)
{
dst[pos4.y]=uval.y;
}
if(gx+2 >= 0 && gx+2 < dst_cols && dy >= 0 && dy < dst_rows)
{
dst[pos4.z]=uval.z;
}
if(gx+3 >= 0 && gx+3 < dst_cols && dy >= 0 && dy < dst_rows)
{
dst[pos4.w]=uval.w;
}
}
}
__kernel void resizeLN_C4_D0(__global uchar4 * dst, __global uchar4 * src,
int dstoffset_in_pixel, int srcoffset_in_pixel,int dststep_in_pixel, int srcstep_in_pixel,
int dstoffset_in_pixel, int srcoffset_in_pixel,int dststep_in_pixel, int srcstep_in_pixel,
int src_cols, int src_rows, int dst_cols, int dst_rows, float ifx, float ify )
{
int dx = get_global_id(0);
@@ -186,10 +186,10 @@ __kernel void resizeLN_C4_D0(__global uchar4 * dst, __global uchar4 * src,
x>=src_cols ? x=src_cols-1,u=0 : x,u;
y<0 ? y=0,v=0 : y,v;
y>=src_rows ? y=src_rows-1,v=0 : y,v;
u = u * INTER_RESIZE_COEF_SCALE;
v = v * INTER_RESIZE_COEF_SCALE;
int U = rint(u);
int V = rint(v);
int U1= rint(INTER_RESIZE_COEF_SCALE - u);
@@ -197,25 +197,25 @@ __kernel void resizeLN_C4_D0(__global uchar4 * dst, __global uchar4 * src,
int y_ = INC(y,src_rows);
int x_ = INC(x,src_cols);
int4 srcpos;
srcpos.x = mad24(y, srcstep_in_pixel, x+srcoffset_in_pixel);
srcpos.y = mad24(y, srcstep_in_pixel, x_+srcoffset_in_pixel);
srcpos.z = mad24(y_, srcstep_in_pixel, x+srcoffset_in_pixel);
srcpos.w = mad24(y_, srcstep_in_pixel, x_+srcoffset_in_pixel);
int4 srcpos;
srcpos.x = mad24(y, srcstep_in_pixel, x+srcoffset_in_pixel);
srcpos.y = mad24(y, srcstep_in_pixel, x_+srcoffset_in_pixel);
srcpos.z = mad24(y_, srcstep_in_pixel, x+srcoffset_in_pixel);
srcpos.w = mad24(y_, srcstep_in_pixel, x_+srcoffset_in_pixel);
int4 data0 = convert_int4(src[srcpos.x]);
int4 data1 = convert_int4(src[srcpos.y]);
int4 data2 = convert_int4(src[srcpos.z]);
int4 data3 = convert_int4(src[srcpos.w]);
int4 val = mul24((int4)mul24(U1, V1) , data0) + mul24((int4)mul24(U, V1) , data1)
+mul24((int4)mul24(U1, V) , data2)+mul24((int4)mul24(U, V) , data3);
int dstpos = mad24(dy, dststep_in_pixel, dx+dstoffset_in_pixel);
int dstpos = mad24(dy, dststep_in_pixel, dx+dstoffset_in_pixel);
uchar4 uval = convert_uchar4((val + (1<<(CAST_BITS-1)))>>CAST_BITS);
if(dx>=0 && dx<dst_cols && dy>=0 && dy<dst_rows)
dst[dstpos] = uval;
}
__kernel void resizeLN_C1_D5(__global float * dst, __global float * src,
int dstoffset_in_pixel, int srcoffset_in_pixel,int dststep_in_pixel, int srcstep_in_pixel,
int dstoffset_in_pixel, int srcoffset_in_pixel,int dststep_in_pixel, int srcstep_in_pixel,
int src_cols, int src_rows, int dst_cols, int dst_rows, float ifx, float ify )
{
int dx = get_global_id(0);
@@ -229,16 +229,16 @@ __kernel void resizeLN_C1_D5(__global float * dst, __global float * src,
x>=src_cols ? x=src_cols-1,u=0 : x,u;
y<0 ? y=0,v=0 : y,v;
y>=src_rows ? y=src_rows-1,v=0 : y,v;
int y_ = INC(y,src_rows);
int x_ = INC(x,src_cols);
float u1 = 1.f-u;
float v1 = 1.f-v;
int4 srcpos;
srcpos.x = mad24(y, srcstep_in_pixel, x+srcoffset_in_pixel);
srcpos.y = mad24(y, srcstep_in_pixel, x_+srcoffset_in_pixel);
srcpos.z = mad24(y_, srcstep_in_pixel, x+srcoffset_in_pixel);
srcpos.w = mad24(y_, srcstep_in_pixel, x_+srcoffset_in_pixel);
float u1 = 1.f-u;
float v1 = 1.f-v;
int4 srcpos;
srcpos.x = mad24(y, srcstep_in_pixel, x+srcoffset_in_pixel);
srcpos.y = mad24(y, srcstep_in_pixel, x_+srcoffset_in_pixel);
srcpos.z = mad24(y_, srcstep_in_pixel, x+srcoffset_in_pixel);
srcpos.w = mad24(y_, srcstep_in_pixel, x_+srcoffset_in_pixel);
float data0 = src[srcpos.x];
float data1 = src[srcpos.y];
float data2 = src[srcpos.z];
@@ -248,13 +248,13 @@ __kernel void resizeLN_C1_D5(__global float * dst, __global float * src,
float val2 = u1 * data2 +
u * data3;
float val = v1 * val1 + v * val2;
int dstpos = mad24(dy, dststep_in_pixel, dx+dstoffset_in_pixel);
int dstpos = mad24(dy, dststep_in_pixel, dx+dstoffset_in_pixel);
if(dx>=0 && dx<dst_cols && dy>=0 && dy<dst_rows)
dst[dstpos] = val;
dst[dstpos] = val;
}
__kernel void resizeLN_C4_D5(__global float4 * dst, __global float4 * src,
int dstoffset_in_pixel, int srcoffset_in_pixel,int dststep_in_pixel, int srcstep_in_pixel,
int dstoffset_in_pixel, int srcoffset_in_pixel,int dststep_in_pixel, int srcstep_in_pixel,
int src_cols, int src_rows, int dst_cols, int dst_rows, float ifx, float ify )
{
int dx = get_global_id(0);
@@ -268,43 +268,43 @@ __kernel void resizeLN_C4_D5(__global float4 * dst, __global float4 * src,
x>=src_cols ? x=src_cols-1,u=0 : x;
y<0 ? y=0,v=0 : y;
y>=src_rows ? y=src_rows-1,v=0 : y;
int y_ = INC(y,src_rows);
int x_ = INC(x,src_cols);
float u1 = 1.f-u;
float v1 = 1.f-v;
int4 srcpos;
srcpos.x = mad24(y, srcstep_in_pixel, x+srcoffset_in_pixel);
srcpos.y = mad24(y, srcstep_in_pixel, x_+srcoffset_in_pixel);
srcpos.z = mad24(y_, srcstep_in_pixel, x+srcoffset_in_pixel);
srcpos.w = mad24(y_, srcstep_in_pixel, x_+srcoffset_in_pixel);
float u1 = 1.f-u;
float v1 = 1.f-v;
int4 srcpos;
srcpos.x = mad24(y, srcstep_in_pixel, x+srcoffset_in_pixel);
srcpos.y = mad24(y, srcstep_in_pixel, x_+srcoffset_in_pixel);
srcpos.z = mad24(y_, srcstep_in_pixel, x+srcoffset_in_pixel);
srcpos.w = mad24(y_, srcstep_in_pixel, x_+srcoffset_in_pixel);
float4 s_data1, s_data2, s_data3, s_data4;
s_data1 = src[srcpos.x];
s_data2 = src[srcpos.y];
s_data3 = src[srcpos.z];
s_data4 = src[srcpos.w];
float4 val = u1 * v1 * s_data1 + u * v1 * s_data2
+u1 * v *s_data3 + u * v *s_data4;
int dstpos = mad24(dy, dststep_in_pixel, dx+dstoffset_in_pixel);
+u1 * v *s_data3 + u * v *s_data4;
int dstpos = mad24(dy, dststep_in_pixel, dx+dstoffset_in_pixel);
if(dx>=0 && dx<dst_cols && dy>=0 && dy<dst_rows)
dst[dstpos] = val;
dst[dstpos] = val;
}
__kernel void resizeNN_C1_D0(__global uchar * dst, __global uchar * src,
int dstoffset_in_pixel, int srcoffset_in_pixel,int dststep_in_pixel, int srcstep_in_pixel,
int dstoffset_in_pixel, int srcoffset_in_pixel,int dststep_in_pixel, int srcstep_in_pixel,
int src_cols, int src_rows, int dst_cols, int dst_rows, F ifx, F ify )
{
int gx = get_global_id(0);
int dy = get_global_id(1);
gx = (gx<<2) - (dstoffset_in_pixel&3);
//int4 GX = (int4)(gx, gx+1, gx+2, gx+3);
int4 sx;
int sy;
F ss1 = gx*ifx;
F ss2 = (gx+1)*ifx;
F ss2 = (gx+1)*ifx;
F ss3 = (gx+2)*ifx;
F ss4 = (gx+3)*ifx;
F s5 = dy * ify;
@@ -313,87 +313,87 @@ __kernel void resizeNN_C1_D0(__global uchar * dst, __global uchar * src,
sx.s2 = min((int)floor(ss3), src_cols-1);
sx.s3 = min((int)floor(ss4), src_cols-1);
sy = min((int)floor(s5), src_rows-1);
uchar4 val;
int4 pos = mad24((int4)sy, (int4)srcstep_in_pixel, sx+(int4)srcoffset_in_pixel);
val.s0 = src[pos.s0];
val.s1 = src[pos.s1];
val.s2 = src[pos.s2];
val.s3 = src[pos.s3];
//__global uchar4* d = (__global uchar4*)(dst + dstoffset_in_pixel + dy * dststep_in_pixel + gx);
//uchar4 dVal = *d;
pos = mad24(dy, dststep_in_pixel, gx+dstoffset_in_pixel);
pos.y++;
pos.z+=2;
pos.w+=3;
pos = mad24(dy, dststep_in_pixel, gx+dstoffset_in_pixel);
pos.y++;
pos.z+=2;
pos.w+=3;
int con = (gx >= 0 && gx+3 < dst_cols && dy >= 0 && dy < dst_rows && (dstoffset_in_pixel&3)==0);
if(con)
{
*(__global uchar4*)(dst + pos.x)=val;
}
else
{
if(gx >= 0 && gx < dst_cols && dy >= 0 && dy < dst_rows)
{
dst[pos.x]=val.x;
}
if(gx+1 >= 0 && gx+1 < dst_cols && dy >= 0 && dy < dst_rows)
{
dst[pos.y]=val.y;
}
if(gx+2 >= 0 && gx+2 < dst_cols && dy >= 0 && dy < dst_rows)
{
dst[pos.z]=val.z;
}
if(gx+3 >= 0 && gx+3 < dst_cols && dy >= 0 && dy < dst_rows)
{
dst[pos.w]=val.w;
}
}
if(con)
{
*(__global uchar4*)(dst + pos.x)=val;
}
else
{
if(gx >= 0 && gx < dst_cols && dy >= 0 && dy < dst_rows)
{
dst[pos.x]=val.x;
}
if(gx+1 >= 0 && gx+1 < dst_cols && dy >= 0 && dy < dst_rows)
{
dst[pos.y]=val.y;
}
if(gx+2 >= 0 && gx+2 < dst_cols && dy >= 0 && dy < dst_rows)
{
dst[pos.z]=val.z;
}
if(gx+3 >= 0 && gx+3 < dst_cols && dy >= 0 && dy < dst_rows)
{
dst[pos.w]=val.w;
}
}
}
__kernel void resizeNN_C4_D0(__global uchar4 * dst, __global uchar4 * src,
int dstoffset_in_pixel, int srcoffset_in_pixel,int dststep_in_pixel, int srcstep_in_pixel,
int dstoffset_in_pixel, int srcoffset_in_pixel,int dststep_in_pixel, int srcstep_in_pixel,
int src_cols, int src_rows, int dst_cols, int dst_rows, F ifx, F ify )
{
int dx = get_global_id(0);
int dy = get_global_id(1);
F s1 = dx*ifx;
F s2 = dy*ify;
int sx = fmin((float)floor(s1), (float)src_cols-1);
int sy = fmin((float)floor(s2), (float)src_rows-1);
int dpos = mad24(dy, dststep_in_pixel, dx + dstoffset_in_pixel);
int spos = mad24(sy, srcstep_in_pixel, sx + srcoffset_in_pixel);
if(dx>=0 && dx<dst_cols && dy>=0 && dy<dst_rows)
dst[dpos] = src[spos];
}
__kernel void resizeNN_C1_D5(__global float * dst, __global float * src,
int dstoffset_in_pixel, int srcoffset_in_pixel,int dststep_in_pixel, int srcstep_in_pixel,
int dstoffset_in_pixel, int srcoffset_in_pixel,int dststep_in_pixel, int srcstep_in_pixel,
int src_cols, int src_rows, int dst_cols, int dst_rows, F ifx, F ify )
{
int dx = get_global_id(0);
int dy = get_global_id(1);
F s1 = dx*ifx;
F s2 = dy*ify;
int sx = fmin((float)floor(s1), (float)src_cols-1);
int sy = fmin((float)floor(s2), (float)src_rows-1);
int dpos = mad24(dy, dststep_in_pixel, dx + dstoffset_in_pixel);
int spos = mad24(sy, srcstep_in_pixel, sx + srcoffset_in_pixel);
int spos = mad24(sy, srcstep_in_pixel, sx + srcoffset_in_pixel);
if(dx>=0 && dx<dst_cols && dy>=0 && dy<dst_rows)
dst[dpos] = src[spos];
}
__kernel void resizeNN_C4_D5(__global float4 * dst, __global float4 * src,
int dstoffset_in_pixel, int srcoffset_in_pixel,int dststep_in_pixel, int srcstep_in_pixel,
int dstoffset_in_pixel, int srcoffset_in_pixel,int dststep_in_pixel, int srcstep_in_pixel,
int src_cols, int src_rows, int dst_cols, int dst_rows, F ifx, F ify )
{
int dx = get_global_id(0);
@@ -406,9 +406,9 @@ __kernel void resizeNN_C4_D5(__global float4 * dst, __global float4 * src,
int sy = min(s_row, src_rows-1);
int dpos = mad24(dy, dststep_in_pixel, dx + dstoffset_in_pixel);
int spos = mad24(sy, srcstep_in_pixel, sx + srcoffset_in_pixel);
if(dx>=0 && dx<dst_cols && dy>=0 && dy<dst_rows)
dst[dpos] = src[spos];
}

View File

@@ -51,7 +51,7 @@
// enum { THRESH_BINARY=0, THRESH_BINARY_INV=1, THRESH_TRUNC=2, THRESH_TOZERO=3,
// THRESH_TOZERO_INV=4, THRESH_MASK=7, THRESH_OTSU=8 };
__kernel void threshold_C1_D0(__global const uchar * restrict src, __global uchar *dst,
__kernel void threshold_C1_D0(__global const uchar * restrict src, __global uchar *dst,
int src_offset, int src_step,
int dst_offset, int dst_rows, int dst_cols, int dst_step,
uchar thresh, uchar max_val, int thresh_type
@@ -60,15 +60,15 @@ __kernel void threshold_C1_D0(__global const uchar * restrict src, __global ucha
int gx = get_global_id(0);
const int gy = get_global_id(1);
int offset = (dst_offset & 15);
src_offset -= offset;
int dstart = (gx << 4) - offset;
int offset = (dst_offset & 15);
src_offset -= offset;
int dstart = (gx << 4) - offset;
if(dstart < dst_cols && gy < dst_rows)
{
uchar16 sdata = vload16(gx, src+src_offset+gy*src_step);
uchar16 sdata = vload16(gx, src+src_offset+gy*src_step);
uchar16 ddata;
uchar16 zero = 0;
uchar16 zero = 0;
switch (thresh_type)
{
case 0:
@@ -89,20 +89,20 @@ __kernel void threshold_C1_D0(__global const uchar * restrict src, __global ucha
default:
ddata = sdata;
}
int16 dpos = (int16)(dstart, dstart+1, dstart+2, dstart+3, dstart+4, dstart+5, dstart+6, dstart+7, dstart+8,
dstart+9, dstart+10, dstart+11, dstart+12, dstart+13, dstart+14, dstart+15);
uchar16 dVal = *(__global uchar16*)(dst+dst_offset+gy*dst_step+dstart);
int16 con = dpos >= 0 && dpos < dst_cols;
ddata = convert_uchar16(con != 0) ? ddata : dVal;
if(dstart < dst_cols)
{
*(__global uchar16*)(dst+dst_offset+gy*dst_step+dstart) = ddata;
}
int16 dpos = (int16)(dstart, dstart+1, dstart+2, dstart+3, dstart+4, dstart+5, dstart+6, dstart+7, dstart+8,
dstart+9, dstart+10, dstart+11, dstart+12, dstart+13, dstart+14, dstart+15);
uchar16 dVal = *(__global uchar16*)(dst+dst_offset+gy*dst_step+dstart);
int16 con = dpos >= 0 && dpos < dst_cols;
ddata = convert_uchar16(con != 0) ? ddata : dVal;
if(dstart < dst_cols)
{
*(__global uchar16*)(dst+dst_offset+gy*dst_step+dstart) = ddata;
}
}
}
__kernel void threshold_C1_D5(__global const float * restrict src, __global float *dst,
__kernel void threshold_C1_D5(__global const float * restrict src, __global float *dst,
int src_offset, int src_step,
int dst_offset, int dst_rows, int dst_cols, int dst_step,
float thresh, float max_val, int thresh_type
@@ -110,16 +110,16 @@ __kernel void threshold_C1_D5(__global const float * restrict src, __global floa
{
const int gx = get_global_id(0);
const int gy = get_global_id(1);
int offset = (dst_offset & 3);
src_offset -= offset;
int dstart = (gx << 2) - offset;
int offset = (dst_offset & 3);
src_offset -= offset;
int dstart = (gx << 2) - offset;
if(dstart < dst_cols && gy < dst_rows)
{
float4 sdata = vload4(gx, src+src_offset+gy*src_step);
float4 ddata;
float4 zero = 0;
float4 zero = 0;
switch (thresh_type)
{
case 0:
@@ -140,14 +140,14 @@ __kernel void threshold_C1_D5(__global const float * restrict src, __global floa
default:
ddata = sdata;
}
int4 dpos = (int4)(dstart, dstart+1, dstart+2, dstart+3);
float4 dVal = *(__global float4*)(dst+dst_offset+gy*dst_step+dstart);
int4 con = dpos >= 0 && dpos < dst_cols;
ddata = convert_float4(con) != 0 ? ddata : dVal;
if(dstart < dst_cols)
{
*(__global float4*)(dst+dst_offset+gy*dst_step+dstart) = ddata;
}
int4 dpos = (int4)(dstart, dstart+1, dstart+2, dstart+3);
float4 dVal = *(__global float4*)(dst+dst_offset+gy*dst_step+dstart);
int4 con = dpos >= 0 && dpos < dst_cols;
ddata = convert_float4(con) != 0 ? ddata : dVal;
if(dstart < dst_cols)
{
*(__global float4*)(dst+dst_offset+gy*dst_step+dstart) = ddata;
}
}
}

View File

@@ -52,7 +52,7 @@
typedef double F;
typedef double4 F4;
#define convert_F4 convert_double4
#else
#else
typedef float F;
typedef float4 F4;
#define convert_F4 convert_float4
@@ -61,9 +61,9 @@ typedef float4 F4;
#define INTER_BITS 5
#define INTER_TAB_SIZE (1 << INTER_BITS)
#define INTER_SCALE 1.f/INTER_TAB_SIZE
#define AB_BITS max(10, (int)INTER_BITS)
#define AB_SCALE (1 << AB_BITS)
#define INTER_SCALE 1.f/INTER_TAB_SIZE
#define AB_BITS max(10, (int)INTER_BITS)
#define AB_SCALE (1 << AB_BITS)
#define INTER_REMAP_COEF_BITS 15
#define INTER_REMAP_COEF_SCALE (1 << INTER_REMAP_COEF_BITS)
@@ -81,7 +81,7 @@ inline void interpolateCubic( float x, float* coeffs )
/**********************************************8UC1*********************************************
***********************************************************************************************/
__kernel void warpAffineNN_C1_D0(__global uchar const * restrict src, __global uchar * dst, int src_cols, int src_rows,
int dst_cols, int dst_rows, int srcStep, int dstStep,
int dst_cols, int dst_rows, int srcStep, int dstStep,
int src_offset, int dst_offset, __constant F * M, int threadCols )
{
int dx = get_global_id(0);
@@ -90,9 +90,9 @@ __kernel void warpAffineNN_C1_D0(__global uchar const * restrict src, __global u
if( dx < threadCols && dy < dst_rows)
{
dx = (dx<<2) - (dst_offset&3);
int round_delta = (AB_SCALE>>1);
int4 X, Y;
int4 sx, sy;
int4 DX = (int4)(dx, dx+1, dx+2, dx+3);
@@ -105,13 +105,13 @@ __kernel void warpAffineNN_C1_D0(__global uchar const * restrict src, __global u
int tmp1, tmp2;
tmp1 = rint((M[1]*dy + M[2]) * AB_SCALE);
tmp2 = rint((M[4]*dy + M[5]) * AB_SCALE);
X += tmp1 + round_delta;
Y += tmp2 + round_delta;
sx = convert_int4(convert_short4(X >> AB_BITS));
sy = convert_int4(convert_short4(Y >> AB_BITS));
__global uchar4 * d = (__global uchar4 *)(dst+dst_offset+dy*dstStep+dx);
uchar4 dval = *d;
DX = (int4)(dx, dx+1, dx+2, dx+3);
@@ -129,7 +129,7 @@ __kernel void warpAffineNN_C1_D0(__global uchar const * restrict src, __global u
}
__kernel void warpAffineLinear_C1_D0(__global const uchar * restrict src, __global uchar * dst, int src_cols, int src_rows,
int dst_cols, int dst_rows, int srcStep, int dstStep,
int dst_cols, int dst_rows, int srcStep, int dstStep,
int src_offset, int dst_offset, __constant F * M, int threadCols )
{
int dx = get_global_id(0);
@@ -139,9 +139,9 @@ __kernel void warpAffineLinear_C1_D0(__global const uchar * restrict src, __glob
if( dx < threadCols && dy < dst_rows)
{
dx = (dx<<2) - (dst_offset&3);
int round_delta = ((AB_SCALE >> INTER_BITS) >> 1);
int4 X, Y;
short4 ax, ay;
int4 sx, sy;
@@ -152,22 +152,22 @@ __kernel void warpAffineLinear_C1_D0(__global const uchar * restrict src, __glob
M3DX = M[3] * convert_F4(DX);
X = convert_int4(rint(M0DX));
Y = convert_int4(rint(M3DX));
int tmp1, tmp2;
tmp1 = rint((M[1]*dy + M[2]) * AB_SCALE);
tmp2 = rint((M[4]*dy + M[5]) * AB_SCALE);
X += tmp1 + round_delta;
Y += tmp2 + round_delta;
X = X >> (AB_BITS - INTER_BITS);
Y = Y >> (AB_BITS - INTER_BITS);
sx = convert_int4(convert_short4(X >> INTER_BITS));
sy = convert_int4(convert_short4(Y >> INTER_BITS));
ax = convert_short4(X & (INTER_TAB_SIZE-1));
ay = convert_short4(Y & (INTER_TAB_SIZE-1));
uchar4 v0, v1, v2,v3;
int4 scon0, scon1, scon2, scon3;
int4 spos0, spos1, spos2, spos3;
@@ -200,12 +200,12 @@ __kernel void warpAffineLinear_C1_D0(__global const uchar * restrict src, __glob
v1.s3 = scon1.s3 ? src[spos1.s3] : 0;
v2.s3 = scon2.s3 ? src[spos2.s3] : 0;
v3.s3 = scon3.s3 ? src[spos3.s3] : 0;
short4 itab0, itab1, itab2, itab3;
float4 taby, tabx;
taby = INTER_SCALE * convert_float4(ay);
tabx = INTER_SCALE * convert_float4(ax);
itab0 = convert_short4_sat(( (1.0f-taby)*(1.0f-tabx) * INTER_REMAP_COEF_SCALE ));
itab1 = convert_short4_sat(( (1.0f-taby)*tabx * INTER_REMAP_COEF_SCALE ));
itab2 = convert_short4_sat(( taby*(1.0f-tabx) * INTER_REMAP_COEF_SCALE ));
@@ -214,30 +214,30 @@ __kernel void warpAffineLinear_C1_D0(__global const uchar * restrict src, __glob
int4 val;
uchar4 tval;
val = convert_int4(v0) * convert_int4(itab0) + convert_int4(v1) * convert_int4(itab1)
val = convert_int4(v0) * convert_int4(itab0) + convert_int4(v1) * convert_int4(itab1)
+ convert_int4(v2) * convert_int4(itab2) + convert_int4(v3) * convert_int4(itab3);
tval = convert_uchar4_sat ( (val + (1 << (INTER_REMAP_COEF_BITS-1))) >> INTER_REMAP_COEF_BITS ) ;
__global uchar4 * d =(__global uchar4 *)(dst+dst_offset+dy*dstStep+dx);
uchar4 dval = *d;
DX = (int4)(dx, dx+1, dx+2, dx+3);
int4 dcon = DX >= 0 && DX < dst_cols && dy >= 0 && dy < dst_rows;
dval = convert_uchar4(dcon != 0) ? tval : dval;
*d = dval;
}
}
}
__kernel void warpAffineCubic_C1_D0(__global uchar * src, __global uchar * dst, int src_cols, int src_rows,
int dst_cols, int dst_rows, int srcStep, int dstStep,
int dst_cols, int dst_rows, int srcStep, int dstStep,
int src_offset, int dst_offset, __constant F * M, int threadCols )
{
int dx = get_global_id(0);
int dy = get_global_id(1);
if( dx < threadCols && dy < dst_rows)
{
int round_delta = ((AB_SCALE>>INTER_BITS)>>1);
int X0 = rint(M[0] * dx * AB_SCALE);
int Y0 = rint(M[3] * dx * AB_SCALE);
X0 += rint((M[1]*dy + M[2]) * AB_SCALE) + round_delta;
@@ -249,10 +249,10 @@ __kernel void warpAffineCubic_C1_D0(__global uchar * src, __global uchar * dst,
short sy = (short)(Y >> INTER_BITS) - 1;
short ay = (short)(Y & (INTER_TAB_SIZE-1));
short ax = (short)(X & (INTER_TAB_SIZE-1));
uchar v[16];
int i, j;
#pragma unroll 4
for(i=0; i<4; i++)
for(j=0; j<4; j++)
@@ -269,14 +269,14 @@ __kernel void warpAffineCubic_C1_D0(__global uchar * src, __global uchar * dst,
interpolateCubic(ayy, tab1y);
interpolateCubic(axx, tab1x);
int isum = 0;
#pragma unroll 16
for( i=0; i<16; i++ )
{
F v = tab1y[(i>>2)] * tab1x[(i&3)];
isum += itab[i] = convert_short_sat( rint( v * INTER_REMAP_COEF_SCALE ) );
}
if( isum != INTER_REMAP_COEF_SCALE )
{
int k1, k2;
@@ -309,16 +309,16 @@ __kernel void warpAffineCubic_C1_D0(__global uchar * src, __global uchar * dst,
***********************************************************************************************/
__kernel void warpAffineNN_C4_D0(__global uchar4 const * restrict src, __global uchar4 * dst, int src_cols, int src_rows,
int dst_cols, int dst_rows, int srcStep, int dstStep,
int dst_cols, int dst_rows, int srcStep, int dstStep,
int src_offset, int dst_offset, __constant F * M, int threadCols )
{
int dx = get_global_id(0);
int dy = get_global_id(1);
if( dx < threadCols && dy < dst_rows)
{
int round_delta = (AB_SCALE >> 1);
int X0 = rint(M[0] * dx * AB_SCALE);
int Y0 = rint(M[3] * dx * AB_SCALE);
X0 += rint((M[1]*dy + M[2]) * AB_SCALE) + round_delta;
@@ -326,26 +326,26 @@ __kernel void warpAffineNN_C4_D0(__global uchar4 const * restrict src, __global
int sx0 = (short)(X0 >> AB_BITS);
int sy0 = (short)(Y0 >> AB_BITS);
if(dx >= 0 && dx < dst_cols && dy >= 0 && dy < dst_rows)
dst[(dst_offset>>2)+dy*(dstStep>>2)+dx]= (sx0>=0 && sx0<src_cols && sy0>=0 && sy0<src_rows) ? src[(src_offset>>2)+sy0*(srcStep>>2)+sx0] : (uchar4)0;
dst[(dst_offset>>2)+dy*(dstStep>>2)+dx]= (sx0>=0 && sx0<src_cols && sy0>=0 && sy0<src_rows) ? src[(src_offset>>2)+sy0*(srcStep>>2)+sx0] : (uchar4)0;
}
}
__kernel void warpAffineLinear_C4_D0(__global uchar4 const * restrict src, __global uchar4 * dst, int src_cols, int src_rows,
int dst_cols, int dst_rows, int srcStep, int dstStep,
int dst_cols, int dst_rows, int srcStep, int dstStep,
int src_offset, int dst_offset, __constant F * M, int threadCols )
{
int dx = get_global_id(0);
int dy = get_global_id(1);
if( dx < threadCols && dy < dst_rows)
{
int round_delta = AB_SCALE/INTER_TAB_SIZE/2;
src_offset = (src_offset>>2);
srcStep = (srcStep>>2);
srcStep = (srcStep>>2);
int tmp = (dx << AB_BITS);
int X0 = rint(M[0] * tmp);
@@ -359,7 +359,7 @@ __kernel void warpAffineLinear_C4_D0(__global uchar4 const * restrict src, __glo
short sy0 = (short)(Y0 >> INTER_BITS);
short ax0 = (short)(X0 & (INTER_TAB_SIZE-1));
short ay0 = (short)(Y0 & (INTER_TAB_SIZE-1));
int4 v0, v1, v2, v3;
v0 = (sx0 >= 0 && sx0 < src_cols && sy0 >= 0 && sy0 < src_rows) ? convert_int4(src[src_offset+sy0 * srcStep + sx0]) : 0;
@@ -371,36 +371,36 @@ __kernel void warpAffineLinear_C4_D0(__global uchar4 const * restrict src, __glo
float taby, tabx;
taby = 1.f/INTER_TAB_SIZE*ay0;
tabx = 1.f/INTER_TAB_SIZE*ax0;
itab0 = convert_short_sat(rint( (1.0f-taby)*(1.0f-tabx) * INTER_REMAP_COEF_SCALE ));
itab1 = convert_short_sat(rint( (1.0f-taby)*tabx * INTER_REMAP_COEF_SCALE ));
itab2 = convert_short_sat(rint( taby*(1.0f-tabx) * INTER_REMAP_COEF_SCALE ));
itab3 = convert_short_sat(rint( taby*tabx * INTER_REMAP_COEF_SCALE ));
int4 val;
val = v0 * itab0 + v1 * itab1 + v2 * itab2 + v3 * itab3;
if(dx >= 0 && dx < dst_cols && dy >= 0 && dy < dst_rows)
dst[(dst_offset>>2)+dy*(dstStep>>2)+dx] = convert_uchar4_sat ( (val + (1 << (INTER_REMAP_COEF_BITS-1))) >> INTER_REMAP_COEF_BITS ) ;
}
}
__kernel void warpAffineCubic_C4_D0(__global uchar4 const * restrict src, __global uchar4 * dst, int src_cols, int src_rows,
int dst_cols, int dst_rows, int srcStep, int dstStep,
int dst_cols, int dst_rows, int srcStep, int dstStep,
int src_offset, int dst_offset, __constant F * M, int threadCols )
{
int dx = get_global_id(0);
int dy = get_global_id(1);
if( dx < threadCols && dy < dst_rows)
{
int round_delta = ((AB_SCALE>>INTER_BITS)>>1);
src_offset = (src_offset>>2);
srcStep = (srcStep>>2);
srcStep = (srcStep>>2);
dst_offset = (dst_offset>>2);
dstStep = (dstStep>>2);
dstStep = (dstStep>>2);
int tmp = (dx << AB_BITS);
int X0 = rint(M[0] * tmp);
int Y0 = rint(M[3] * tmp);
@@ -413,7 +413,7 @@ __kernel void warpAffineCubic_C4_D0(__global uchar4 const * restrict src, __glob
int sy = (short)(Y0 >> INTER_BITS) - 1;
int ay = (short)(Y0 & (INTER_TAB_SIZE-1));
int ax = (short)(X0 & (INTER_TAB_SIZE-1));
uchar4 v[16];
int i,j;
#pragma unroll 4
@@ -431,7 +431,7 @@ __kernel void warpAffineCubic_C4_D0(__global uchar4 const * restrict src, __glob
interpolateCubic(ayy, tab1y);
interpolateCubic(axx, tab1x);
int isum = 0;
#pragma unroll 16
for( i=0; i<16; i++ )
{
@@ -446,17 +446,17 @@ __kernel void warpAffineCubic_C4_D0(__global uchar4 const * restrict src, __glob
int k1, k2;
int diff = isum - INTER_REMAP_COEF_SCALE;
int Mk1=2, Mk2=2, mk1=2, mk2=2;
for( k1 = 2; k1 < 4; k1++ )
for( k2 = 2; k2 < 4; k2++ )
{
if( itab[(k1<<2)+k2] < itab[(mk1<<2)+mk2] )
mk1 = k1, mk2 = k2;
else if( itab[(k1<<2)+k2] > itab[(Mk1<<2)+Mk2] )
Mk1 = k1, Mk2 = k2;
}
diff<0 ? (itab[(Mk1<<2)+Mk2]=(short)(itab[(Mk1<<2)+Mk2]-diff)) : (itab[(mk1<<2)+mk2]=(short)(itab[(mk1<<2)+mk2]-diff));
}
@@ -477,16 +477,16 @@ __kernel void warpAffineCubic_C4_D0(__global uchar4 const * restrict src, __glob
***********************************************************************************************/
__kernel void warpAffineNN_C1_D5(__global float * src, __global float * dst, int src_cols, int src_rows,
int dst_cols, int dst_rows, int srcStep, int dstStep,
int dst_cols, int dst_rows, int srcStep, int dstStep,
int src_offset, int dst_offset, __constant F * M, int threadCols )
{
int dx = get_global_id(0);
int dy = get_global_id(1);
if( dx < threadCols && dy < dst_rows)
{
int round_delta = AB_SCALE/2;
int X0 = rint(M[0] * dx * AB_SCALE);
int Y0 = rint(M[3] * dx * AB_SCALE);
X0 += rint((M[1]*dy + M[2]) * AB_SCALE) + round_delta;
@@ -494,25 +494,25 @@ __kernel void warpAffineNN_C1_D5(__global float * src, __global float * dst, int
short sx0 = (short)(X0 >> AB_BITS);
short sy0 = (short)(Y0 >> AB_BITS);
if(dx >= 0 && dx < dst_cols && dy >= 0 && dy < dst_rows)
dst[(dst_offset>>2)+dy*dstStep+dx]= (sx0>=0 && sx0<src_cols && sy0>=0 && sy0<src_rows) ? src[(src_offset>>2)+sy0*srcStep+sx0] : 0;
dst[(dst_offset>>2)+dy*dstStep+dx]= (sx0>=0 && sx0<src_cols && sy0>=0 && sy0<src_rows) ? src[(src_offset>>2)+sy0*srcStep+sx0] : 0;
}
}
__kernel void warpAffineLinear_C1_D5(__global float * src, __global float * dst, int src_cols, int src_rows,
int dst_cols, int dst_rows, int srcStep, int dstStep,
int dst_cols, int dst_rows, int srcStep, int dstStep,
int src_offset, int dst_offset, __constant F * M, int threadCols )
{
int dx = get_global_id(0);
int dy = get_global_id(1);
if( dx < threadCols && dy < dst_rows)
{
int round_delta = AB_SCALE/INTER_TAB_SIZE/2;
src_offset = (src_offset>>2);
int X0 = rint(M[0] * dx * AB_SCALE);
int Y0 = rint(M[3] * dx * AB_SCALE);
X0 += rint((M[1]*dy + M[2]) * AB_SCALE) + round_delta;
@@ -524,7 +524,7 @@ __kernel void warpAffineLinear_C1_D5(__global float * src, __global float * dst,
short sy0 = (short)(Y0 >> INTER_BITS);
short ax0 = (short)(X0 & (INTER_TAB_SIZE-1));
short ay0 = (short)(Y0 & (INTER_TAB_SIZE-1));
float v0, v1, v2, v3;
v0 = (sx0 >= 0 && sx0 < src_cols && sy0 >= 0 && sy0 < src_rows) ? src[src_offset+sy0 * srcStep + sx0] : 0;
@@ -538,33 +538,33 @@ __kernel void warpAffineLinear_C1_D5(__global float * src, __global float * dst,
taby[1] = 1.f/INTER_TAB_SIZE*ay0;
tabx[0] = 1.0 - 1.f/INTER_TAB_SIZE*ax0;
tabx[1] = 1.f/INTER_TAB_SIZE*ax0;
tab[0] = taby[0] * tabx[0];
tab[1] = taby[0] * tabx[1];
tab[2] = taby[1] * tabx[0];
tab[3] = taby[1] * tabx[1];
float sum = 0;
sum += v0 * tab[0] + v1 * tab[1] + v2 * tab[2] + v3 * tab[3];
sum += v0 * tab[0] + v1 * tab[1] + v2 * tab[2] + v3 * tab[3];
if(dx >= 0 && dx < dst_cols && dy >= 0 && dy < dst_rows)
dst[(dst_offset>>2)+dy*dstStep+dx] = sum;
}
}
__kernel void warpAffineCubic_C1_D5(__global float * src, __global float * dst, int src_cols, int src_rows,
int dst_cols, int dst_rows, int srcStep, int dstStep,
int dst_cols, int dst_rows, int srcStep, int dstStep,
int src_offset, int dst_offset, __constant F * M, int threadCols )
{
int dx = get_global_id(0);
int dy = get_global_id(1);
if( dx < threadCols && dy < dst_rows)
{
int round_delta = AB_SCALE/INTER_TAB_SIZE/2;
src_offset = (src_offset>>2);
dst_offset = (dst_offset>>2);
int X0 = rint(M[0] * dx * AB_SCALE);
int Y0 = rint(M[3] * dx * AB_SCALE);
X0 += rint((M[1]*dy + M[2]) * AB_SCALE) + round_delta;
@@ -576,7 +576,7 @@ __kernel void warpAffineCubic_C1_D5(__global float * src, __global float * dst,
short sy = (short)(Y0 >> INTER_BITS) - 1;
short ay = (short)(Y0 & (INTER_TAB_SIZE-1));
short ax = (short)(X0 & (INTER_TAB_SIZE-1));
float v[16];
int i;
@@ -597,7 +597,7 @@ __kernel void warpAffineCubic_C1_D5(__global float * src, __global float * dst,
{
tab[i] = tab1y[(i>>2)] * tab1x[(i&3)];
}
if( dx >= 0 && dx < dst_cols && dy >= 0 && dy < dst_rows)
{
float sum = 0;
@@ -617,16 +617,16 @@ __kernel void warpAffineCubic_C1_D5(__global float * src, __global float * dst,
***********************************************************************************************/
__kernel void warpAffineNN_C4_D5(__global float4 * src, __global float4 * dst, int src_cols, int src_rows,
int dst_cols, int dst_rows, int srcStep, int dstStep,
int dst_cols, int dst_rows, int srcStep, int dstStep,
int src_offset, int dst_offset, __constant F * M, int threadCols )
{
int dx = get_global_id(0);
int dy = get_global_id(1);
if( dx < threadCols && dy < dst_rows)
{
int round_delta = AB_SCALE/2;
int X0 = rint(M[0] * dx * AB_SCALE);
int Y0 = rint(M[3] * dx * AB_SCALE);
X0 += rint((M[1]*dy + M[2]) * AB_SCALE) + round_delta;
@@ -634,28 +634,28 @@ __kernel void warpAffineNN_C4_D5(__global float4 * src, __global float4 * dst, i
short sx0 = (short)(X0 >> AB_BITS);
short sy0 = (short)(Y0 >> AB_BITS);
if(dx >= 0 && dx < dst_cols && dy >= 0 && dy < dst_rows)
dst[(dst_offset>>4)+dy*(dstStep>>2)+dx]= (sx0>=0 && sx0<src_cols && sy0>=0 && sy0<src_rows) ? src[(src_offset>>4)+sy0*(srcStep>>2)+sx0] : 0;
dst[(dst_offset>>4)+dy*(dstStep>>2)+dx]= (sx0>=0 && sx0<src_cols && sy0>=0 && sy0<src_rows) ? src[(src_offset>>4)+sy0*(srcStep>>2)+sx0] : 0;
}
}
__kernel void warpAffineLinear_C4_D5(__global float4 * src, __global float4 * dst, int src_cols, int src_rows,
int dst_cols, int dst_rows, int srcStep, int dstStep,
int dst_cols, int dst_rows, int srcStep, int dstStep,
int src_offset, int dst_offset, __constant F * M, int threadCols )
{
int dx = get_global_id(0);
int dy = get_global_id(1);
if( dx < threadCols && dy < dst_rows)
{
int round_delta = AB_SCALE/INTER_TAB_SIZE/2;
src_offset = (src_offset>>4);
dst_offset = (dst_offset>>4);
srcStep = (srcStep>>2);
dstStep = (dstStep>>2);
int X0 = rint(M[0] * dx * AB_SCALE);
int Y0 = rint(M[3] * dx * AB_SCALE);
X0 += rint((M[1]*dy + M[2]) * AB_SCALE) + round_delta;
@@ -667,7 +667,7 @@ __kernel void warpAffineLinear_C4_D5(__global float4 * src, __global float4 * ds
short sy0 = (short)(Y0 >> INTER_BITS);
short ax0 = (short)(X0 & (INTER_TAB_SIZE-1));
short ay0 = (short)(Y0 & (INTER_TAB_SIZE-1));
float4 v0, v1, v2, v3;
v0 = (sx0 >= 0 && sx0 < src_cols && sy0 >= 0 && sy0 < src_rows) ? src[src_offset+sy0 * srcStep + sx0] : 0;
@@ -681,35 +681,35 @@ __kernel void warpAffineLinear_C4_D5(__global float4 * src, __global float4 * ds
taby[1] = 1.f/INTER_TAB_SIZE*ay0;
tabx[0] = 1.0 - 1.f/INTER_TAB_SIZE*ax0;
tabx[1] = 1.f/INTER_TAB_SIZE*ax0;
tab[0] = taby[0] * tabx[0];
tab[1] = taby[0] * tabx[1];
tab[2] = taby[1] * tabx[0];
tab[3] = taby[1] * tabx[1];
float4 sum = 0;
sum += v0 * tab[0] + v1 * tab[1] + v2 * tab[2] + v3 * tab[3];
sum += v0 * tab[0] + v1 * tab[1] + v2 * tab[2] + v3 * tab[3];
if(dx >= 0 && dx < dst_cols && dy >= 0 && dy < dst_rows)
dst[dst_offset+dy*dstStep+dx] = sum;
}
}
__kernel void warpAffineCubic_C4_D5(__global float4 * src, __global float4 * dst, int src_cols, int src_rows,
int dst_cols, int dst_rows, int srcStep, int dstStep,
int dst_cols, int dst_rows, int srcStep, int dstStep,
int src_offset, int dst_offset, __constant F * M, int threadCols )
{
int dx = get_global_id(0);
int dy = get_global_id(1);
if( dx < threadCols && dy < dst_rows)
{
int round_delta = AB_SCALE/INTER_TAB_SIZE/2;
src_offset = (src_offset>>4);
dst_offset = (dst_offset>>4);
srcStep = (srcStep>>2);
dstStep = (dstStep>>2);
int X0 = rint(M[0] * dx * AB_SCALE);
int Y0 = rint(M[3] * dx * AB_SCALE);
X0 += rint((M[1]*dy + M[2]) * AB_SCALE) + round_delta;
@@ -721,7 +721,7 @@ __kernel void warpAffineCubic_C4_D5(__global float4 * src, __global float4 * dst
short sy = (short)(Y0 >> INTER_BITS) - 1;
short ay = (short)(Y0 & (INTER_TAB_SIZE-1));
short ax = (short)(X0 & (INTER_TAB_SIZE-1));
float4 v[16];
int i;
@@ -742,7 +742,7 @@ __kernel void warpAffineCubic_C4_D5(__global float4 * src, __global float4 * dst
{
tab[i] = tab1y[(i>>2)] * tab1x[(i&3)];
}
if( dx >= 0 && dx < dst_cols && dy >= 0 && dy < dst_rows)
{
float4 sum = 0;

View File

@@ -52,7 +52,7 @@
typedef double F;
typedef double4 F4;
#define convert_F4 convert_double4
#else
#else
typedef float F;
typedef float4 F4;
#define convert_F4 convert_float4
@@ -61,9 +61,9 @@ typedef float4 F4;
#define INTER_BITS 5
#define INTER_TAB_SIZE (1 << INTER_BITS)
#define INTER_SCALE 1.f/INTER_TAB_SIZE
#define AB_BITS max(10, (int)INTER_BITS)
#define AB_SCALE (1 << AB_BITS)
#define INTER_SCALE 1.f/INTER_TAB_SIZE
#define AB_BITS max(10, (int)INTER_BITS)
#define AB_SCALE (1 << AB_BITS)
#define INTER_REMAP_COEF_BITS 15
#define INTER_REMAP_COEF_SCALE (1 << INTER_REMAP_COEF_BITS)
@@ -81,7 +81,7 @@ inline void interpolateCubic( float x, float* coeffs )
/**********************************************8UC1*********************************************
***********************************************************************************************/
__kernel void warpPerspectiveNN_C1_D0(__global uchar const * restrict src, __global uchar * dst, int src_cols, int src_rows,
int dst_cols, int dst_rows, int srcStep, int dstStep,
int dst_cols, int dst_rows, int srcStep, int dstStep,
int src_offset, int dst_offset, __constant F * M, int threadCols )
{
int dx = get_global_id(0);
@@ -90,7 +90,7 @@ __kernel void warpPerspectiveNN_C1_D0(__global uchar const * restrict src, __glo
if( dx < threadCols && dy < dst_rows)
{
dx = (dx<<2) - (dst_offset&3);
F4 DX = (F4)(dx, dx+1, dx+2, dx+3);
F4 X0 = M[0]*DX + M[1]*dy + M[2];
F4 Y0 = M[3]*DX + M[4]*dy + M[5];
@@ -118,12 +118,12 @@ __kernel void warpPerspectiveNN_C1_D0(__global uchar const * restrict src, __glo
}
__kernel void warpPerspectiveLinear_C1_D0(__global const uchar * restrict src, __global uchar * dst,
int src_cols, int src_rows, int dst_cols, int dst_rows, int srcStep,
int src_cols, int src_rows, int dst_cols, int dst_rows, int srcStep,
int dstStep, int src_offset, int dst_offset, __constant F * M, int threadCols )
{
int dx = get_global_id(0);
int dy = get_global_id(1);
if( dx < threadCols && dy < dst_rows)
{
F X0 = M[0]*dx + M[1]*dy + M[2];
@@ -132,12 +132,12 @@ __kernel void warpPerspectiveLinear_C1_D0(__global const uchar * restrict src, _
W = (W != 0.0) ? INTER_TAB_SIZE/W : 0.0;
int X = rint(X0*W);
int Y = rint(Y0*W);
int sx = (short)(X >> INTER_BITS);
int sy = (short)(Y >> INTER_BITS);
int ay = (short)(Y & (INTER_TAB_SIZE-1));
int ax = (short)(X & (INTER_TAB_SIZE-1));
uchar v[4];
int i;
#pragma unroll 4
@@ -150,7 +150,7 @@ __kernel void warpPerspectiveLinear_C1_D0(__global const uchar * restrict src, _
tab1y[1] = 1.f/INTER_TAB_SIZE*ay;
tab1x[0] = 1.0 - 1.f/INTER_TAB_SIZE*ax;
tab1x[1] = 1.f/INTER_TAB_SIZE*ax;
#pragma unroll 4
for(i=0; i<4; i++)
{
@@ -170,12 +170,12 @@ __kernel void warpPerspectiveLinear_C1_D0(__global const uchar * restrict src, _
}
__kernel void warpPerspectiveCubic_C1_D0(__global uchar * src, __global uchar * dst, int src_cols, int src_rows,
int dst_cols, int dst_rows, int srcStep, int dstStep,
int dst_cols, int dst_rows, int srcStep, int dstStep,
int src_offset, int dst_offset, __constant F * M, int threadCols )
{
int dx = get_global_id(0);
int dy = get_global_id(1);
if( dx < threadCols && dy < dst_rows)
{
F X0 = M[0]*dx + M[1]*dy + M[2];
@@ -184,15 +184,15 @@ __kernel void warpPerspectiveCubic_C1_D0(__global uchar * src, __global uchar *
W = (W != 0.0) ? INTER_TAB_SIZE/W : 0.0;
int X = rint(X0*W);
int Y = rint(Y0*W);
short sx = (short)(X >> INTER_BITS) - 1;
short sy = (short)(Y >> INTER_BITS) - 1;
short ay = (short)(Y & (INTER_TAB_SIZE-1));
short ax = (short)(X & (INTER_TAB_SIZE-1));
uchar v[16];
int i, j;
#pragma unroll 4
for(i=0; i<4; i++)
for(j=0; j<4; j++)
@@ -208,7 +208,7 @@ __kernel void warpPerspectiveCubic_C1_D0(__global uchar * src, __global uchar *
axx = 1.f/INTER_TAB_SIZE * ax;
interpolateCubic(ayy, tab1y);
interpolateCubic(axx, tab1x);
int isum = 0;
#pragma unroll 16
for( i=0; i<16; i++ )
@@ -249,12 +249,12 @@ __kernel void warpPerspectiveCubic_C1_D0(__global uchar * src, __global uchar *
***********************************************************************************************/
__kernel void warpPerspectiveNN_C4_D0(__global uchar4 const * restrict src, __global uchar4 * dst,
int src_cols, int src_rows, int dst_cols, int dst_rows, int srcStep,
int dstStep, int src_offset, int dst_offset, __constant F * M, int threadCols )
int src_cols, int src_rows, int dst_cols, int dst_rows, int srcStep,
int dstStep, int src_offset, int dst_offset, __constant F * M, int threadCols )
{
int dx = get_global_id(0);
int dy = get_global_id(1);
if( dx < threadCols && dy < dst_rows)
{
@@ -266,37 +266,37 @@ __kernel void warpPerspectiveNN_C4_D0(__global uchar4 const * restrict src, __gl
int Y = rint(Y0*W);
short sx = (short)X;
short sy = (short)Y;
if(dx >= 0 && dx < dst_cols && dy >= 0 && dy < dst_rows)
dst[(dst_offset>>2)+dy*(dstStep>>2)+dx]= (sx>=0 && sx<src_cols && sy>=0 && sy<src_rows) ? src[(src_offset>>2)+sy*(srcStep>>2)+sx] : (uchar4)0;
dst[(dst_offset>>2)+dy*(dstStep>>2)+dx]= (sx>=0 && sx<src_cols && sy>=0 && sy<src_rows) ? src[(src_offset>>2)+sy*(srcStep>>2)+sx] : (uchar4)0;
}
}
__kernel void warpPerspectiveLinear_C4_D0(__global uchar4 const * restrict src, __global uchar4 * dst,
int src_cols, int src_rows, int dst_cols, int dst_rows, int srcStep,
int dstStep, int src_offset, int dst_offset, __constant F * M, int threadCols )
int src_cols, int src_rows, int dst_cols, int dst_rows, int srcStep,
int dstStep, int src_offset, int dst_offset, __constant F * M, int threadCols )
{
int dx = get_global_id(0);
int dy = get_global_id(1);
if( dx < threadCols && dy < dst_rows)
{
src_offset = (src_offset>>2);
srcStep = (srcStep>>2);
srcStep = (srcStep>>2);
F X0 = M[0]*dx + M[1]*dy + M[2];
F Y0 = M[3]*dx + M[4]*dy + M[5];
F W = M[6]*dx + M[7]*dy + M[8];
W = (W != 0.0) ? INTER_TAB_SIZE/W : 0.0;
int X = rint(X0*W);
int Y = rint(Y0*W);
short sx = (short)(X >> INTER_BITS);
short sy = (short)(Y >> INTER_BITS);
short ay = (short)(Y & (INTER_TAB_SIZE-1));
short ax = (short)(X & (INTER_TAB_SIZE-1));
int4 v0, v1, v2, v3;
v0 = (sx >= 0 && sx < src_cols && sy >= 0 && sy < src_rows) ? convert_int4(src[src_offset+sy * srcStep + sx]) : 0;
@@ -308,46 +308,46 @@ __kernel void warpPerspectiveLinear_C4_D0(__global uchar4 const * restrict src,
float taby, tabx;
taby = 1.f/INTER_TAB_SIZE*ay;
tabx = 1.f/INTER_TAB_SIZE*ax;
itab0 = convert_short_sat(rint( (1.0f-taby)*(1.0f-tabx) * INTER_REMAP_COEF_SCALE ));
itab1 = convert_short_sat(rint( (1.0f-taby)*tabx * INTER_REMAP_COEF_SCALE ));
itab2 = convert_short_sat(rint( taby*(1.0f-tabx) * INTER_REMAP_COEF_SCALE ));
itab3 = convert_short_sat(rint( taby*tabx * INTER_REMAP_COEF_SCALE ));
int4 val;
val = v0 * itab0 + v1 * itab1 + v2 * itab2 + v3 * itab3;
if(dx >= 0 && dx < dst_cols && dy >= 0 && dy < dst_rows)
dst[(dst_offset>>2)+dy*(dstStep>>2)+dx] = convert_uchar4_sat ( (val + (1 << (INTER_REMAP_COEF_BITS-1))) >> INTER_REMAP_COEF_BITS ) ;
}
}
__kernel void warpPerspectiveCubic_C4_D0(__global uchar4 const * restrict src, __global uchar4 * dst,
int src_cols, int src_rows, int dst_cols, int dst_rows, int srcStep,
int dstStep, int src_offset, int dst_offset, __constant F * M, int threadCols )
__kernel void warpPerspectiveCubic_C4_D0(__global uchar4 const * restrict src, __global uchar4 * dst,
int src_cols, int src_rows, int dst_cols, int dst_rows, int srcStep,
int dstStep, int src_offset, int dst_offset, __constant F * M, int threadCols )
{
int dx = get_global_id(0);
int dy = get_global_id(1);
if( dx < threadCols && dy < dst_rows)
{
src_offset = (src_offset>>2);
srcStep = (srcStep>>2);
srcStep = (srcStep>>2);
dst_offset = (dst_offset>>2);
dstStep = (dstStep>>2);
dstStep = (dstStep>>2);
F X0 = M[0]*dx + M[1]*dy + M[2];
F Y0 = M[3]*dx + M[4]*dy + M[5];
F W = M[6]*dx + M[7]*dy + M[8];
W = (W != 0.0) ? INTER_TAB_SIZE/W : 0.0;
int X = rint(X0*W);
int Y = rint(Y0*W);
short sx = (short)(X >> INTER_BITS) - 1;
short sy = (short)(Y >> INTER_BITS) - 1;
short ay = (short)(Y & (INTER_TAB_SIZE-1));
short ax = (short)(X & (INTER_TAB_SIZE-1));
uchar4 v[16];
int i,j;
#pragma unroll 4
@@ -365,7 +365,7 @@ __kernel void warpPerspectiveCubic_C4_D0(__global uchar4 const * restrict src, _
interpolateCubic(ayy, tab1y);
interpolateCubic(axx, tab1x);
int isum = 0;
#pragma unroll 16
for( i=0; i<16; i++ )
{
@@ -380,17 +380,17 @@ __kernel void warpPerspectiveCubic_C4_D0(__global uchar4 const * restrict src, _
int k1, k2;
int diff = isum - INTER_REMAP_COEF_SCALE;
int Mk1=2, Mk2=2, mk1=2, mk2=2;
for( k1 = 2; k1 < 4; k1++ )
for( k2 = 2; k2 < 4; k2++ )
{
if( itab[(k1<<2)+k2] < itab[(mk1<<2)+mk2] )
mk1 = k1, mk2 = k2;
else if( itab[(k1<<2)+k2] > itab[(Mk1<<2)+Mk2] )
Mk1 = k1, Mk2 = k2;
}
diff<0 ? (itab[(Mk1<<2)+Mk2]=(short)(itab[(Mk1<<2)+Mk2]-diff)) : (itab[(mk1<<2)+mk2]=(short)(itab[(mk1<<2)+mk2]-diff));
}
@@ -411,12 +411,12 @@ __kernel void warpPerspectiveCubic_C4_D0(__global uchar4 const * restrict src, _
***********************************************************************************************/
__kernel void warpPerspectiveNN_C1_D5(__global float * src, __global float * dst, int src_cols, int src_rows,
int dst_cols, int dst_rows, int srcStep, int dstStep,
int dst_cols, int dst_rows, int srcStep, int dstStep,
int src_offset, int dst_offset, __constant F * M, int threadCols )
{
int dx = get_global_id(0);
int dy = get_global_id(1);
if( dx < threadCols && dy < dst_rows)
{
F X0 = M[0]*dx + M[1]*dy + M[2];
@@ -429,33 +429,33 @@ __kernel void warpPerspectiveNN_C1_D5(__global float * src, __global float * dst
short sy = (short)Y;
if(dx >= 0 && dx < dst_cols && dy >= 0 && dy < dst_rows)
dst[(dst_offset>>2)+dy*dstStep+dx]= (sx>=0 && sx<src_cols && sy>=0 && sy<src_rows) ? src[(src_offset>>2)+sy*srcStep+sx] : 0;
dst[(dst_offset>>2)+dy*dstStep+dx]= (sx>=0 && sx<src_cols && sy>=0 && sy<src_rows) ? src[(src_offset>>2)+sy*srcStep+sx] : 0;
}
}
__kernel void warpPerspectiveLinear_C1_D5(__global float * src, __global float * dst, int src_cols, int src_rows,
int dst_cols, int dst_rows, int srcStep, int dstStep,
int dst_cols, int dst_rows, int srcStep, int dstStep,
int src_offset, int dst_offset, __constant F * M, int threadCols )
{
int dx = get_global_id(0);
int dy = get_global_id(1);
if( dx < threadCols && dy < dst_rows)
{
src_offset = (src_offset>>2);
F X0 = M[0]*dx + M[1]*dy + M[2];
F Y0 = M[3]*dx + M[4]*dy + M[5];
F W = M[6]*dx + M[7]*dy + M[8];
W = (W != 0.0) ? INTER_TAB_SIZE/W : 0.0;
int X = rint(X0*W);
int Y = rint(Y0*W);
short sx = (short)(X >> INTER_BITS);
short sy = (short)(Y >> INTER_BITS);
short ay = (short)(Y & (INTER_TAB_SIZE-1));
short ax = (short)(X & (INTER_TAB_SIZE-1));
float v0, v1, v2, v3;
v0 = (sx >= 0 && sx < src_cols && sy >= 0 && sy < src_rows) ? src[src_offset+sy * srcStep + sx] : 0;
@@ -469,38 +469,38 @@ __kernel void warpPerspectiveLinear_C1_D5(__global float * src, __global float *
taby[1] = 1.f/INTER_TAB_SIZE*ay;
tabx[0] = 1.0 - 1.f/INTER_TAB_SIZE*ax;
tabx[1] = 1.f/INTER_TAB_SIZE*ax;
tab[0] = taby[0] * tabx[0];
tab[1] = taby[0] * tabx[1];
tab[2] = taby[1] * tabx[0];
tab[3] = taby[1] * tabx[1];
float sum = 0;
sum += v0 * tab[0] + v1 * tab[1] + v2 * tab[2] + v3 * tab[3];
sum += v0 * tab[0] + v1 * tab[1] + v2 * tab[2] + v3 * tab[3];
if(dx >= 0 && dx < dst_cols && dy >= 0 && dy < dst_rows)
dst[(dst_offset>>2)+dy*dstStep+dx] = sum;
}
}
__kernel void warpPerspectiveCubic_C1_D5(__global float * src, __global float * dst, int src_cols, int src_rows,
int dst_cols, int dst_rows, int srcStep, int dstStep,
int dst_cols, int dst_rows, int srcStep, int dstStep,
int src_offset, int dst_offset, __constant F * M, int threadCols )
{
int dx = get_global_id(0);
int dy = get_global_id(1);
if( dx < threadCols && dy < dst_rows)
{
src_offset = (src_offset>>2);
dst_offset = (dst_offset>>2);
F X0 = M[0]*dx + M[1]*dy + M[2];
F Y0 = M[3]*dx + M[4]*dy + M[5];
F W = M[6]*dx + M[7]*dy + M[8];
W = (W != 0.0) ? INTER_TAB_SIZE/W : 0.0;
int X = rint(X0*W);
int Y = rint(Y0*W);
short sx = (short)(X >> INTER_BITS) - 1;
short sy = (short)(Y >> INTER_BITS) - 1;
short ay = (short)(Y & (INTER_TAB_SIZE-1));
@@ -526,7 +526,7 @@ __kernel void warpPerspectiveCubic_C1_D5(__global float * src, __global float *
{
tab[i] = tab1y[(i>>2)] * tab1x[(i&3)];
}
if( dx >= 0 && dx < dst_cols && dy >= 0 && dy < dst_rows)
{
float sum = 0;
@@ -546,12 +546,12 @@ __kernel void warpPerspectiveCubic_C1_D5(__global float * src, __global float *
***********************************************************************************************/
__kernel void warpPerspectiveNN_C4_D5(__global float4 * src, __global float4 * dst, int src_cols, int src_rows,
int dst_cols, int dst_rows, int srcStep, int dstStep,
int dst_cols, int dst_rows, int srcStep, int dstStep,
int src_offset, int dst_offset, __constant F * M, int threadCols )
{
int dx = get_global_id(0);
int dy = get_global_id(1);
if( dx < threadCols && dy < dst_rows)
{
F X0 = M[0]*dx + M[1]*dy + M[2];
@@ -562,39 +562,39 @@ __kernel void warpPerspectiveNN_C4_D5(__global float4 * src, __global float4 * d
int Y = rint(Y0*W);
short sx = (short)X;
short sy = (short)Y;
if(dx >= 0 && dx < dst_cols && dy >= 0 && dy < dst_rows)
dst[(dst_offset>>4)+dy*(dstStep>>2)+dx]= (sx>=0 && sx<src_cols && sy>=0 && sy<src_rows) ? src[(src_offset>>4)+sy*(srcStep>>2)+sx] : 0;
dst[(dst_offset>>4)+dy*(dstStep>>2)+dx]= (sx>=0 && sx<src_cols && sy>=0 && sy<src_rows) ? src[(src_offset>>4)+sy*(srcStep>>2)+sx] : 0;
}
}
__kernel void warpPerspectiveLinear_C4_D5(__global float4 * src, __global float4 * dst, int src_cols, int src_rows,
int dst_cols, int dst_rows, int srcStep, int dstStep,
int dst_cols, int dst_rows, int srcStep, int dstStep,
int src_offset, int dst_offset, __constant F * M, int threadCols )
{
int dx = get_global_id(0);
int dy = get_global_id(1);
if( dx < threadCols && dy < dst_rows)
{
src_offset = (src_offset>>4);
dst_offset = (dst_offset>>4);
srcStep = (srcStep>>2);
dstStep = (dstStep>>2);
F X0 = M[0]*dx + M[1]*dy + M[2];
F Y0 = M[3]*dx + M[4]*dy + M[5];
F W = M[6]*dx + M[7]*dy + M[8];
W = (W != 0.0) ? INTER_TAB_SIZE/W : 0.0;
int X = rint(X0*W);
int Y = rint(Y0*W);
short sx0 = (short)(X >> INTER_BITS);
short sy0 = (short)(Y >> INTER_BITS);
short ay0 = (short)(Y & (INTER_TAB_SIZE-1));
short ax0 = (short)(X & (INTER_TAB_SIZE-1));
float4 v0, v1, v2, v3;
v0 = (sx0 >= 0 && sx0 < src_cols && sy0 >= 0 && sy0 < src_rows) ? src[src_offset+sy0 * srcStep + sx0] : 0;
@@ -608,46 +608,46 @@ __kernel void warpPerspectiveLinear_C4_D5(__global float4 * src, __global float4
taby[1] = 1.f/INTER_TAB_SIZE*ay0;
tabx[0] = 1.0 - 1.f/INTER_TAB_SIZE*ax0;
tabx[1] = 1.f/INTER_TAB_SIZE*ax0;
tab[0] = taby[0] * tabx[0];
tab[1] = taby[0] * tabx[1];
tab[2] = taby[1] * tabx[0];
tab[3] = taby[1] * tabx[1];
float4 sum = 0;
sum += v0 * tab[0] + v1 * tab[1] + v2 * tab[2] + v3 * tab[3];
sum += v0 * tab[0] + v1 * tab[1] + v2 * tab[2] + v3 * tab[3];
if(dx >= 0 && dx < dst_cols && dy >= 0 && dy < dst_rows)
dst[dst_offset+dy*dstStep+dx] = sum;
}
}
__kernel void warpPerspectiveCubic_C4_D5(__global float4 * src, __global float4 * dst,
__kernel void warpPerspectiveCubic_C4_D5(__global float4 * src, __global float4 * dst,
int src_cols, int src_rows, int dst_cols, int dst_rows, int srcStep,
int dstStep, int src_offset, int dst_offset, __constant F * M, int threadCols )
int dstStep, int src_offset, int dst_offset, __constant F * M, int threadCols )
{
int dx = get_global_id(0);
int dy = get_global_id(1);
if( dx < threadCols && dy < dst_rows )
{
src_offset = (src_offset>>4);
dst_offset = (dst_offset>>4);
srcStep = (srcStep>>2);
dstStep = (dstStep>>2);
F X0 = M[0]*dx + M[1]*dy + M[2];
F Y0 = M[3]*dx + M[4]*dy + M[5];
F W = M[6]*dx + M[7]*dy + M[8];
W = (W != 0.0) ? INTER_TAB_SIZE/W : 0.0;
int X = rint(X0*W);
int Y = rint(Y0*W);
short sx = (short)(X >> INTER_BITS)-1;
short sy = (short)(Y >> INTER_BITS)-1;
short ay = (short)(Y & (INTER_TAB_SIZE-1));
short ax = (short)(X & (INTER_TAB_SIZE-1));
float4 v[16];
int i;
@@ -668,7 +668,7 @@ __kernel void warpPerspectiveCubic_C4_D5(__global float4 * src, __global float4
{
tab[i] = tab1y[(i>>2)] * tab1x[(i&3)];
}
if( dx >= 0 && dx < dst_cols && dy >= 0 && dy < dst_rows)
{
float4 sum = 0;

View File

@@ -1,252 +1,252 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// @Authors
// Peng Xiao, pengxiao@multicorewareinc.com
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other oclMaterials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors as is and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable
#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable
// Image read mode
__constant sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_LINEAR;
// atomic add for 32bit floating point
// Atomically add `operand` to the 32-bit float at `source`.
// OpenCL 1.x has no native float atomics, so emulate one with a
// compare-and-swap loop over the value's bit pattern.
inline void atomic_addf(volatile __global float *source, const float operand) {
    union { unsigned int bits; float value; } expected, desired;
    do {
        // Snapshot the current value, compute the sum, then retry the CAS
        // until no other work-item modified *source in between.
        expected.value = *source;
        desired.value  = expected.value + operand;
    } while (atomic_cmpxchg((volatile __global unsigned int *)source,
                            expected.bits, desired.bits) != expected.bits);
}
// Fill a `width` x `height` plane of `image` with the constant `val`.
// `step` is the row stride in elements; `offset` is the plane's starting
// offset within `image`, also in elements.
__kernel void memsetKernel(
    float val,
    __global float * image,
    int width,
    int height,
    int step, // in element
    int offset
    )
{
    const int x = get_global_id(0);
    const int y = get_global_id(1);
    // One work-item per pixel; out-of-range ids write nothing.
    if (x < width && y < height)
    {
        image[offset + y * step + x] = val;
    }
}
// Divide each element of the destination plane by its accumulated
// normalization factor (produced by forwardWarpKernel).  A zero factor
// means no contribution landed on that pixel; it is left unchanged.
__kernel void normalizeKernel(
    __global float * buffer,
    int width,
    int height,
    int step,
    int f_offset,
    int d_offset
    )
{
    const int x = get_global_id(0);
    const int y = get_global_id(1);
    if (x >= width || y >= height)
    {
        return;
    }
    __global float * factors = buffer + f_offset;
    __global float * dst     = buffer + d_offset;
    const int   idx   = step * y + x;
    const float scale = factors[idx];
    // Unit scale when the factor is zero: leaves dst untouched.
    dst[idx] *= (scale == 0.0f) ? 1.0f : (1.0f / scale);
}
// Forward-warp `src` along the flow field (u, v) scaled by `time_scale`:
// each source pixel's value is splatted bilinearly into the four
// destination pixels surrounding its warped position.  The bilinear
// weights are also accumulated into a normalization plane so that
// normalizeKernel can renormalize the result afterwards.
__kernel void forwardWarpKernel(
    __global const float * src,
    __global float * buffer,
    __global const float * u,
    __global const float * v,
    const int w,
    const int h,
    const int flow_stride,
    const int image_stride,
    const int factor_offset,
    const int dst_offset,
    const float time_scale
    )
{
    const int j = get_global_id(0);
    const int i = get_global_id(1);
    if (i >= h || j >= w) return;

    volatile __global float * normalization_factor = (volatile __global float *) buffer + factor_offset;
    volatile __global float * dst = (volatile __global float *) buffer + dst_offset;

    const int flow_row_offset  = i * flow_stride;
    const int image_row_offset = i * image_stride;

    // Bottom-left corner of the warped target pixel.
    const float cx = u[flow_row_offset + j] * time_scale + (float)j + 1.0f;
    const float cy = v[flow_row_offset + j] * time_scale + (float)i + 1.0f;

    // Split the warped coordinate into integer pixel and fractional part.
    float px, py;
    const float dx = modf(cx, &px);
    const float dy = modf(cy, &py);

    const int tx1 = (int) px;  // column of the pixel containing the bottom-right corner
    const int ty1 = (int) py;  // row of the two bottom corners
    const int tx0 = tx1 - 1;   // column of the two left corners
    const int ty0 = ty1 - 1;   // row of the two upper corners

    const float value = src[image_row_offset + j];

    // Splat into the four covered pixels, skipping those outside the frame.
    // atomic_addf is required: several source pixels may warp onto the
    // same destination pixel concurrently.
    if (tx1 >= 0 && tx1 < w && ty1 >= 0 && ty1 < h)
    {
        const float weight = dx * dy;                    // bottom-right corner
        atomic_addf(dst + ty1 * image_stride + tx1, value * weight);
        atomic_addf(normalization_factor + ty1 * image_stride + tx1, weight);
    }
    if (tx0 >= 0 && tx0 < w && ty1 >= 0 && ty1 < h)
    {
        const float weight = (1.0f - dx) * dy;           // bottom-left corner
        atomic_addf(dst + ty1 * image_stride + tx0, value * weight);
        atomic_addf(normalization_factor + ty1 * image_stride + tx0, weight);
    }
    if (tx0 >= 0 && tx0 < w && ty0 >= 0 && ty0 < h)
    {
        const float weight = (1.0f - dx) * (1.0f - dy);  // upper-left corner
        atomic_addf(dst + ty0 * image_stride + tx0, value * weight);
        atomic_addf(normalization_factor + ty0 * image_stride + tx0, weight);
    }
    if (tx1 >= 0 && tx1 < w && ty0 >= 0 && ty0 < h)
    {
        const float weight = dx * (1.0f - dy);           // upper-right corner
        atomic_addf(dst + ty0 * image_stride + tx1, value * weight);
        atomic_addf(normalization_factor + ty0 * image_stride + tx1, weight);
    }
}
// define buffer offsets
// Each constant is a plane index into the packed float working buffer;
// plane k starts at buffer + h * step * k (see blendFramesKernel).
enum
{
O0_OS = 0, // visibility/weight map for frame 0 (compared against 1e-4f)
O1_OS,     // visibility/weight map for frame 1
U_OS,      // flow field, x component
V_OS,      // flow field, y component
UR_OS,     // reverse flow, x component — presumably backward flow; TODO confirm with host code
VR_OS      // reverse flow, y component — presumably backward flow; TODO confirm with host code
};
// Blend two frames at interpolation position `theta` (0 = frame 0,
// 1 = frame 1).  Each output pixel samples both input images at
// flow-compensated coordinates; a pixel visible in only one frame falls
// back to that frame alone.
__kernel void blendFramesKernel(
    image2d_t tex_src0,
    image2d_t tex_src1,
    __global float * buffer,
    __global float * out,
    int w,
    int h,
    int step,
    float theta
    )
{
    int ix = get_global_id(0);
    int iy = get_global_id(1);
    if (ix >= w || iy >= h) return;

    // Per-plane views into the packed working buffer (see the *_OS enum).
    // NOTE(review): the UR_OS/VR_OS planes were read here but never used;
    // the dead loads have been removed.
    __global float * u  = buffer + h * step * U_OS;
    __global float * v  = buffer + h * step * V_OS;
    __global float * o0 = buffer + h * step * O0_OS;
    __global float * o1 = buffer + h * step * O1_OS;

    int pos = ix + step * iy;

    float _u = u[pos];
    float _v = v[pos];

    // Sample at pixel centers (+0.5f), displaced along the flow.
    float x = (float)ix + 0.5f;
    float y = (float)iy + 0.5f;

    bool b0 = o0[pos] > 1e-4f; // visible in frame 0
    bool b1 = o1[pos] > 1e-4f; // visible in frame 1

    float2 coord0 = (float2)(x - _u * theta, y - _v * theta);
    float2 coord1 = (float2)(x + _u * (1.0f - theta), y + _v * (1.0f - theta));

    if (b0 && b1)
    {
        // pixel is visible on both frames: cross-fade between them
        out[pos] = read_imagef(tex_src0, sampler, coord0).x * (1.0f - theta) +
                   read_imagef(tex_src1, sampler, coord1).x * theta;
    }
    else if (b0)
    {
        // visible on the first frame only
        out[pos] = read_imagef(tex_src0, sampler, coord0).x;
    }
    else
    {
        // visible on the second frame only (or neither: fall back to frame 1)
        out[pos] = read_imagef(tex_src1, sampler, coord1).x;
    }
}
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// @Authors
// Peng Xiao, pengxiao@multicorewareinc.com
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other oclMaterials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors as is and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable
#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable
// Image read mode
__constant sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_LINEAR;
// atomic add for 32bit floating point
// Atomically adds `operand` to the 32-bit float at `source`.
// OpenCL 1.x lacks float atomics, so this emulates one with a
// compare-and-swap loop over the value's bit pattern.
inline void atomic_addf(volatile __global float *source, const float operand) {
union {
unsigned int intVal;
float floatVal;
} newVal;
union {
unsigned int intVal;
float floatVal;
} prevVal;
do {
// Snapshot the current value and retry the CAS until no other
// work-item modified *source between the read and the exchange.
prevVal.floatVal = *source;
newVal.floatVal = prevVal.floatVal + operand;
} while (atomic_cmpxchg((volatile __global unsigned int *)source, prevVal.intVal, newVal.intVal) != prevVal.intVal);
}
// Fills a `width` x `height` plane of `image` with the constant `val`.
// `step` is the row stride in elements; `offset` is the plane's starting
// offset within `image`, also in elements.
__kernel void memsetKernel(
float val,
__global float * image,
int width,
int height,
int step, // in element
int offset
)
{
// One work-item per pixel; out-of-range ids write nothing.
if(get_global_id(0) >= width || get_global_id(1) >= height)
{
return;
}
image += offset;
image[get_global_id(0) + get_global_id(1) * step] = val;
}
// Divides each element of the destination plane by its accumulated
// normalization factor (produced by forwardWarpKernel).  A zero factor
// means no contribution landed on that pixel; it is left unchanged.
__kernel void normalizeKernel(
__global float * buffer,
int width,
int height,
int step,
int f_offset, // offset (in elements) of the factor plane within buffer
int d_offset  // offset (in elements) of the data plane within buffer
)
{
__global float * factors = buffer + f_offset;
__global float * dst = buffer + d_offset;
int j = get_global_id(0);
int i = get_global_id(1);
if(j >= width || i >= height)
{
return;
}
float scale = factors[step * i + j];
// Guard against division by zero: unit scale leaves dst untouched.
float invScale = (scale == 0.0f) ? 1.0f : (1.0f / scale);
dst[step * i + j] *= invScale;
}
// Forward-warps `src` along the flow field (u, v) scaled by `time_scale`:
// each source pixel's value is splatted bilinearly into the four
// destination pixels surrounding its warped position.  The bilinear
// weights are also accumulated into a normalization plane so that
// normalizeKernel can renormalize the result afterwards.
__kernel void forwardWarpKernel(
__global const float * src,
__global float * buffer,
__global const float * u,
__global const float * v,
const int w,
const int h,
const int flow_stride,
const int image_stride,
const int factor_offset,  // start of the weight-accumulation plane in buffer
const int dst_offset,     // start of the value-accumulation plane in buffer
const float time_scale
)
{
int j = get_global_id(0);
int i = get_global_id(1);
if (i >= h || j >= w) return;
volatile __global float * normalization_factor = (volatile __global float *) buffer + factor_offset;
volatile __global float * dst = (volatile __global float *)buffer + dst_offset;
int flow_row_offset = i * flow_stride;
int image_row_offset = i * image_stride;
//bottom left corner of a target pixel
float cx = u[flow_row_offset + j] * time_scale + (float)j + 1.0f;
float cy = v[flow_row_offset + j] * time_scale + (float)i + 1.0f;
// pixel containing bottom left corner
float px;
float py;
// modf splits the warped coordinate into integer pixel and fraction.
float dx = modf(cx, &px);
float dy = modf(cy, &py);
// target pixel integer coords
int tx;
int ty;
tx = (int) px;
ty = (int) py;
float value = src[image_row_offset + j];
float weight;
// fill pixel containing bottom right corner
// atomic_addf is required: several source pixels may warp onto the
// same destination pixel concurrently.
if (!((tx >= w) || (tx < 0) || (ty >= h) || (ty < 0)))
{
weight = dx * dy;
atomic_addf(dst + ty * image_stride + tx, value * weight);
atomic_addf(normalization_factor + ty * image_stride + tx, weight);
}
// fill pixel containing bottom left corner
tx -= 1;
if (!((tx >= w) || (tx < 0) || (ty >= h) || (ty < 0)))
{
weight = (1.0f - dx) * dy;
atomic_addf(dst + ty * image_stride + tx, value * weight);
atomic_addf(normalization_factor + ty * image_stride + tx, weight);
}
// fill pixel containing upper left corner
ty -= 1;
if (!((tx >= w) || (tx < 0) || (ty >= h) || (ty < 0)))
{
weight = (1.0f - dx) * (1.0f - dy);
atomic_addf(dst + ty * image_stride + tx, value * weight);
atomic_addf(normalization_factor + ty * image_stride + tx, weight);
}
// fill pixel containing upper right corner
tx += 1;
if (!((tx >= w) || (tx < 0) || (ty >= h) || (ty < 0)))
{
weight = dx * (1.0f - dy);
atomic_addf(dst + ty * image_stride + tx, value * weight);
atomic_addf(normalization_factor + ty * image_stride + tx, weight);
}
}
// define buffer offsets
// Each constant is a plane index into the packed float working buffer;
// plane k starts at buffer + h * step * k (see blendFramesKernel).
enum
{
O0_OS = 0, // visibility/weight map for frame 0 (compared against 1e-4f)
O1_OS,     // visibility/weight map for frame 1
U_OS,      // flow field, x component
V_OS,      // flow field, y component
UR_OS,     // reverse flow, x component — presumably backward flow; TODO confirm with host code
VR_OS      // reverse flow, y component — presumably backward flow; TODO confirm with host code
};
// Blends two frames at interpolation position `theta` (0 = frame 0,
// 1 = frame 1).  Each output pixel samples both input images at
// flow-compensated coordinates; a pixel visible in only one frame falls
// back to that frame alone.
__kernel void blendFramesKernel(
image2d_t tex_src0,
image2d_t tex_src1,
__global float * buffer,
__global float * out,
int w,
int h,
int step,
float theta
)
{
// Per-plane views into the packed working buffer (see the *_OS enum).
__global float * u = buffer + h * step * U_OS;
__global float * v = buffer + h * step * V_OS;
__global float * ur = buffer + h * step * UR_OS;
__global float * vr = buffer + h * step * VR_OS;
__global float * o0 = buffer + h * step * O0_OS;
__global float * o1 = buffer + h * step * O1_OS;
int ix = get_global_id(0);
int iy = get_global_id(1);
if(ix >= w || iy >= h) return;
int pos = ix + step * iy;
float _u = u[pos];
float _v = v[pos];
// NOTE(review): _ur/_vr are loaded but never used below — dead reads.
float _ur = ur[pos];
float _vr = vr[pos];
// Sample at pixel centers (+0.5f), displaced along the flow.
float x = (float)ix + 0.5f;
float y = (float)iy + 0.5f;
bool b0 = o0[pos] > 1e-4f;
bool b1 = o1[pos] > 1e-4f;
float2 coord0 = (float2)(x - _u * theta, y - _v * theta);
float2 coord1 = (float2)(x + _u * (1.0f - theta), y + _v * (1.0f - theta));
if (b0 && b1)
{
// pixel is visible on both frames
out[pos] = read_imagef(tex_src0, sampler, coord0).x * (1.0f - theta) +
read_imagef(tex_src1, sampler, coord1).x * theta;
}
else if (b0)
{
// visible on the first frame only
out[pos] = read_imagef(tex_src0, sampler, coord0).x;
}
else
{
// visible on the second frame only
out[pos] = read_imagef(tex_src1, sampler, coord1).x;
}
}

File diff suppressed because it is too large Load Diff

View File

@@ -50,8 +50,8 @@ typedef double F;
typedef float F;
#endif
short2 do_mean_shift(int x0, int y0, __global uchar4* out,int out_step,
__global uchar4* in, int in_step, int dst_off, int src_off,
short2 do_mean_shift(int x0, int y0, __global uchar4* out,int out_step,
__global uchar4* in, int in_step, int dst_off, int src_off,
int cols, int rows, int sp, int sr, int maxIter, float eps)
{
int isr2 = sr*sr;
@@ -81,9 +81,9 @@ short2 do_mean_shift(int x0, int y0, __global uchar4* out,int out_step,
for( int y = miny; y <= maxy; y++)
{
int rowCount = 0;
int x = minx;
int x = minx;
for( ; x+3 <= maxx; x+=4 )
{
{
int id = src_off + y*in_step + x;
uchar16 t = (uchar16)(in[id],in[id+1],in[id+2],in[id+3]);
int norm2_1 = (t.s0 - c.x) * (t.s0 - c.x) + (t.s1 - c.y) * (t.s1 - c.y) +
@@ -126,7 +126,7 @@ short2 do_mean_shift(int x0, int y0, __global uchar4* out,int out_step,
s.x += t.s0; s.y += t.s1; s.z += t.s2;
sx += x; rowCount++;
}
}
if(x+1 == maxx)
{
@@ -213,32 +213,32 @@ short2 do_mean_shift(int x0, int y0, __global uchar4* out,int out_step,
}
__kernel void meanshift_kernel(__global uchar4* out, int out_step,
__global uchar4* in, int in_step,
__kernel void meanshift_kernel(__global uchar4* out, int out_step,
__global uchar4* in, int in_step,
int dst_off, int src_off, int cols, int rows,
int sp, int sr, int maxIter, float eps)
{
int x0 = get_global_id(0);
int y0 = get_global_id(1);
int x0 = get_global_id(0);
int y0 = get_global_id(1);
if( x0 < cols && y0 < rows )
do_mean_shift(x0, y0, out, out_step, in, in_step, dst_off, src_off,
cols, rows, sp, sr, maxIter, eps);
}
__kernel void meanshiftproc_kernel( __global uchar4* in, __global uchar4* outr,
__global short2* outsp, int instep, int outrstep,
__kernel void meanshiftproc_kernel( __global uchar4* in, __global uchar4* outr,
__global short2* outsp, int instep, int outrstep,
int outspstep, int in_off, int outr_off, int outsp_off,
int cols, int rows, int sp, int sr, int maxIter, float eps )
{
int x0 = get_global_id(0);
int y0 = get_global_id(1);
int x0 = get_global_id(0);
int y0 = get_global_id(1);
if( x0 < cols && y0 < rows )
{
//int basesp = (blockIdx.y * blockDim.y + threadIdx.y) * outspstep + (blockIdx.x * blockDim.x + threadIdx.x) * 2 * sizeof(short);
//*(short2*)(outsp + basesp) = do_mean_shift(x0, y0, outr, outrstep, cols, rows, sp, sr, maxIter, eps);
// we have ensured before that ((outspstep & 0x11)==0).
outsp_off >>= 2;
outsp_off >>= 2;
outspstep >>= 2;
int basesp = outsp_off + y0 * outspstep + x0;
outsp[basesp] = do_mean_shift(x0, y0, outr, outrstep, in, instep, outr_off, in_off, cols, rows, sp, sr, maxIter, eps);

View File

@@ -59,25 +59,25 @@ __kernel void merge_vector_C2_D0(__global uchar *mat_dst, int dst_step, int ds
{
int x = get_global_id(0);
int y = get_global_id(1);
int y = get_global_id(1);
if ((x < cols) && (y < rows))
{
x = x << 1;
#define dst_align ((dst_offset & 3) >> 1)
int src0_index = mad24(y, src0_step, src0_offset + x - dst_align);
int src1_index = mad24(y, src1_step, src1_offset + x - dst_align);
int src0_index = mad24(y, src0_step, src0_offset + x - dst_align);
int src1_index = mad24(y, src1_step, src1_offset + x - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffffc);
__global uchar4 * dst = (__global uchar4 *)(mat_dst + dst_index);
__global uchar * src0 = mat_src0 + src0_index;
__global uchar * src1 = src0 + 1;
__global uchar * src2 = mat_src1 + src1_index;
__global uchar * src3 = src2 + 1;
__global uchar4 * dst = (__global uchar4 *)(mat_dst + dst_index);
__global uchar * src0 = mat_src0 + src0_index;
__global uchar * src1 = src0 + 1;
__global uchar * src2 = mat_src1 + src1_index;
__global uchar * src3 = src2 + 1;
uchar4 dst_data = *dst;
uchar data_0 = *(src0);
@@ -87,8 +87,8 @@ __kernel void merge_vector_C2_D0(__global uchar *mat_dst, int dst_step, int ds
uchar4 tmp_data = (uchar4)(data_0, data_2, data_1, data_3);
tmp_data.xy = dst_index + 0 >= dst_start ? tmp_data.xy : dst_data.xy;
tmp_data.zw = dst_index + 2 < dst_end ? tmp_data.zw : dst_data.zw;
tmp_data.xy = dst_index + 0 >= dst_start ? tmp_data.xy : dst_data.xy;
tmp_data.zw = dst_index + 2 < dst_end ? tmp_data.zw : dst_data.zw;
*dst = tmp_data;
}
@@ -100,25 +100,25 @@ __kernel void merge_vector_C2_D1(__global char *mat_dst, int dst_step, int dst
{
int x = get_global_id(0);
int y = get_global_id(1);
int y = get_global_id(1);
if ((x < cols) && (y < rows))
{
x = x << 1;
#define dst_align ((dst_offset & 3) >> 1)
int src0_index = mad24(y, src0_step, src0_offset + x - dst_align);
int src1_index = mad24(y, src1_step, src1_offset + x - dst_align);
int src0_index = mad24(y, src0_step, src0_offset + x - dst_align);
int src1_index = mad24(y, src1_step, src1_offset + x - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffffc);
__global char4 * dst = (__global char4 *)(mat_dst + dst_index);
__global char * src0 = mat_src0 + src0_index;
__global char * src1 = src0 + 1;
__global char * src2 = mat_src1 + src1_index;
__global char * src3 = src2 + 1;
__global char4 * dst = (__global char4 *)(mat_dst + dst_index);
__global char * src0 = mat_src0 + src0_index;
__global char * src1 = src0 + 1;
__global char * src2 = mat_src1 + src1_index;
__global char * src3 = src2 + 1;
char4 dst_data = *dst;
char data_0 = *(src0);
@@ -128,8 +128,8 @@ __kernel void merge_vector_C2_D1(__global char *mat_dst, int dst_step, int dst
char4 tmp_data = (char4)(data_0, data_2, data_1, data_3);
tmp_data.xy = dst_index + 0 >= dst_start ? tmp_data.xy : dst_data.xy;
tmp_data.zw = dst_index + 2 < dst_end ? tmp_data.zw : dst_data.zw;
tmp_data.xy = dst_index + 0 >= dst_start ? tmp_data.xy : dst_data.xy;
tmp_data.zw = dst_index + 2 < dst_end ? tmp_data.zw : dst_data.zw;
*dst = tmp_data;
}
@@ -141,12 +141,12 @@ __kernel void merge_vector_C2_D2(__global ushort *mat_dst, int dst_step, int d
{
int x = get_global_id(0);
int y = get_global_id(1);
int y = get_global_id(1);
if ((x < cols) && (y < rows))
{
int src0_index = mad24(y, src0_step, src0_offset);
int src1_index = mad24(y, src1_step, src1_offset);
int src0_index = mad24(y, src0_step, src0_offset);
int src1_index = mad24(y, src1_step, src1_offset);
int dst_index = mad24(y, dst_step , dst_offset);
@@ -167,12 +167,12 @@ __kernel void merge_vector_C2_D3(__global short *mat_dst, int dst_step, int ds
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
int y = get_global_id(1);
if ((x < cols) && (y < rows))
{
int src0_index = mad24(y, src0_step, src0_offset);
int src1_index = mad24(y, src1_step, src1_offset);
int src0_index = mad24(y, src0_step, src0_offset);
int src1_index = mad24(y, src1_step, src1_offset);
int dst_index = mad24(y, dst_step , dst_offset);
@@ -193,12 +193,12 @@ __kernel void merge_vector_C2_D4(__global int *mat_dst, int dst_step, int dst_
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
int y = get_global_id(1);
if ((x < cols) && (y < rows))
{
int src0_index = mad24(y, src0_step, src0_offset);
int src1_index = mad24(y, src1_step, src1_offset);
int src0_index = mad24(y, src0_step, src0_offset);
int src1_index = mad24(y, src1_step, src1_offset);
int dst_index = mad24(y, dst_step , dst_offset);
int src0 = *((__global int *)((__global uchar *)mat_src0 + src0_index + (x << 2)));
@@ -213,12 +213,12 @@ __kernel void merge_vector_C2_D5(__global float *mat_dst, int dst_step, int ds
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
int y = get_global_id(1);
if ((x < cols) && (y < rows))
{
int src0_index = mad24(y, src0_step, src0_offset);
int src1_index = mad24(y, src1_step, src1_offset);
int src0_index = mad24(y, src0_step, src0_offset);
int src1_index = mad24(y, src1_step, src1_offset);
int dst_index = mad24(y, dst_step , dst_offset);
float src0 = *((__global float *)((__global uchar *)mat_src0 + src0_index + (x << 2)));
@@ -235,12 +235,12 @@ __kernel void merge_vector_C2_D6(__global double *mat_dst, int dst_step, int d
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
int y = get_global_id(1);
if ((x < cols) && (y < rows))
{
int src0_index = mad24(y, src0_step, src0_offset);
int src1_index = mad24(y, src1_step, src1_offset);
int src0_index = mad24(y, src0_step, src0_offset);
int src1_index = mad24(y, src1_step, src1_offset);
int dst_index = mad24(y, dst_step , dst_offset);
double src0 = *((__global double *)((__global uchar *)mat_src0 + src0_index + (x << 3)));
@@ -258,8 +258,8 @@ __kernel void merge_vector_C3_D0(__global uchar *mat_dst, int dst_step, int ds
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
int y = get_global_id(1);
if ((x < cols) && (y < rows))
{
x = x << 2;
@@ -268,8 +268,8 @@ __kernel void merge_vector_C3_D0(__global uchar *mat_dst, int dst_step, int ds
int src1_index = mad24(y, src1_step, x + src1_offset - offset_cols);
int src2_index = mad24(y, src2_step, x + src2_offset - offset_cols);
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + 3 * x - offset_cols * 3);
uchar data0_0 = *(mat_src0 + src0_index + 0);
@@ -322,8 +322,8 @@ __kernel void merge_vector_C3_D1(__global char *mat_dst, int dst_step, int dst
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
int y = get_global_id(1);
if ((x < cols) && (y < rows))
{
x = x << 2;
@@ -332,8 +332,8 @@ __kernel void merge_vector_C3_D1(__global char *mat_dst, int dst_step, int dst
int src1_index = mad24(y, src1_step, x + src1_offset - offset_cols);
int src2_index = mad24(y, src2_step, x + src2_offset - offset_cols);
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + 3 * x - offset_cols * 3);
char data0_0 = *(mat_src0 + src0_index + 0);
@@ -386,8 +386,8 @@ __kernel void merge_vector_C3_D2(__global ushort *mat_dst, int dst_step, int d
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
int y = get_global_id(1);
if ((x < cols) && (y < rows))
{
x = x << 1;
@@ -396,8 +396,8 @@ __kernel void merge_vector_C3_D2(__global ushort *mat_dst, int dst_step, int d
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - offset_cols);
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - offset_cols);
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + 6 * x - offset_cols * 6);
ushort data0_0 = *((__global ushort *)((__global char *)mat_src0 + src0_index + 0));
@@ -438,8 +438,8 @@ __kernel void merge_vector_C3_D3(__global short *mat_dst, int dst_step, int ds
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
int y = get_global_id(1);
if ((x < cols) && (y < rows))
{
x = x << 1;
@@ -448,8 +448,8 @@ __kernel void merge_vector_C3_D3(__global short *mat_dst, int dst_step, int ds
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - offset_cols);
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - offset_cols);
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + 6 * x - offset_cols * 6);
short data0_0 = *((__global short *)((__global char *)mat_src0 + src0_index + 0));
@@ -490,13 +490,13 @@ __kernel void merge_vector_C3_D4(__global int *mat_dst, int dst_step, int dst_
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
int y = get_global_id(1);
if ((x < cols) && (y < rows))
{
int src0_index = mad24(y, src0_step, src0_offset);
int src1_index = mad24(y, src1_step, src1_offset);
int src2_index = mad24(y, src2_step, src2_offset);
int src0_index = mad24(y, src0_step, src0_offset);
int src1_index = mad24(y, src1_step, src1_offset);
int src2_index = mad24(y, src2_step, src2_offset);
int dst_index = mad24(y, dst_step , dst_offset);
@@ -524,13 +524,13 @@ __kernel void merge_vector_C3_D5(__global float *mat_dst, int dst_step, int ds
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
int y = get_global_id(1);
if ((x < cols) && (y < rows))
{
int src0_index = mad24(y, src0_step, src0_offset);
int src1_index = mad24(y, src1_step, src1_offset);
int src2_index = mad24(y, src2_step, src2_offset);
int src0_index = mad24(y, src0_step, src0_offset);
int src1_index = mad24(y, src1_step, src1_offset);
int src2_index = mad24(y, src2_step, src2_offset);
int dst_index = mad24(y, dst_step , dst_offset);
@@ -560,13 +560,13 @@ __kernel void merge_vector_C3_D6(__global double *mat_dst, int dst_step, int d
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
int y = get_global_id(1);
if ((x < cols) && (y < rows))
{
int src0_index = mad24(y, src0_step, src0_offset);
int src1_index = mad24(y, src1_step, src1_offset);
int src2_index = mad24(y, src2_step, src2_offset);
int src0_index = mad24(y, src0_step, src0_offset);
int src1_index = mad24(y, src1_step, src1_offset);
int src2_index = mad24(y, src2_step, src2_offset);
int dst_index = mad24(y, dst_step , dst_offset);
@@ -596,14 +596,14 @@ __kernel void merge_vector_C4_D0(__global uchar *mat_dst, int dst_step, int ds
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
int y = get_global_id(1);
if ((x < cols) && (y < rows))
{
int src0_index = mad24(y, src0_step, src0_offset);
int src1_index = mad24(y, src1_step, src1_offset);
int src2_index = mad24(y, src2_step, src2_offset);
int src3_index = mad24(y, src3_step, src3_offset);
int src0_index = mad24(y, src0_step, src0_offset);
int src1_index = mad24(y, src1_step, src1_offset);
int src2_index = mad24(y, src2_step, src2_offset);
int src3_index = mad24(y, src3_step, src3_offset);
int dst_index = mad24(y, dst_step , dst_offset);
uchar src0 = *(mat_src0 + src0_index + x );
@@ -622,14 +622,14 @@ __kernel void merge_vector_C4_D1(__global char *mat_dst, int dst_step, int dst
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
int y = get_global_id(1);
if ((x < cols) && (y < rows))
{
int src0_index = mad24(y, src0_step, src0_offset);
int src1_index = mad24(y, src1_step, src1_offset);
int src2_index = mad24(y, src2_step, src2_offset);
int src3_index = mad24(y, src3_step, src3_offset);
int src0_index = mad24(y, src0_step, src0_offset);
int src1_index = mad24(y, src1_step, src1_offset);
int src2_index = mad24(y, src2_step, src2_offset);
int src3_index = mad24(y, src3_step, src3_offset);
int dst_index = mad24(y, dst_step , dst_offset);
char src0 = *(mat_src0 + src0_index + x );
@@ -648,14 +648,14 @@ __kernel void merge_vector_C4_D2(__global ushort *mat_dst, int dst_step, int d
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
int y = get_global_id(1);
if ((x < cols) && (y < rows))
{
int src0_index = mad24(y, src0_step, src0_offset);
int src1_index = mad24(y, src1_step, src1_offset);
int src2_index = mad24(y, src2_step, src2_offset);
int src3_index = mad24(y, src3_step, src3_offset);
int src0_index = mad24(y, src0_step, src0_offset);
int src1_index = mad24(y, src1_step, src1_offset);
int src2_index = mad24(y, src2_step, src2_offset);
int src3_index = mad24(y, src3_step, src3_offset);
int dst_index = mad24(y, dst_step , dst_offset);
ushort src0 = *((__global ushort *)((__global uchar *)mat_src0 + src0_index + (x << 1)));
@@ -674,14 +674,14 @@ __kernel void merge_vector_C4_D3(__global short *mat_dst, int dst_step, int ds
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
int y = get_global_id(1);
if ((x < cols) && (y < rows))
{
int src0_index = mad24(y, src0_step, src0_offset);
int src1_index = mad24(y, src1_step, src1_offset);
int src2_index = mad24(y, src2_step, src2_offset);
int src3_index = mad24(y, src3_step, src3_offset);
int src0_index = mad24(y, src0_step, src0_offset);
int src1_index = mad24(y, src1_step, src1_offset);
int src2_index = mad24(y, src2_step, src2_offset);
int src3_index = mad24(y, src3_step, src3_offset);
int dst_index = mad24(y, dst_step , dst_offset);
short src0 = *((__global short *)((__global uchar *)mat_src0 + src0_index + (x << 1)));
@@ -700,14 +700,14 @@ __kernel void merge_vector_C4_D4(__global int *mat_dst, int dst_step, int dst_
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
int y = get_global_id(1);
if ((x < cols) && (y < rows))
{
int src0_index = mad24(y, src0_step, src0_offset);
int src1_index = mad24(y, src1_step, src1_offset);
int src2_index = mad24(y, src2_step, src2_offset);
int src3_index = mad24(y, src3_step, src3_offset);
int src0_index = mad24(y, src0_step, src0_offset);
int src1_index = mad24(y, src1_step, src1_offset);
int src2_index = mad24(y, src2_step, src2_offset);
int src3_index = mad24(y, src3_step, src3_offset);
int dst_index = mad24(y, dst_step , dst_offset);
int src0 = *((__global int *)((__global uchar *)mat_src0 + src0_index + (x << 2)));
@@ -726,14 +726,14 @@ __kernel void merge_vector_C4_D5(__global float *mat_dst, int dst_step, int ds
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
int y = get_global_id(1);
if ((x < cols) && (y < rows))
{
int src0_index = mad24(y, src0_step, src0_offset);
int src1_index = mad24(y, src1_step, src1_offset);
int src2_index = mad24(y, src2_step, src2_offset);
int src3_index = mad24(y, src3_step, src3_offset);
int src0_index = mad24(y, src0_step, src0_offset);
int src1_index = mad24(y, src1_step, src1_offset);
int src2_index = mad24(y, src2_step, src2_offset);
int src3_index = mad24(y, src3_step, src3_offset);
int dst_index = mad24(y, dst_step , dst_offset);
float src0 = *((__global float *)((__global uchar *)mat_src0 + src0_index + (x << 2)));
@@ -754,14 +754,14 @@ __kernel void merge_vector_C4_D6(__global double *mat_dst, int dst_step, int d
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
int y = get_global_id(1);
if ((x < cols) && (y < rows))
{
int src0_index = mad24(y, src0_step, src0_offset);
int src1_index = mad24(y, src1_step, src1_offset);
int src2_index = mad24(y, src2_step, src2_offset);
int src3_index = mad24(y, src3_step, src3_offset);
int src0_index = mad24(y, src0_step, src0_offset);
int src1_index = mad24(y, src1_step, src1_offset);
int src2_index = mad24(y, src2_step, src2_offset);
int src3_index = mad24(y, src3_step, src3_offset);
int dst_index = mad24(y, dst_step , dst_offset);
double src0 = *((__global double *)((__global uchar *)mat_src0 + src0_index + (x << 3)));
@@ -783,8 +783,8 @@ __kernel void merge_vector_C2_D0_1(int rows, int cols,
__global uchar *mat_src1, int src1_step)
{
int x = get_global_id(0);
int y = get_global_id(1);
int y = get_global_id(1);
if ((x < cols) && (y < rows))
{
__global uchar4 *src0_y = (__global uchar4 * )(mat_src0 + y * src0_step);
@@ -807,8 +807,8 @@ __kernel void merge_vector_C2_D1_1(int rows, int cols,
__global char *mat_src1, int src1_step)
{
int x = get_global_id(0);
int y = get_global_id(1);
int y = get_global_id(1);
if ((x < cols) && (y < rows))
{
__global char4 *src0_y = (__global char4 * )(mat_src0 + y * src0_step);
@@ -831,8 +831,8 @@ __kernel void merge_vector_C2_D2_1(int rows, int cols,
__global ushort *mat_src1, int src1_step)
{
int x = get_global_id(0);
int y = get_global_id(1);
int y = get_global_id(1);
if ((x < cols) && (y < rows))
{
__global ushort2 *src0_y = (__global ushort2 *)((__global uchar *)mat_src0 + y * src0_step);
@@ -855,8 +855,8 @@ __kernel void merge_vector_C2_D3_1(int rows, int cols,
__global short *mat_src1, int src1_step)
{
int x = get_global_id(0);
int y = get_global_id(1);
int y = get_global_id(1);
if ((x < cols) && (y < rows))
{
__global short2 *src0_y = (__global short2 *)((__global uchar *)mat_src0 + y * src0_step);
@@ -880,8 +880,8 @@ __kernel void merge_vector_C2_D4_1(int rows, int cols,
__global int *mat_src1, int src1_step)
{
int x = get_global_id(0);
int y = get_global_id(1);
int y = get_global_id(1);
if ((x < cols) && (y < rows))
{
__global int *src0_y = (__global int *)((__global uchar *)mat_src0 + y * src0_step);
@@ -904,8 +904,8 @@ __kernel void merge_vector_C2_D5_1(int rows, int cols,
__global float *mat_src1, int src1_step)
{
int x = get_global_id(0);
int y = get_global_id(1);
int y = get_global_id(1);
if ((x < cols) && (y < rows))
{
__global float *src0_y = (__global float *)((__global uchar *)mat_src0 + y * src0_step);
@@ -915,7 +915,7 @@ __kernel void merge_vector_C2_D5_1(int rows, int cols,
float value1 = src0_y[x];
float value2 = src1_y[x];
dst_y[x] = (float2)(value1, value2);
dst_y[x] = (float2)(value1, value2);
}
}
@@ -926,8 +926,8 @@ __kernel void merge_vector_C2_D6_1(int rows, int cols,
__global double *mat_src1, int src1_step)
{
int x = get_global_id(0);
int y = get_global_id(1);
int y = get_global_id(1);
if ((x < cols) && (y < rows))
{
__global double *src0_y = (__global double *)((__global uchar *)mat_src0 + y * src0_step);
@@ -949,8 +949,8 @@ __kernel void merge_vector_C3_D0_1(int rows, int cols,
__global uchar *mat_src2, int src2_step)
{
int x = get_global_id(0);
int y = get_global_id(1);
int y = get_global_id(1);
if ((x < cols) && (y < rows))
{
__global uchar4 *src0_y = (__global uchar4 * )(mat_src0 + y * src0_step);
@@ -981,8 +981,8 @@ __kernel void merge_vector_C3_D1_1(int rows, int cols,
__global char *mat_src2, int src2_step)
{
int x = get_global_id(0);
int y = get_global_id(1);
int y = get_global_id(1);
if ((x < cols) && (y < rows))
{
__global char4 *src0_y = (__global char4 * )(mat_src0 + y * src0_step);
@@ -1027,8 +1027,8 @@ __kernel void merge_vector_C3_D2_1(int rows, int cols,
__global ushort *mat_src2, int src2_step)
{
int x = get_global_id(0);
int y = get_global_id(1);
int y = get_global_id(1);
if ((x < cols) && (y < rows))
{
__global ushort2 *src0_y = (__global ushort2 * )((__global char *)mat_src0 + y * src0_step);
@@ -1054,8 +1054,8 @@ __kernel void merge_vector_C3_D3_1(int rows, int cols,
__global short *mat_src2, int src2_step)
{
int x = get_global_id(0);
int y = get_global_id(1);
int y = get_global_id(1);
if ((x < cols) && (y < rows))
{
__global short2 *src0_y = (__global short2 * )((__global char *)mat_src0 + y * src0_step);
@@ -1091,8 +1091,8 @@ __kernel void merge_vector_C3_D4_1(int rows, int cols,
__global int *mat_src2, int src2_step)
{
int x = get_global_id(0);
int y = get_global_id(1);
int y = get_global_id(1);
if ((x < cols) && (y < rows))
{
__global int *src0_y = (__global int * )((__global char *)mat_src0 + y * src0_step);
@@ -1123,8 +1123,8 @@ __kernel void merge_vector_C3_D5_1(int rows, int cols,
__global float *mat_src2, int src2_step)
{
int x = get_global_id(0);
int y = get_global_id(1);
int y = get_global_id(1);
if ((x < cols) && (y < rows))
{
__global float *src0_y = (__global float * )((__global char *)mat_src0 + y * src0_step);
@@ -1151,8 +1151,8 @@ __kernel void merge_vector_C3_D6_1(int rows, int cols,
__global double *mat_src2, int src2_step)
{
int x = get_global_id(0);
int y = get_global_id(1);
int y = get_global_id(1);
if ((x < cols) && (y < rows))
{
__global double *src0_y = (__global double * )((__global char *)mat_src0 + y * src0_step);
@@ -1179,8 +1179,8 @@ __kernel void merge_vector_C4_D0_1(int rows, int cols,
__global uchar *mat_src3, int src3_step)
{
int x = get_global_id(0);
int y = get_global_id(1);
int y = get_global_id(1);
if ((x < cols) && (y < rows))
{
__global uchar4 *src0_y = (__global uchar4 * )(mat_src0 + y * src0_step);
@@ -1196,7 +1196,7 @@ __kernel void merge_vector_C4_D0_1(int rows, int cols,
uchar4 value3 = src3_y[x];
dst_y[x] = (uchar16)(value0.x, value1.x, value2.x, value3.x,
value0.y, value1.y, value2.y, value3.y,
value0.y, value1.y, value2.y, value3.y,
value0.z, value1.z, value2.z, value3.z,
value0.w, value1.w, value2.w, value3.w);
}
@@ -1210,8 +1210,8 @@ __kernel void merge_vector_C4_D1_1(int rows, int cols,
__global char *mat_src3, int src3_step)
{
int x = get_global_id(0);
int y = get_global_id(1);
int y = get_global_id(1);
if ((x < cols) && (y < rows))
{
__global char4 *src0_y = (__global char4 * )(mat_src0 + y * src0_step);
@@ -1227,7 +1227,7 @@ __kernel void merge_vector_C4_D1_1(int rows, int cols,
char4 value3 = src3_y[x];
dst_y[x] = (char16)(value0.x, value1.x, value2.x, value3.x,
value0.y, value1.y, value2.y, value3.y,
value0.y, value1.y, value2.y, value3.y,
value0.z, value1.z, value2.z, value3.z,
value0.w, value1.w, value2.w, value3.w);
}
@@ -1240,8 +1240,8 @@ __kernel void merge_vector_C4_D2_1(int rows, int cols,
__global ushort *mat_src3, int src3_step)
{
int x = get_global_id(0);
int y = get_global_id(1);
int y = get_global_id(1);
if ((x < cols) && (y < rows))
{
__global ushort2 *src0_y = (__global ushort2 * )((__global uchar*)mat_src0 + y * src0_step);
@@ -1257,7 +1257,7 @@ __kernel void merge_vector_C4_D2_1(int rows, int cols,
ushort2 value3 = src3_y[x];
dst_y[x] = (ushort8)(value0.x, value1.x, value2.x, value3.x,
value0.y, value1.y, value2.y, value3.y);
value0.y, value1.y, value2.y, value3.y);
}
}
__kernel void merge_vector_C4_D3_1(int rows, int cols,
@@ -1268,8 +1268,8 @@ __kernel void merge_vector_C4_D3_1(int rows, int cols,
__global short *mat_src3, int src3_step)
{
int x = get_global_id(0);
int y = get_global_id(1);
int y = get_global_id(1);
if ((x < cols) && (y < rows))
{
__global short2 *src0_y = (__global short2 * )((__global uchar*)mat_src0 + y * src0_step);
@@ -1285,7 +1285,7 @@ __kernel void merge_vector_C4_D3_1(int rows, int cols,
short2 value3 = src3_y[x];
dst_y[x] = (short8)(value0.x, value1.x, value2.x, value3.x,
value0.y, value1.y, value2.y, value3.y);
value0.y, value1.y, value2.y, value3.y);
}
}
__kernel void merge_vector_C4_D4_1(int rows, int cols,
@@ -1296,8 +1296,8 @@ __kernel void merge_vector_C4_D4_1(int rows, int cols,
__global int *mat_src3, int src3_step)
{
int x = get_global_id(0);
int y = get_global_id(1);
int y = get_global_id(1);
if ((x < cols) && (y < rows))
{
__global int *src0_y = (__global int * )((__global uchar*)mat_src0 + y * src0_step);
@@ -1323,8 +1323,8 @@ __kernel void merge_vector_C4_D5_1(int rows, int cols,
__global float *mat_src3, int src3_step)
{
int x = get_global_id(0);
int y = get_global_id(1);
int y = get_global_id(1);
if ((x < cols) && (y < rows))
{
__global float *src0_y = (__global float * )((__global uchar*)mat_src0 + y * src0_step);
@@ -1352,8 +1352,8 @@ __kernel void merge_vector_C4_D6_1(int rows, int cols,
__global double *mat_src3, int src3_step)
{
int x = get_global_id(0);
int y = get_global_id(1);
int y = get_global_id(1);
if ((x < cols) && (y < rows))
{
__global double *src0_y = (__global double * )((__global uchar*)mat_src0 + y * src0_step);

View File

@@ -210,7 +210,7 @@ __kernel void icvCalcLayerDetAndTrace(
const float dxy = icvCalcHaarPatternSum_4(sumTex, c_DXY, 9, size, i << c_octave, j << c_octave);
det [j + margin + det_step * (layer * c_layer_rows + i + margin)] = dx * dy - 0.81f * dxy * dxy;
trace[j + margin + trace_step * (layer * c_layer_rows + i + margin)] = dx + dy;
trace[j + margin + trace_step * (layer * c_layer_rows + i + margin)] = dx + dy;
}
}
@@ -246,9 +246,9 @@ bool within_check(image2d_t maskSumTex, int sum_i, int sum_j, int size)
// Non-maximal suppression to further filtering the candidates from previous step
__kernel
void icvFindMaximaInLayer_withmask(
__global const float * det,
__global const float * trace,
__global int4 * maxPosBuffer,
__global const float * det,
__global const float * trace,
__global int4 * maxPosBuffer,
volatile __global unsigned int* maxCounter,
int counter_offset,
int det_step, // the step of det in bytes
@@ -288,26 +288,26 @@ __kernel
// Is this thread within the hessian buffer?
const int zoff = get_local_size(0) * get_local_size(1);
const int localLin = get_local_id(0) + get_local_id(1) * get_local_size(0) + zoff;
N9[localLin - zoff] =
det[det_step *
N9[localLin - zoff] =
det[det_step *
(c_layer_rows * (layer - 1) + min(max(i, 0), c_img_rows - 1)) // y
+ min(max(j, 0), c_img_cols - 1)]; // x
N9[localLin ] =
det[det_step *
N9[localLin ] =
det[det_step *
(c_layer_rows * (layer ) + min(max(i, 0), c_img_rows - 1)) // y
+ min(max(j, 0), c_img_cols - 1)]; // x
N9[localLin + zoff] =
det[det_step *
N9[localLin + zoff] =
det[det_step *
(c_layer_rows * (layer + 1) + min(max(i, 0), c_img_rows - 1)) // y
+ min(max(j, 0), c_img_cols - 1)]; // x
barrier(CLK_LOCAL_MEM_FENCE);
if (i < c_layer_rows - margin
if (i < c_layer_rows - margin
&& j < c_layer_cols - margin
&& get_local_id(0) > 0
&& get_local_id(0) > 0
&& get_local_id(0) < get_local_size(0) - 1
&& get_local_id(1) > 0
&& get_local_id(1) > 0
&& get_local_id(1) < get_local_size(1) - 1 // these are unnecessary conditions ported from CUDA
)
{
@@ -372,9 +372,9 @@ __kernel
__kernel
void icvFindMaximaInLayer(
__global float * det,
__global float * trace,
__global int4 * maxPosBuffer,
__global float * det,
__global float * trace,
__global int4 * maxPosBuffer,
volatile __global unsigned int* maxCounter,
int counter_offset,
int det_step, // the step of det in bytes
@@ -417,19 +417,19 @@ __kernel
int l_x = min(max(j, 0), c_img_cols - 1);
int l_y = c_layer_rows * layer + min(max(i, 0), c_img_rows - 1);
N9[localLin - zoff] =
N9[localLin - zoff] =
det[det_step * (l_y - c_layer_rows) + l_x];
N9[localLin ] =
N9[localLin ] =
det[det_step * (l_y ) + l_x];
N9[localLin + zoff] =
N9[localLin + zoff] =
det[det_step * (l_y + c_layer_rows) + l_x];
barrier(CLK_LOCAL_MEM_FENCE);
if (i < c_layer_rows - margin
if (i < c_layer_rows - margin
&& j < c_layer_cols - margin
&& get_local_id(0) > 0
&& get_local_id(0) > 0
&& get_local_id(0) < get_local_size(0) - 1
&& get_local_id(1) > 0
&& get_local_id(1) > 0
&& get_local_id(1) < get_local_size(1) - 1 // these are unnecessary conditions ported from CUDA
)
{
@@ -497,17 +497,17 @@ inline bool solve3x3_float(volatile __local const float A[3][3], volatile __loc
{
F invdet = 1.0 / det;
x[0] = invdet *
x[0] = invdet *
(b[0] * (A[1][1] * A[2][2] - A[1][2] * A[2][1]) -
A[0][1] * (b[1] * A[2][2] - A[1][2] * b[2] ) +
A[0][2] * (b[1] * A[2][1] - A[1][1] * b[2] ));
x[1] = invdet *
x[1] = invdet *
(A[0][0] * (b[1] * A[2][2] - A[1][2] * b[2] ) -
b[0] * (A[1][0] * A[2][2] - A[1][2] * A[2][0]) +
A[0][2] * (A[1][0] * b[2] - b[1] * A[2][0]));
x[2] = invdet *
x[2] = invdet *
(A[0][0] * (A[1][1] * b[2] - b[1] * A[2][1]) -
A[0][1] * (A[1][0] * b[2] - b[1] * A[2][0]) +
b[0] * (A[1][0] * A[2][1] - A[1][1] * A[2][0]));
@@ -528,9 +528,9 @@ inline bool solve3x3_float(volatile __local const float A[3][3], volatile __loc
////////////////////////////////////////////////////////////////////////
// INTERPOLATION
__kernel
__kernel
void icvInterpolateKeypoint(
__global const float * det,
__global const float * det,
__global const int4 * maxPosBuffer,
__global float * keypoints,
volatile __global unsigned int * featureCounter,
@@ -560,7 +560,7 @@ __kernel
volatile __local float N9[3][3][3];
N9[get_local_id(2)][get_local_id(1)][get_local_id(0)] =
N9[get_local_id(2)][get_local_id(1)][get_local_id(0)] =
det[det_step * (c_layer_rows * layer + i) + j];
barrier(CLK_LOCAL_MEM_FENCE);
@@ -658,27 +658,27 @@ __kernel
__constant float c_aptX[ORI_SAMPLES] = {-6, -5, -5, -5, -5, -5, -5, -5, -4, -4, -4, -4, -4, -4, -4, -4, -4, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 6};
__constant float c_aptY[ORI_SAMPLES] = {0, -3, -2, -1, 0, 1, 2, 3, -4, -3, -2, -1, 0, 1, 2, 3, 4, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, -4, -3, -2, -1, 0, 1, 2, 3, 4, -3, -2, -1, 0, 1, 2, 3, 0};
__constant float c_aptW[ORI_SAMPLES] = {0.001455130288377404f, 0.001707611023448408f, 0.002547456417232752f, 0.003238451667129993f, 0.0035081731621176f,
0.003238451667129993f, 0.002547456417232752f, 0.001707611023448408f, 0.002003900473937392f, 0.0035081731621176f, 0.005233579315245152f,
0.00665318313986063f, 0.00720730796456337f, 0.00665318313986063f, 0.005233579315245152f, 0.0035081731621176f,
0.002003900473937392f, 0.001707611023448408f, 0.0035081731621176f, 0.006141661666333675f, 0.009162282571196556f,
0.01164754293859005f, 0.01261763460934162f, 0.01164754293859005f, 0.009162282571196556f, 0.006141661666333675f,
0.0035081731621176f, 0.001707611023448408f, 0.002547456417232752f, 0.005233579315245152f, 0.009162282571196556f,
0.01366852037608624f, 0.01737609319388866f, 0.0188232995569706f, 0.01737609319388866f, 0.01366852037608624f,
0.009162282571196556f, 0.005233579315245152f, 0.002547456417232752f, 0.003238451667129993f, 0.00665318313986063f,
0.01164754293859005f, 0.01737609319388866f, 0.02208934165537357f, 0.02392910048365593f, 0.02208934165537357f,
0.01737609319388866f, 0.01164754293859005f, 0.00665318313986063f, 0.003238451667129993f, 0.001455130288377404f,
0.0035081731621176f, 0.00720730796456337f, 0.01261763460934162f, 0.0188232995569706f, 0.02392910048365593f,
0.02592208795249462f, 0.02392910048365593f, 0.0188232995569706f, 0.01261763460934162f, 0.00720730796456337f,
0.0035081731621176f, 0.001455130288377404f, 0.003238451667129993f, 0.00665318313986063f, 0.01164754293859005f,
0.01737609319388866f, 0.02208934165537357f, 0.02392910048365593f, 0.02208934165537357f, 0.01737609319388866f,
__constant float c_aptW[ORI_SAMPLES] = {0.001455130288377404f, 0.001707611023448408f, 0.002547456417232752f, 0.003238451667129993f, 0.0035081731621176f,
0.003238451667129993f, 0.002547456417232752f, 0.001707611023448408f, 0.002003900473937392f, 0.0035081731621176f, 0.005233579315245152f,
0.00665318313986063f, 0.00720730796456337f, 0.00665318313986063f, 0.005233579315245152f, 0.0035081731621176f,
0.002003900473937392f, 0.001707611023448408f, 0.0035081731621176f, 0.006141661666333675f, 0.009162282571196556f,
0.01164754293859005f, 0.01261763460934162f, 0.01164754293859005f, 0.009162282571196556f, 0.006141661666333675f,
0.0035081731621176f, 0.001707611023448408f, 0.002547456417232752f, 0.005233579315245152f, 0.009162282571196556f,
0.01366852037608624f, 0.01737609319388866f, 0.0188232995569706f, 0.01737609319388866f, 0.01366852037608624f,
0.009162282571196556f, 0.005233579315245152f, 0.002547456417232752f, 0.003238451667129993f, 0.00665318313986063f,
0.01164754293859005f, 0.01737609319388866f, 0.02208934165537357f, 0.02392910048365593f, 0.02208934165537357f,
0.01737609319388866f, 0.01164754293859005f, 0.00665318313986063f, 0.003238451667129993f, 0.001455130288377404f,
0.0035081731621176f, 0.00720730796456337f, 0.01261763460934162f, 0.0188232995569706f, 0.02392910048365593f,
0.02592208795249462f, 0.02392910048365593f, 0.0188232995569706f, 0.01261763460934162f, 0.00720730796456337f,
0.0035081731621176f, 0.001455130288377404f, 0.003238451667129993f, 0.00665318313986063f, 0.01164754293859005f,
0.01737609319388866f, 0.02208934165537357f, 0.02392910048365593f, 0.02208934165537357f, 0.01737609319388866f,
0.01164754293859005f, 0.00665318313986063f, 0.003238451667129993f, 0.002547456417232752f, 0.005233579315245152f,
0.009162282571196556f, 0.01366852037608624f, 0.01737609319388866f, 0.0188232995569706f, 0.01737609319388866f,
0.01366852037608624f, 0.009162282571196556f, 0.005233579315245152f, 0.002547456417232752f, 0.001707611023448408f,
0.0035081731621176f, 0.006141661666333675f, 0.009162282571196556f, 0.01164754293859005f, 0.01261763460934162f,
0.009162282571196556f, 0.01366852037608624f, 0.01737609319388866f, 0.0188232995569706f, 0.01737609319388866f,
0.01366852037608624f, 0.009162282571196556f, 0.005233579315245152f, 0.002547456417232752f, 0.001707611023448408f,
0.0035081731621176f, 0.006141661666333675f, 0.009162282571196556f, 0.01164754293859005f, 0.01261763460934162f,
0.01164754293859005f, 0.009162282571196556f, 0.006141661666333675f, 0.0035081731621176f, 0.001707611023448408f,
0.002003900473937392f, 0.0035081731621176f, 0.005233579315245152f, 0.00665318313986063f, 0.00720730796456337f,
0.00665318313986063f, 0.005233579315245152f, 0.0035081731621176f, 0.002003900473937392f, 0.001707611023448408f,
0.002003900473937392f, 0.0035081731621176f, 0.005233579315245152f, 0.00665318313986063f, 0.00720730796456337f,
0.00665318313986063f, 0.005233579315245152f, 0.0035081731621176f, 0.002003900473937392f, 0.001707611023448408f,
0.002547456417232752f, 0.003238451667129993f, 0.0035081731621176f, 0.003238451667129993f, 0.002547456417232752f,
0.001707611023448408f, 0.001455130288377404f};
@@ -691,13 +691,13 @@ void reduce_32_sum(volatile __local float * data, float partial_reduction, int
data[tid] = partial_reduction;
barrier(CLK_LOCAL_MEM_FENCE);
if (tid < 16)
if (tid < 16)
{
data[tid] = partial_reduction = op(partial_reduction, data[tid + 16]);
data[tid] = partial_reduction = op(partial_reduction, data[tid + 8 ]);
data[tid] = partial_reduction = op(partial_reduction, data[tid + 4 ]);
data[tid] = partial_reduction = op(partial_reduction, data[tid + 2 ]);
data[tid] = partial_reduction = op(partial_reduction, data[tid + 1 ]);
data[tid] = partial_reduction = op(partial_reduction, data[tid + 1 ]);
}
#undef op
}
@@ -758,7 +758,7 @@ __kernel
Y = c_aptW[tid] * icvCalcHaarPatternSum_2(sumTex, c_NY, 4, grad_wav_size, y, x);
angle = atan2(Y, X);
if (angle < 0)
angle += 2.0f * CV_PI_F;
angle *= 180.0f / CV_PI_F;
@@ -769,7 +769,7 @@ __kernel
s_Y[tid] = Y;
s_angle[tid] = angle;
barrier(CLK_LOCAL_MEM_FENCE);
float bestx = 0, besty = 0, best_mod = 0;
#pragma unroll
@@ -881,8 +881,8 @@ __constant float c_DW[PATCH_SZ * PATCH_SZ] =
// utility for linear filter
inline uchar readerGet(
image2d_t src,
const float centerX, const float centerY, const float win_offset, const float cos_dir, const float sin_dir,
image2d_t src,
const float centerX, const float centerY, const float win_offset, const float cos_dir, const float sin_dir,
int i, int j
)
{
@@ -892,8 +892,8 @@ inline uchar readerGet(
}
inline float linearFilter(
image2d_t src,
const float centerX, const float centerY, const float win_offset, const float cos_dir, const float sin_dir,
image2d_t src,
const float centerX, const float centerY, const float win_offset, const float cos_dir, const float sin_dir,
float y, float x
)
{
@@ -927,9 +927,9 @@ void calc_dx_dy(
volatile __local float s_dx_bin[25],
volatile __local float s_dy_bin[25],
volatile __local float s_PATCH[6][6],
__global const float* featureX,
__global const float* featureY,
__global const float* featureSize,
__global const float* featureX,
__global const float* featureY,
__global const float* featureSize,
__global const float* featureDir
)
{
@@ -976,26 +976,26 @@ void calc_dx_dy(
const float dw = c_DW[yIndex * PATCH_SZ + xIndex];
const float vx = (
s_PATCH[get_local_id(1) ][get_local_id(0) + 1] -
s_PATCH[get_local_id(1) ][get_local_id(0) ] +
s_PATCH[get_local_id(1) + 1][get_local_id(0) + 1] -
s_PATCH[get_local_id(1) + 1][get_local_id(0) ])
s_PATCH[get_local_id(1) ][get_local_id(0) + 1] -
s_PATCH[get_local_id(1) ][get_local_id(0) ] +
s_PATCH[get_local_id(1) + 1][get_local_id(0) + 1] -
s_PATCH[get_local_id(1) + 1][get_local_id(0) ])
* dw;
const float vy = (
s_PATCH[get_local_id(1) + 1][get_local_id(0) ] -
s_PATCH[get_local_id(1) ][get_local_id(0) ] +
s_PATCH[get_local_id(1) + 1][get_local_id(0) + 1] -
s_PATCH[get_local_id(1) ][get_local_id(0) + 1])
s_PATCH[get_local_id(1) + 1][get_local_id(0) ] -
s_PATCH[get_local_id(1) ][get_local_id(0) ] +
s_PATCH[get_local_id(1) + 1][get_local_id(0) + 1] -
s_PATCH[get_local_id(1) ][get_local_id(0) + 1])
* dw;
s_dx_bin[tid] = vx;
s_dy_bin[tid] = vy;
}
}
void reduce_sum25(
volatile __local float* sdata1,
volatile __local float* sdata2,
volatile __local float* sdata3,
volatile __local float* sdata4,
volatile __local float* sdata1,
volatile __local float* sdata2,
volatile __local float* sdata3,
volatile __local float* sdata4,
int tid
)
{
@@ -1033,10 +1033,10 @@ void reduce_sum25(
}
}
__kernel
__kernel
void compute_descriptors64(
image2d_t imgTex,
volatile __global float * descriptors,
volatile __global float * descriptors,
__global const float * keypoints,
int descriptors_step,
int keypoints_step
@@ -1083,10 +1083,10 @@ __kernel
}
}
}
__kernel
__kernel
void compute_descriptors128(
image2d_t imgTex,
__global volatile float * descriptors,
__global volatile float * descriptors,
__global float * keypoints,
int descriptors_step,
int keypoints_step
@@ -1178,7 +1178,7 @@ __kernel
}
}
__kernel
__kernel
void normalize_descriptors128(__global float * descriptors, int descriptors_step)
{
descriptors_step /= sizeof(*descriptors);
@@ -1219,7 +1219,7 @@ __kernel
// normalize and store in output
descriptor_base[get_local_id(0)] = lookup / len;
}
__kernel
__kernel
void normalize_descriptors64(__global float * descriptors, int descriptors_step)
{
descriptors_step /= sizeof(*descriptors);

View File

@@ -54,10 +54,10 @@
//----------------------------------------------------------------------------
// Histogram computation
__kernel void compute_hists_kernel(const int width, const int cblock_stride_x, const int cblock_stride_y,
const int cnbins, const int cblock_hist_size, const int img_block_width,
const int grad_quadstep, const int qangle_step,
__global const float* grad, __global const uchar* qangle,
__kernel void compute_hists_kernel(const int width, const int cblock_stride_x, const int cblock_stride_y,
const int cnbins, const int cblock_hist_size, const int img_block_width,
const int grad_quadstep, const int qangle_step,
__global const float* grad, __global const uchar* qangle,
const float scale, __global float* block_hists, __local float* smem)
{
const int lidX = get_local_id(0);
@@ -213,10 +213,10 @@ __kernel void classify_hists_kernel(const int cblock_hist_size, const int cdescr
products[tid] = product;
barrier(CLK_LOCAL_MEM_FENCE);
if (tid < 128) products[tid] = product = product + products[tid + 128];
barrier(CLK_LOCAL_MEM_FENCE);
if (tid < 64) products[tid] = product = product + products[tid + 64];
barrier(CLK_LOCAL_MEM_FENCE);
@@ -240,12 +240,12 @@ __kernel void classify_hists_kernel(const int cblock_hist_size, const int cdescr
__kernel void extract_descrs_by_rows_kernel(const int cblock_hist_size, const int descriptors_quadstep, const int cdescr_size, const int cdescr_width,
const int img_block_width, const int win_block_stride_x, const int win_block_stride_y,
__global const float* block_hists, __global float* descriptors)
__global const float* block_hists, __global float* descriptors)
{
int tid = get_local_id(0);
int gidX = get_group_id(0);
int gidY = get_group_id(1);
// Get left top corner of the window in src
__global const float* hist = block_hists + (gidY * win_block_stride_y * img_block_width + gidX * win_block_stride_x) * cblock_hist_size;
@@ -261,7 +261,7 @@ __kernel void extract_descrs_by_rows_kernel(const int cblock_hist_size, const in
}
}
__kernel void extract_descrs_by_cols_kernel(const int cblock_hist_size, const int descriptors_quadstep, const int cdescr_size,
__kernel void extract_descrs_by_cols_kernel(const int cblock_hist_size, const int descriptors_quadstep, const int cdescr_size,
const int cnblocks_win_x, const int cnblocks_win_y, const int img_block_width, const int win_block_stride_x,
const int win_block_stride_y, __global const float* block_hists, __global float* descriptors)
{
@@ -291,8 +291,8 @@ __kernel void extract_descrs_by_cols_kernel(const int cblock_hist_size, const in
//----------------------------------------------------------------------------
// Gradients computation
__kernel void compute_gradients_8UC4_kernel(const int height, const int width, const int img_step, const int grad_quadstep, const int qangle_step,
const __global uchar4 * img, __global float * grad, __global uchar * qangle,
__kernel void compute_gradients_8UC4_kernel(const int height, const int width, const int img_step, const int grad_quadstep, const int qangle_step,
const __global uchar4 * img, __global float * grad, __global uchar * qangle,
const float angle_scale, const char correct_gamma, const int cnbins)
{
const int x = get_global_id(0);
@@ -391,7 +391,7 @@ __kernel void compute_gradients_8UC4_kernel(const int height, const int width, c
}
__kernel void compute_gradients_8UC1_kernel(const int height, const int width, const int img_step, const int grad_quadstep, const int qangle_step,
__global const uchar * img, __global float * grad, __global uchar * qangle,
__global const uchar * img, __global float * grad, __global uchar * qangle,
const float angle_scale, const char correct_gamma, const int cnbins)
{
const int x = get_global_id(0);
@@ -453,37 +453,37 @@ __kernel void compute_gradients_8UC1_kernel(const int height, const int width, c
// Resize
__kernel void resize_8UC4_kernel(__global uchar4 * dst, __global const uchar4 * src,
int dst_offset, int src_offset, int dst_step, int src_step,
int dst_offset, int src_offset, int dst_step, int src_step,
int src_cols, int src_rows, int dst_cols, int dst_rows, float ifx, float ify )
{
int dx = get_global_id(0);
int dy = get_global_id(1);
int sx = (int)floor(dx*ifx+0.5f);
int sy = (int)floor(dy*ify+0.5f);
sx = min(sx, src_cols-1);
sy = min(sy, src_rows-1);
int dpos = (dst_offset>>2) + dy * (dst_step>>2) + dx;
int spos = (src_offset>>2) + sy * (src_step>>2) + sx;
if(dx<dst_cols && dy<dst_rows)
dst[dpos] = src[spos];
}
__kernel void resize_8UC1_kernel(__global uchar * dst, __global const uchar * src,
int dst_offset, int src_offset, int dst_step, int src_step,
int dst_offset, int src_offset, int dst_step, int src_step,
int src_cols, int src_rows, int dst_cols, int dst_rows, float ifx, float ify )
{
int dx = get_global_id(0);
int dy = get_global_id(1);
int sx = (int)floor(dx*ifx+0.5f);
int sy = (int)floor(dy*ify+0.5f);
sx = min(sx, src_cols-1);
sy = min(sy, src_rows-1);
int dpos = dst_offset + dy * dst_step + dx;
int spos = src_offset + sy * src_step + sx;
if(dx<dst_cols && dy<dst_rows)
dst[dpos] = src[spos];
}

View File

@@ -37,348 +37,348 @@
#define F2 float2
#define F4 float4
__kernel void convert_to_S4_C1_D0(
__global const int* restrict srcMat,
__global uchar* dstMat,
int cols,
int rows,
int srcStep_in_pixel,
int srcoffset_in_pixel,
int dstStep_in_pixel,
int dstoffset_in_pixel,
F alpha,
F beta)
__global const int* restrict srcMat,
__global uchar* dstMat,
int cols,
int rows,
int srcStep_in_pixel,
int srcoffset_in_pixel,
int dstStep_in_pixel,
int dstoffset_in_pixel,
F alpha,
F beta)
{
int x=get_global_id(0)<<2;
int y=get_global_id(1);
//int src_addr_start = mad24(y,srcStep_in_pixel,srcoffset_in_pixel);
//int src_addr_end = mad24(y,srcStep_in_pixel,cols+srcoffset_in_pixel);
int off_src = (dstoffset_in_pixel & 3);
int srcidx = mad24(y,srcStep_in_pixel,x+ srcoffset_in_pixel - off_src);
int dst_addr_start = mad24(y,dstStep_in_pixel,dstoffset_in_pixel);
int dst_addr_end = mad24(y,dstStep_in_pixel,cols+dstoffset_in_pixel);
int dstidx = mad24(y,dstStep_in_pixel,x+ dstoffset_in_pixel & (int)0xfffffffc);
if(x+3<cols && y<rows && off_src==0)
{
float4 temp_src = convert_float4(vload4(0,srcMat+srcidx));
*(__global uchar4*)(dstMat+dstidx) = convert_uchar4_sat(temp_src*(F4)alpha+(F4)beta);
}
else
{
if(x+3<cols && y<rows)
{
float4 temp_src = convert_float4(vload4(0,srcMat+srcidx));
uchar4 temp_dst = convert_uchar4_sat(temp_src*(F4)alpha+(F4)beta);
dstMat[dstidx] = temp_dst.x;
dstMat[dstidx+1] = temp_dst.y;
dstMat[dstidx+2] = temp_dst.z;
dstMat[dstidx+3] = temp_dst.w;
}
else if(x+2<cols && y<rows)
{
float4 temp_src = convert_float4(vload4(0,srcMat+srcidx));
uchar4 temp_dst = convert_uchar4_sat(temp_src*(F4)alpha+(F4)beta);
dstMat[dstidx] = temp_dst.x;
dstMat[dstidx+1] = temp_dst.y;
dstMat[dstidx+2] = temp_dst.z;
}
else if(x+1<cols && y<rows)
{
float2 temp_src = convert_float2(vload2(0,srcMat+srcidx));
uchar2 temp_dst = convert_uchar2_sat(temp_src*(F2)alpha+(F2)beta);
dstMat[dstidx] = temp_dst.x;
dstMat[dstidx+1] = temp_dst.y;
}
else if(x<cols && y<rows)
{
dstMat[dstidx] = convert_uchar_sat(convert_float(srcMat[srcidx])*alpha+beta);;
}
}
int x=get_global_id(0)<<2;
int y=get_global_id(1);
//int src_addr_start = mad24(y,srcStep_in_pixel,srcoffset_in_pixel);
//int src_addr_end = mad24(y,srcStep_in_pixel,cols+srcoffset_in_pixel);
int off_src = (dstoffset_in_pixel & 3);
int srcidx = mad24(y,srcStep_in_pixel,x+ srcoffset_in_pixel - off_src);
int dst_addr_start = mad24(y,dstStep_in_pixel,dstoffset_in_pixel);
int dst_addr_end = mad24(y,dstStep_in_pixel,cols+dstoffset_in_pixel);
int dstidx = mad24(y,dstStep_in_pixel,x+ dstoffset_in_pixel & (int)0xfffffffc);
if(x+3<cols && y<rows && off_src==0)
{
float4 temp_src = convert_float4(vload4(0,srcMat+srcidx));
*(__global uchar4*)(dstMat+dstidx) = convert_uchar4_sat(temp_src*(F4)alpha+(F4)beta);
}
else
{
if(x+3<cols && y<rows)
{
float4 temp_src = convert_float4(vload4(0,srcMat+srcidx));
uchar4 temp_dst = convert_uchar4_sat(temp_src*(F4)alpha+(F4)beta);
dstMat[dstidx] = temp_dst.x;
dstMat[dstidx+1] = temp_dst.y;
dstMat[dstidx+2] = temp_dst.z;
dstMat[dstidx+3] = temp_dst.w;
}
else if(x+2<cols && y<rows)
{
float4 temp_src = convert_float4(vload4(0,srcMat+srcidx));
uchar4 temp_dst = convert_uchar4_sat(temp_src*(F4)alpha+(F4)beta);
dstMat[dstidx] = temp_dst.x;
dstMat[dstidx+1] = temp_dst.y;
dstMat[dstidx+2] = temp_dst.z;
}
else if(x+1<cols && y<rows)
{
float2 temp_src = convert_float2(vload2(0,srcMat+srcidx));
uchar2 temp_dst = convert_uchar2_sat(temp_src*(F2)alpha+(F2)beta);
dstMat[dstidx] = temp_dst.x;
dstMat[dstidx+1] = temp_dst.y;
}
else if(x<cols && y<rows)
{
dstMat[dstidx] = convert_uchar_sat(convert_float(srcMat[srcidx])*alpha+beta);;
}
}
}
__kernel void convert_to_S4_C4_D0(
__global const int4* restrict srcMat,
__global uchar4* dstMat,
int cols,
int rows,
int srcStep_in_pixel,
int srcoffset_in_pixel,
int dstStep_in_pixel,
int dstoffset_in_pixel,
F alpha,
F beta)
__global const int4* restrict srcMat,
__global uchar4* dstMat,
int cols,
int rows,
int srcStep_in_pixel,
int srcoffset_in_pixel,
int dstStep_in_pixel,
int dstoffset_in_pixel,
F alpha,
F beta)
{
int x=get_global_id(0);
int y=get_global_id(1);
int srcidx = mad24(y,srcStep_in_pixel,x+ srcoffset_in_pixel);
int dstidx = mad24(y,dstStep_in_pixel,x+ dstoffset_in_pixel);
if ( (x < cols) & (y < rows) )
{
float4 temp_src = convert_float4(srcMat[srcidx]);
dstMat[dstidx] = convert_uchar4_sat(temp_src*alpha+beta);
}
int x=get_global_id(0);
int y=get_global_id(1);
int srcidx = mad24(y,srcStep_in_pixel,x+ srcoffset_in_pixel);
int dstidx = mad24(y,dstStep_in_pixel,x+ dstoffset_in_pixel);
if ( (x < cols) & (y < rows) )
{
float4 temp_src = convert_float4(srcMat[srcidx]);
dstMat[dstidx] = convert_uchar4_sat(temp_src*alpha+beta);
}
}
__kernel void convert_to_S5_C1_D0(
__global const float* restrict srcMat,
__global uchar* dstMat,
int cols,
int rows,
int srcStep_in_pixel,
int srcoffset_in_pixel,
int dstStep_in_pixel,
int dstoffset_in_pixel,
F alpha,
F beta)
__global const float* restrict srcMat,
__global uchar* dstMat,
int cols,
int rows,
int srcStep_in_pixel,
int srcoffset_in_pixel,
int dstStep_in_pixel,
int dstoffset_in_pixel,
F alpha,
F beta)
{
int x=get_global_id(0)<<2;
int y=get_global_id(1);
//int src_addr_start = mad24(y,srcStep_in_pixel,srcoffset_in_pixel);
//int src_addr_end = mad24(y,srcStep_in_pixel,cols+srcoffset_in_pixel);
int off_src = (dstoffset_in_pixel & 3);
int srcidx = mad24(y,srcStep_in_pixel,x+ srcoffset_in_pixel - off_src);
int dst_addr_start = mad24(y,dstStep_in_pixel,dstoffset_in_pixel);
int dst_addr_end = mad24(y,dstStep_in_pixel,cols+dstoffset_in_pixel);
int dstidx = mad24(y,dstStep_in_pixel,x+ dstoffset_in_pixel & (int)0xfffffffc);
if(x+3<cols && y<rows && off_src==0)
{
float4 temp_src = vload4(0,srcMat+srcidx);
*(__global uchar4*)(dstMat+dstidx) = convert_uchar4_sat(temp_src*(F4)alpha+(F4)beta);
}
else
{
if(x+3<cols && y<rows)
{
float4 temp_src = vload4(0,srcMat+srcidx);
uchar4 temp_dst = convert_uchar4_sat(temp_src*(F4)alpha+(F4)beta);
dstMat[dstidx] = temp_dst.x;
dstMat[dstidx+1] = temp_dst.y;
dstMat[dstidx+2] = temp_dst.z;
dstMat[dstidx+3] = temp_dst.w;
}
else if(x+2<cols && y<rows)
{
float4 temp_src = vload4(0,srcMat+srcidx);
uchar4 temp_dst = convert_uchar4_sat(temp_src*(F4)alpha+(F4)beta);
dstMat[dstidx] = temp_dst.x;
dstMat[dstidx+1] = temp_dst.y;
dstMat[dstidx+2] = temp_dst.z;
}
else if(x+1<cols && y<rows)
{
float2 temp_src = vload2(0,srcMat+srcidx);
uchar2 temp_dst = convert_uchar2_sat(temp_src*(F2)alpha+(F2)beta);
dstMat[dstidx] = temp_dst.x;
dstMat[dstidx+1] = temp_dst.y;
}
else if(x<cols && y<rows)
{
dstMat[dstidx] = convert_uchar_sat(srcMat[srcidx]*alpha+beta);;
}
}
int x=get_global_id(0)<<2;
int y=get_global_id(1);
//int src_addr_start = mad24(y,srcStep_in_pixel,srcoffset_in_pixel);
//int src_addr_end = mad24(y,srcStep_in_pixel,cols+srcoffset_in_pixel);
int off_src = (dstoffset_in_pixel & 3);
int srcidx = mad24(y,srcStep_in_pixel,x+ srcoffset_in_pixel - off_src);
int dst_addr_start = mad24(y,dstStep_in_pixel,dstoffset_in_pixel);
int dst_addr_end = mad24(y,dstStep_in_pixel,cols+dstoffset_in_pixel);
int dstidx = mad24(y,dstStep_in_pixel,x+ dstoffset_in_pixel & (int)0xfffffffc);
if(x+3<cols && y<rows && off_src==0)
{
float4 temp_src = vload4(0,srcMat+srcidx);
*(__global uchar4*)(dstMat+dstidx) = convert_uchar4_sat(temp_src*(F4)alpha+(F4)beta);
}
else
{
if(x+3<cols && y<rows)
{
float4 temp_src = vload4(0,srcMat+srcidx);
uchar4 temp_dst = convert_uchar4_sat(temp_src*(F4)alpha+(F4)beta);
dstMat[dstidx] = temp_dst.x;
dstMat[dstidx+1] = temp_dst.y;
dstMat[dstidx+2] = temp_dst.z;
dstMat[dstidx+3] = temp_dst.w;
}
else if(x+2<cols && y<rows)
{
float4 temp_src = vload4(0,srcMat+srcidx);
uchar4 temp_dst = convert_uchar4_sat(temp_src*(F4)alpha+(F4)beta);
dstMat[dstidx] = temp_dst.x;
dstMat[dstidx+1] = temp_dst.y;
dstMat[dstidx+2] = temp_dst.z;
}
else if(x+1<cols && y<rows)
{
float2 temp_src = vload2(0,srcMat+srcidx);
uchar2 temp_dst = convert_uchar2_sat(temp_src*(F2)alpha+(F2)beta);
dstMat[dstidx] = temp_dst.x;
dstMat[dstidx+1] = temp_dst.y;
}
else if(x<cols && y<rows)
{
dstMat[dstidx] = convert_uchar_sat(srcMat[srcidx]*alpha+beta);;
}
}
}
__kernel void convert_to_S5_C4_D0(
__global const float4* restrict srcMat,
__global uchar4* dstMat,
int cols,
int rows,
int srcStep_in_pixel,
int srcoffset_in_pixel,
int dstStep_in_pixel,
int dstoffset_in_pixel,
F alpha,
F beta)
__global const float4* restrict srcMat,
__global uchar4* dstMat,
int cols,
int rows,
int srcStep_in_pixel,
int srcoffset_in_pixel,
int dstStep_in_pixel,
int dstoffset_in_pixel,
F alpha,
F beta)
{
int x=get_global_id(0);
int y=get_global_id(1);
int srcidx = mad24(y,srcStep_in_pixel,x+ srcoffset_in_pixel);
int dstidx = mad24(y,dstStep_in_pixel,x+ dstoffset_in_pixel);
if ( (x < cols) & (y < rows) )
{
float4 temp_src = srcMat[srcidx];
dstMat[dstidx] = convert_uchar4_sat(temp_src*alpha+beta);
}
int x=get_global_id(0);
int y=get_global_id(1);
int srcidx = mad24(y,srcStep_in_pixel,x+ srcoffset_in_pixel);
int dstidx = mad24(y,dstStep_in_pixel,x+ dstoffset_in_pixel);
if ( (x < cols) & (y < rows) )
{
float4 temp_src = srcMat[srcidx];
dstMat[dstidx] = convert_uchar4_sat(temp_src*alpha+beta);
}
}
__kernel void convert_to_S0_C1_D4(
__global const uchar* restrict srcMat,
__global int* dstMat,
int cols,
int rows,
int srcStep_in_pixel,
int srcoffset_in_pixel,
int dstStep_in_pixel,
int dstoffset_in_pixel,
F alpha,
F beta)
__global const uchar* restrict srcMat,
__global int* dstMat,
int cols,
int rows,
int srcStep_in_pixel,
int srcoffset_in_pixel,
int dstStep_in_pixel,
int dstoffset_in_pixel,
F alpha,
F beta)
{
int x=get_global_id(0);
int y=get_global_id(1);
int srcidx = mad24(y,srcStep_in_pixel,x+ srcoffset_in_pixel);
int dstidx = mad24(y,dstStep_in_pixel,x+ dstoffset_in_pixel);
if ( (x < cols) & (y < rows) )
{
float temp_src = convert_float(srcMat[srcidx]);
dstMat[dstidx] = convert_int_sat(temp_src*alpha+beta);
}
int x=get_global_id(0);
int y=get_global_id(1);
int srcidx = mad24(y,srcStep_in_pixel,x+ srcoffset_in_pixel);
int dstidx = mad24(y,dstStep_in_pixel,x+ dstoffset_in_pixel);
if ( (x < cols) & (y < rows) )
{
float temp_src = convert_float(srcMat[srcidx]);
dstMat[dstidx] = convert_int_sat(temp_src*alpha+beta);
}
}
__kernel void convert_to_S5_C1_D4(
__global const float* restrict srcMat,
__global int* dstMat,
int cols,
int rows,
int srcStep_in_pixel,
int srcoffset_in_pixel,
int dstStep_in_pixel,
int dstoffset_in_pixel,
F alpha,
F beta)
__global const float* restrict srcMat,
__global int* dstMat,
int cols,
int rows,
int srcStep_in_pixel,
int srcoffset_in_pixel,
int dstStep_in_pixel,
int dstoffset_in_pixel,
F alpha,
F beta)
{
int x=get_global_id(0);
int y=get_global_id(1);
int srcidx = mad24(y,srcStep_in_pixel,x+ srcoffset_in_pixel);
int dstidx = mad24(y,dstStep_in_pixel,x+ dstoffset_in_pixel);
if ( (x < cols) & (y < rows) )
{
float temp_src = srcMat[srcidx];
dstMat[dstidx] = convert_int_sat(temp_src*alpha+beta);
}
int x=get_global_id(0);
int y=get_global_id(1);
int srcidx = mad24(y,srcStep_in_pixel,x+ srcoffset_in_pixel);
int dstidx = mad24(y,dstStep_in_pixel,x+ dstoffset_in_pixel);
if ( (x < cols) & (y < rows) )
{
float temp_src = srcMat[srcidx];
dstMat[dstidx] = convert_int_sat(temp_src*alpha+beta);
}
}
__kernel void convert_to_S0_C4_D4(
__global const uchar4* restrict srcMat,
__global int4* dstMat,
int cols,
int rows,
int srcStep_in_pixel,
int srcoffset_in_pixel,
int dstStep_in_pixel,
int dstoffset_in_pixel,
F alpha,
F beta)
__global const uchar4* restrict srcMat,
__global int4* dstMat,
int cols,
int rows,
int srcStep_in_pixel,
int srcoffset_in_pixel,
int dstStep_in_pixel,
int dstoffset_in_pixel,
F alpha,
F beta)
{
int x=get_global_id(0);
int y=get_global_id(1);
int srcidx = mad24(y,srcStep_in_pixel,x+ srcoffset_in_pixel);
int dstidx = mad24(y,dstStep_in_pixel,x+ dstoffset_in_pixel);
if ( (x < cols) & (y < rows) )
{
float4 temp_src = convert_float4(srcMat[srcidx]);
dstMat[dstidx] = convert_int4_sat(temp_src*alpha+beta);
}
int x=get_global_id(0);
int y=get_global_id(1);
int srcidx = mad24(y,srcStep_in_pixel,x+ srcoffset_in_pixel);
int dstidx = mad24(y,dstStep_in_pixel,x+ dstoffset_in_pixel);
if ( (x < cols) & (y < rows) )
{
float4 temp_src = convert_float4(srcMat[srcidx]);
dstMat[dstidx] = convert_int4_sat(temp_src*alpha+beta);
}
}
__kernel void convert_to_S5_C4_D4(
__global const float4* restrict srcMat,
__global int4* dstMat,
int cols,
int rows,
int srcStep_in_pixel,
int srcoffset_in_pixel,
int dstStep_in_pixel,
int dstoffset_in_pixel,
F alpha,
F beta)
__global const float4* restrict srcMat,
__global int4* dstMat,
int cols,
int rows,
int srcStep_in_pixel,
int srcoffset_in_pixel,
int dstStep_in_pixel,
int dstoffset_in_pixel,
F alpha,
F beta)
{
int x=get_global_id(0);
int y=get_global_id(1);
int srcidx = mad24(y,srcStep_in_pixel,x+ srcoffset_in_pixel);
int dstidx = mad24(y,dstStep_in_pixel,x+ dstoffset_in_pixel);
if ( (x < cols) & (y < rows) )
{
float4 temp_src = srcMat[srcidx];
dstMat[dstidx] = convert_int4_sat(temp_src*alpha+beta);
}
int x=get_global_id(0);
int y=get_global_id(1);
int srcidx = mad24(y,srcStep_in_pixel,x+ srcoffset_in_pixel);
int dstidx = mad24(y,dstStep_in_pixel,x+ dstoffset_in_pixel);
if ( (x < cols) & (y < rows) )
{
float4 temp_src = srcMat[srcidx];
dstMat[dstidx] = convert_int4_sat(temp_src*alpha+beta);
}
}
__kernel void convert_to_S0_C1_D5(
__global const uchar* restrict srcMat,
__global float* dstMat,
int cols,
int rows,
int srcStep_in_pixel,
int srcoffset_in_pixel,
int dstStep_in_pixel,
int dstoffset_in_pixel,
F alpha,
F beta)
__global const uchar* restrict srcMat,
__global float* dstMat,
int cols,
int rows,
int srcStep_in_pixel,
int srcoffset_in_pixel,
int dstStep_in_pixel,
int dstoffset_in_pixel,
F alpha,
F beta)
{
int x=get_global_id(0);
int y=get_global_id(1);
int srcidx = mad24(y,srcStep_in_pixel,x+ srcoffset_in_pixel);
int dstidx = mad24(y,dstStep_in_pixel,x+ dstoffset_in_pixel);
if ( (x < cols) & (y < rows) )
{
float temp_src = convert_float(srcMat[srcidx]);
dstMat[dstidx] = temp_src*alpha+beta;
}
int x=get_global_id(0);
int y=get_global_id(1);
int srcidx = mad24(y,srcStep_in_pixel,x+ srcoffset_in_pixel);
int dstidx = mad24(y,dstStep_in_pixel,x+ dstoffset_in_pixel);
if ( (x < cols) & (y < rows) )
{
float temp_src = convert_float(srcMat[srcidx]);
dstMat[dstidx] = temp_src*alpha+beta;
}
}
__kernel void convert_to_S4_C1_D5(
__global const int* restrict srcMat,
__global float* dstMat,
int cols,
int rows,
int srcStep_in_pixel,
int srcoffset_in_pixel,
int dstStep_in_pixel,
int dstoffset_in_pixel,
F alpha,
F beta)
__global const int* restrict srcMat,
__global float* dstMat,
int cols,
int rows,
int srcStep_in_pixel,
int srcoffset_in_pixel,
int dstStep_in_pixel,
int dstoffset_in_pixel,
F alpha,
F beta)
{
int x=get_global_id(0);
int y=get_global_id(1);
int srcidx = mad24(y,srcStep_in_pixel,x+ srcoffset_in_pixel);
int dstidx = mad24(y,dstStep_in_pixel,x+ dstoffset_in_pixel);
if ( (x < cols) & (y < rows) )
{
float temp_src = convert_float(srcMat[srcidx]);
dstMat[dstidx] = temp_src*alpha+beta;
}
int x=get_global_id(0);
int y=get_global_id(1);
int srcidx = mad24(y,srcStep_in_pixel,x+ srcoffset_in_pixel);
int dstidx = mad24(y,dstStep_in_pixel,x+ dstoffset_in_pixel);
if ( (x < cols) & (y < rows) )
{
float temp_src = convert_float(srcMat[srcidx]);
dstMat[dstidx] = temp_src*alpha+beta;
}
}
__kernel void convert_to_S0_C4_D5(
__global const uchar4* restrict srcMat,
__global float4* dstMat,
int cols,
int rows,
int srcStep_in_pixel,
int srcoffset_in_pixel,
int dstStep_in_pixel,
int dstoffset_in_pixel,
F alpha,
F beta)
__global const uchar4* restrict srcMat,
__global float4* dstMat,
int cols,
int rows,
int srcStep_in_pixel,
int srcoffset_in_pixel,
int dstStep_in_pixel,
int dstoffset_in_pixel,
F alpha,
F beta)
{
int x=get_global_id(0);
int y=get_global_id(1);
int srcidx = mad24(y,srcStep_in_pixel,x+ srcoffset_in_pixel);
int dstidx = mad24(y,dstStep_in_pixel,x+ dstoffset_in_pixel);
if ( (x < cols) & (y < rows) )
{
float4 temp_src = convert_float4(srcMat[srcidx]);
dstMat[dstidx] = temp_src*alpha+beta;
}
int x=get_global_id(0);
int y=get_global_id(1);
int srcidx = mad24(y,srcStep_in_pixel,x+ srcoffset_in_pixel);
int dstidx = mad24(y,dstStep_in_pixel,x+ dstoffset_in_pixel);
if ( (x < cols) & (y < rows) )
{
float4 temp_src = convert_float4(srcMat[srcidx]);
dstMat[dstidx] = temp_src*alpha+beta;
}
}
__kernel void convert_to_S4_C4_D5(
__global const int4* restrict srcMat,
__global float4* dstMat,
int cols,
int rows,
int srcStep_in_pixel,
int srcoffset_in_pixel,
int dstStep_in_pixel,
int dstoffset_in_pixel,
F alpha,
F beta)
__global const int4* restrict srcMat,
__global float4* dstMat,
int cols,
int rows,
int srcStep_in_pixel,
int srcoffset_in_pixel,
int dstStep_in_pixel,
int dstoffset_in_pixel,
F alpha,
F beta)
{
int x=get_global_id(0);
int y=get_global_id(1);
int srcidx = mad24(y,srcStep_in_pixel,x+ srcoffset_in_pixel);
int dstidx = mad24(y,dstStep_in_pixel,x+ dstoffset_in_pixel);
if ( (x < cols) & (y < rows) )
{
float4 temp_src = convert_float4(srcMat[srcidx]);
dstMat[dstidx] = temp_src*alpha+beta;
}
int x=get_global_id(0);
int y=get_global_id(1);
int srcidx = mad24(y,srcStep_in_pixel,x+ srcoffset_in_pixel);
int dstidx = mad24(y,dstStep_in_pixel,x+ dstoffset_in_pixel);
if ( (x < cols) & (y < rows) )
{
float4 temp_src = convert_float4(srcMat[srcidx]);
dstMat[dstidx] = temp_src*alpha+beta;
}
}

View File

@@ -35,28 +35,28 @@
//
__kernel void copy_to_with_mask(
__global const GENTYPE* restrict srcMat,
__global GENTYPE* dstMat,
__global const uchar* restrict maskMat,
int cols,
int rows,
int srcStep_in_pixel,
int srcoffset_in_pixel,
int dstStep_in_pixel,
int dstoffset_in_pixel,
int maskStep,
int maskoffset)
__global const GENTYPE* restrict srcMat,
__global GENTYPE* dstMat,
__global const uchar* restrict maskMat,
int cols,
int rows,
int srcStep_in_pixel,
int srcoffset_in_pixel,
int dstStep_in_pixel,
int dstoffset_in_pixel,
int maskStep,
int maskoffset)
{
int x=get_global_id(0);
int y=get_global_id(1);
x = x< cols ? x: cols-1;
y = y< rows ? y: rows-1;
int srcidx = mad24(y,srcStep_in_pixel,x+ srcoffset_in_pixel);
int dstidx = mad24(y,dstStep_in_pixel,x+ dstoffset_in_pixel);
int maskidx = mad24(y,maskStep,x+ maskoffset);
uchar mask = maskMat[maskidx];
if (mask)
{
dstMat[dstidx] = srcMat[srcidx];
}
int x=get_global_id(0);
int y=get_global_id(1);
x = x< cols ? x: cols-1;
y = y< rows ? y: rows-1;
int srcidx = mad24(y,srcStep_in_pixel,x+ srcoffset_in_pixel);
int dstidx = mad24(y,dstStep_in_pixel,x+ dstoffset_in_pixel);
int maskidx = mad24(y,maskStep,x+ maskoffset);
uchar mask = maskMat[maskidx];
if (mask)
{
dstMat[dstidx] = srcMat[srcidx];
}
}

View File

@@ -38,53 +38,53 @@
__kernel void set_to_without_mask_C1_D0(uchar scalar,__global uchar * dstMat,
int cols,int rows,int dstStep_in_pixel,int offset_in_pixel)
{
int x=get_global_id(0)<<2;
int y=get_global_id(1);
//int addr_start = mad24(y,dstStep_in_pixel,offset_in_pixel);
//int addr_end = mad24(y,dstStep_in_pixel,cols+offset_in_pixel);
int idx = mad24(y,dstStep_in_pixel,x+ offset_in_pixel);
uchar4 out;
out.x = out.y = out.z = out.w = scalar;
if ( (x+3 < cols) && (y < rows)&& ((offset_in_pixel&3) == 0))
{
*(__global uchar4*)(dstMat+idx) = out;
}
else
{
if((x+3 < cols) && (y < rows))
{
dstMat[idx] = out.x;
dstMat[idx+1] = out.y;
dstMat[idx+2] = out.z;
dstMat[idx+3] = out.w;
}
if((x+2 < cols) && (y < rows))
{
dstMat[idx] = out.x;
dstMat[idx+1] = out.y;
dstMat[idx+2] = out.z;
}
else if((x+1 < cols) && (y < rows))
{
dstMat[idx] = out.x;
dstMat[idx+1] = out.y;
}
else if((x < cols) && (y < rows))
{
dstMat[idx] = out.x;
}
}
int x=get_global_id(0)<<2;
int y=get_global_id(1);
//int addr_start = mad24(y,dstStep_in_pixel,offset_in_pixel);
//int addr_end = mad24(y,dstStep_in_pixel,cols+offset_in_pixel);
int idx = mad24(y,dstStep_in_pixel,x+ offset_in_pixel);
uchar4 out;
out.x = out.y = out.z = out.w = scalar;
if ( (x+3 < cols) && (y < rows)&& ((offset_in_pixel&3) == 0))
{
*(__global uchar4*)(dstMat+idx) = out;
}
else
{
if((x+3 < cols) && (y < rows))
{
dstMat[idx] = out.x;
dstMat[idx+1] = out.y;
dstMat[idx+2] = out.z;
dstMat[idx+3] = out.w;
}
if((x+2 < cols) && (y < rows))
{
dstMat[idx] = out.x;
dstMat[idx+1] = out.y;
dstMat[idx+2] = out.z;
}
else if((x+1 < cols) && (y < rows))
{
dstMat[idx] = out.x;
dstMat[idx+1] = out.y;
}
else if((x < cols) && (y < rows))
{
dstMat[idx] = out.x;
}
}
}
__kernel void set_to_without_mask(GENTYPE scalar,__global GENTYPE * dstMat,
int cols,int rows,int dstStep_in_pixel,int offset_in_pixel)
{
int x=get_global_id(0);
int y=get_global_id(1);
if ( (x < cols) & (y < rows))
{
int idx = mad24(y,dstStep_in_pixel,x+ offset_in_pixel);
dstMat[idx] = scalar;
}
int x=get_global_id(0);
int y=get_global_id(1);
if ( (x < cols) & (y < rows))
{
int idx = mad24(y,dstStep_in_pixel,x+ offset_in_pixel);
dstMat[idx] = scalar;
}
}

View File

@@ -34,27 +34,27 @@
//
//
__kernel void set_to_with_mask(
GENTYPE scalar,
__global GENTYPE * dstMat,
int cols,
int rows,
int dstStep_in_pixel,
int dstoffset_in_pixel,
GENTYPE scalar,
__global GENTYPE * dstMat,
int cols,
int rows,
int dstStep_in_pixel,
int dstoffset_in_pixel,
__global const uchar * restrict maskMat,
int maskStep,
int maskoffset)
int maskStep,
int maskoffset)
{
int x=get_global_id(0);
int y=get_global_id(1);
x = x< cols ? x: cols-1;
y = y< rows ? y: rows-1;
int dstidx = mad24(y,dstStep_in_pixel,x+ dstoffset_in_pixel);
int maskidx = mad24(y,maskStep,x+ maskoffset);
uchar mask = maskMat[maskidx];
if (mask)
{
dstMat[dstidx] = scalar;
}
int x=get_global_id(0);
int y=get_global_id(1);
x = x< cols ? x: cols-1;
y = y< rows ? y: rows-1;
int dstidx = mad24(y,dstStep_in_pixel,x+ dstoffset_in_pixel);
int maskidx = mad24(y,maskStep,x+ maskoffset);
uchar mask = maskMat[maskidx];
if (mask)
{
dstMat[dstidx] = scalar;
}
}

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -75,7 +75,7 @@ __kernel void calcSharrDeriv_vertical_C1_D0(__global const uchar* src, int srcSt
const uchar src_val0 = (src + (y > 0 ? y-1 : rows > 1 ? 1 : 0) * srcStep)[x];
const uchar src_val1 = (src + y * srcStep)[x];
const uchar src_val2 = (src + (y < rows-1 ? y+1 : rows > 1 ? rows-2 : 0) * srcStep)[x];
((__global short*)((__global char*)dx_buf + y * dx_bufStep / 2))[x] = (src_val0 + src_val2) * 3 + src_val1 * 10;
((__global short*)((__global char*)dy_buf + y * dy_bufStep / 2))[x] = src_val2 - src_val0;
}
@@ -91,7 +91,7 @@ __kernel void calcSharrDeriv_vertical_C4_D0(__global const uchar* src, int srcSt
const uchar src_val0 = (src + (y > 0 ? y - 1 : 1) * srcStep)[x];
const uchar src_val1 = (src + y * srcStep)[x];
const uchar src_val2 = (src + (y < rows - 1 ? y + 1 : rows - 2) * srcStep)[x];
((__global short*)((__global char*)dx_buf + y * dx_bufStep / 2))[x] = (src_val0 + src_val2) * 3 + src_val1 * 10;
((__global short*)((__global char*)dy_buf + y * dy_bufStep / 2))[x] = src_val2 - src_val0;
}
@@ -209,20 +209,20 @@ void reduce3(float val1, float val2, float val3, __local float* smem1, __local f
smem3[tid] = val3;
barrier(CLK_LOCAL_MEM_FENCE);
if (tid < 128)
{
smem1[tid] = val1 += smem1[tid + 128];
smem2[tid] = val2 += smem2[tid + 128];
smem3[tid] = val3 += smem3[tid + 128];
}
if (tid < 128)
{
smem1[tid] = val1 += smem1[tid + 128];
smem2[tid] = val2 += smem2[tid + 128];
smem3[tid] = val3 += smem3[tid + 128];
}
barrier(CLK_LOCAL_MEM_FENCE);
if (tid < 64)
{
smem1[tid] = val1 += smem1[tid + 64];
smem2[tid] = val2 += smem2[tid + 64];
if (tid < 64)
{
smem1[tid] = val1 += smem1[tid + 64];
smem2[tid] = val2 += smem2[tid + 64];
smem3[tid] = val3 += smem3[tid + 64];
}
}
barrier(CLK_LOCAL_MEM_FENCE);
if (tid < 32)
@@ -231,28 +231,28 @@ void reduce3(float val1, float val2, float val3, __local float* smem1, __local f
volatile __local float* vmem2 = smem2;
volatile __local float* vmem3 = smem3;
vmem1[tid] = val1 += vmem1[tid + 32];
vmem2[tid] = val2 += vmem2[tid + 32];
vmem1[tid] = val1 += vmem1[tid + 32];
vmem2[tid] = val2 += vmem2[tid + 32];
vmem3[tid] = val3 += vmem3[tid + 32];
vmem1[tid] = val1 += vmem1[tid + 16];
vmem2[tid] = val2 += vmem2[tid + 16];
vmem1[tid] = val1 += vmem1[tid + 16];
vmem2[tid] = val2 += vmem2[tid + 16];
vmem3[tid] = val3 += vmem3[tid + 16];
vmem1[tid] = val1 += vmem1[tid + 8];
vmem2[tid] = val2 += vmem2[tid + 8];
vmem1[tid] = val1 += vmem1[tid + 8];
vmem2[tid] = val2 += vmem2[tid + 8];
vmem3[tid] = val3 += vmem3[tid + 8];
vmem1[tid] = val1 += vmem1[tid + 4];
vmem2[tid] = val2 += vmem2[tid + 4];
vmem1[tid] = val1 += vmem1[tid + 4];
vmem2[tid] = val2 += vmem2[tid + 4];
vmem3[tid] = val3 += vmem3[tid + 4];
vmem1[tid] = val1 += vmem1[tid + 2];
vmem2[tid] = val2 += vmem2[tid + 2];
vmem1[tid] = val1 += vmem1[tid + 2];
vmem2[tid] = val2 += vmem2[tid + 2];
vmem3[tid] = val3 += vmem3[tid + 2];
vmem1[tid] = val1 += vmem1[tid + 1];
vmem2[tid] = val2 += vmem2[tid + 1];
vmem1[tid] = val1 += vmem1[tid + 1];
vmem2[tid] = val2 += vmem2[tid + 1];
vmem3[tid] = val3 += vmem3[tid + 1];
}
}
@@ -263,18 +263,18 @@ void reduce2(float val1, float val2, __local float* smem1, __local float* smem2,
smem2[tid] = val2;
barrier(CLK_LOCAL_MEM_FENCE);
if (tid < 128)
{
smem1[tid] = val1 += smem1[tid + 128];
smem2[tid] = val2 += smem2[tid + 128];
}
if (tid < 128)
{
smem1[tid] = val1 += smem1[tid + 128];
smem2[tid] = val2 += smem2[tid + 128];
}
barrier(CLK_LOCAL_MEM_FENCE);
if (tid < 64)
{
smem1[tid] = val1 += smem1[tid + 64];
smem2[tid] = val2 += smem2[tid + 64];
}
if (tid < 64)
{
smem1[tid] = val1 += smem1[tid + 64];
smem2[tid] = val2 += smem2[tid + 64];
}
barrier(CLK_LOCAL_MEM_FENCE);
if (tid < 32)
@@ -282,23 +282,23 @@ void reduce2(float val1, float val2, __local float* smem1, __local float* smem2,
volatile __local float* vmem1 = smem1;
volatile __local float* vmem2 = smem2;
vmem1[tid] = val1 += vmem1[tid + 32];
vmem2[tid] = val2 += vmem2[tid + 32];
vmem1[tid] = val1 += vmem1[tid + 32];
vmem2[tid] = val2 += vmem2[tid + 32];
vmem1[tid] = val1 += vmem1[tid + 16];
vmem2[tid] = val2 += vmem2[tid + 16];
vmem1[tid] = val1 += vmem1[tid + 16];
vmem2[tid] = val2 += vmem2[tid + 16];
vmem1[tid] = val1 += vmem1[tid + 8];
vmem2[tid] = val2 += vmem2[tid + 8];
vmem1[tid] = val1 += vmem1[tid + 8];
vmem2[tid] = val2 += vmem2[tid + 8];
vmem1[tid] = val1 += vmem1[tid + 4];
vmem2[tid] = val2 += vmem2[tid + 4];
vmem1[tid] = val1 += vmem1[tid + 4];
vmem2[tid] = val2 += vmem2[tid + 4];
vmem1[tid] = val1 += vmem1[tid + 2];
vmem2[tid] = val2 += vmem2[tid + 2];
vmem1[tid] = val1 += vmem1[tid + 2];
vmem2[tid] = val2 += vmem2[tid + 2];
vmem1[tid] = val1 += vmem1[tid + 1];
vmem2[tid] = val2 += vmem2[tid + 1];
vmem1[tid] = val1 += vmem1[tid + 1];
vmem2[tid] = val2 += vmem2[tid + 1];
}
}
@@ -307,28 +307,28 @@ void reduce1(float val1, __local float* smem1, int tid)
smem1[tid] = val1;
barrier(CLK_LOCAL_MEM_FENCE);
if (tid < 128)
{
smem1[tid] = val1 += smem1[tid + 128];
}
if (tid < 128)
{
smem1[tid] = val1 += smem1[tid + 128];
}
barrier(CLK_LOCAL_MEM_FENCE);
if (tid < 64)
{
smem1[tid] = val1 += smem1[tid + 64];
}
if (tid < 64)
{
smem1[tid] = val1 += smem1[tid + 64];
}
barrier(CLK_LOCAL_MEM_FENCE);
if (tid < 32)
{
volatile __local float* vmem1 = smem1;
vmem1[tid] = val1 += vmem1[tid + 32];
vmem1[tid] = val1 += vmem1[tid + 16];
vmem1[tid] = val1 += vmem1[tid + 8];
vmem1[tid] = val1 += vmem1[tid + 32];
vmem1[tid] = val1 += vmem1[tid + 16];
vmem1[tid] = val1 += vmem1[tid + 8];
vmem1[tid] = val1 += vmem1[tid + 4];
vmem1[tid] = val1 += vmem1[tid + 2];
vmem1[tid] = val1 += vmem1[tid + 1];
vmem1[tid] = val1 += vmem1[tid + 2];
vmem1[tid] = val1 += vmem1[tid + 1];
}
}
@@ -344,8 +344,8 @@ __kernel void lkSparse_C1_D5(image2d_t I, image2d_t J,
__local float smem2[256];
__local float smem3[256];
int c_halfWin_x = (c_winSize_x - 1) / 2;
int c_halfWin_y = (c_winSize_y - 1) / 2;
int c_halfWin_x = (c_winSize_x - 1) / 2;
int c_halfWin_y = (c_winSize_y - 1) / 2;
const int tid = get_local_id(1) * get_local_size(0) + get_local_id(0);
@@ -359,18 +359,18 @@ __kernel void lkSparse_C1_D5(image2d_t I, image2d_t J,
{
status[get_group_id(0)] = 0;
//if (calcErr)
//if (calcErr)
// err[get_group_id(0)] = 0;
}
return;
}
prevPt.x -= c_halfWin_x;
prevPt.y -= c_halfWin_y;
// extract the patch from the first image, compute covariation matrix of derivatives
float A11 = 0;
float A12 = 0;
float A22 = 0;
@@ -380,14 +380,14 @@ __kernel void lkSparse_C1_D5(image2d_t I, image2d_t J,
float dIdy_patch[21][21];
for (int yBase = get_local_id(1), i = 0; yBase < c_winSize_y; yBase += get_local_size(1), ++i)
{
{
for (int xBase = get_local_id(0), j = 0; xBase < c_winSize_x; xBase += get_local_size(0), ++j)
{
float x = (prevPt.x + xBase + 0.5f);
float y = (prevPt.y + yBase + 0.5f);
I_patch[i][j] = read_imagef(I, sampler, (float2)(x, y)).x;
float dIdx = 3.0f * read_imagef(I, sampler, (float2)(x + 1, y - 1)).x + 10.0f * read_imagef(I, sampler, (float2)(x + 1, y)).x + 3.0f * read_imagef(I, sampler, (float2)(x + 1, y + 1)).x -
(3.0f * read_imagef(I, sampler, (float2)(x - 1, y - 1)).x + 10.0f * read_imagef(I, sampler, (float2)(x - 1, y)).x + 3.0f * read_imagef(I, sampler, (float2)(x - 1, y + 1)).x);
@@ -396,7 +396,7 @@ __kernel void lkSparse_C1_D5(image2d_t I, image2d_t J,
dIdx_patch[i][j] = dIdx;
dIdy_patch[i][j] = dIdy;
A11 += dIdx * dIdx;
A12 += dIdx * dIdy;
A22 += dIdy * dIdy;
@@ -409,10 +409,10 @@ __kernel void lkSparse_C1_D5(image2d_t I, image2d_t J,
A11 = smem1[0];
A12 = smem2[0];
A22 = smem3[0];
float D = A11 * A22 - A12 * A12;
//if (calcErr && GET_MIN_EIGENVALS && tid == 0)
//if (calcErr && GET_MIN_EIGENVALS && tid == 0)
// err[get_group_id(0)] = minEig;
if (D < 1.192092896e-07f)
@@ -431,8 +431,8 @@ __kernel void lkSparse_C1_D5(image2d_t I, image2d_t J,
float2 nextPt = nextPts[get_group_id(0)];
nextPt.x *= 2.0f;
nextPt.y *= 2.0f;
nextPt.y *= 2.0f;
nextPt.x -= c_halfWin_x;
nextPt.y -= c_halfWin_y;
@@ -447,14 +447,14 @@ __kernel void lkSparse_C1_D5(image2d_t I, image2d_t J,
float b1 = 0;
float b2 = 0;
for (int y = get_local_id(1), i = 0; y < c_winSize_y; y += get_local_size(1), ++i)
{
for (int x = get_local_id(0), j = 0; x < c_winSize_x; x += get_local_size(0), ++j)
{
float a = (nextPt.x + x + 0.5f);
float b = (nextPt.y + y + 0.5f);
float a = (nextPt.x + x + 0.5f);
float b = (nextPt.y + y + 0.5f);
float I_val = I_patch[i][j];
float J_val = read_imagef(J, sampler, (float2)(a, b)).x;
@@ -464,7 +464,7 @@ __kernel void lkSparse_C1_D5(image2d_t I, image2d_t J,
b2 += diff * dIdy_patch[i][j];
}
}
reduce2(b1, b2, smem1, smem2, tid);
barrier(CLK_LOCAL_MEM_FENCE);
@@ -474,7 +474,7 @@ __kernel void lkSparse_C1_D5(image2d_t I, image2d_t J,
float2 delta;
delta.x = A12 * b2 - A22 * b1;
delta.y = A12 * b1 - A11 * b2;
nextPt.x += delta.x;
nextPt.y += delta.y;
@@ -489,9 +489,9 @@ __kernel void lkSparse_C1_D5(image2d_t I, image2d_t J,
{
for (int x = get_local_id(0), j = 0; x < c_winSize_x; x += get_local_size(0), ++j)
{
float a = (nextPt.x + x + 0.5f);
float b = (nextPt.y + y + 0.5f);
float a = (nextPt.x + x + 0.5f);
float b = (nextPt.y + y + 0.5f);
float I_val = I_patch[i][j];
float J_val = read_imagef(J, sampler, (float2)(a, b)).x;
@@ -522,8 +522,8 @@ __kernel void lkSparse_C4_D5(image2d_t I, image2d_t J,
__local float smem2[256];
__local float smem3[256];
int c_halfWin_x = (c_winSize_x - 1) / 2;
int c_halfWin_y = (c_winSize_y - 1) / 2;
int c_halfWin_x = (c_winSize_x - 1) / 2;
int c_halfWin_y = (c_winSize_y - 1) / 2;
const int tid = get_local_id(1) * get_local_size(0) + get_local_id(0);
@@ -537,18 +537,18 @@ __kernel void lkSparse_C4_D5(image2d_t I, image2d_t J,
{
status[get_group_id(0)] = 0;
//if (calcErr)
//if (calcErr)
// err[get_group_id(0)] = 0;
}
return;
}
prevPt.x -= c_halfWin_x;
prevPt.y -= c_halfWin_y;
// extract the patch from the first image, compute covariation matrix of derivatives
float A11 = 0;
float A12 = 0;
float A22 = 0;
@@ -558,14 +558,14 @@ __kernel void lkSparse_C4_D5(image2d_t I, image2d_t J,
float4 dIdy_patch[21][21];
for (int yBase = get_local_id(1), i = 0; yBase < c_winSize_y; yBase += get_local_size(1), ++i)
{
{
for (int xBase = get_local_id(0), j = 0; xBase < c_winSize_x; xBase += get_local_size(0), ++j)
{
float x = (prevPt.x + xBase + 0.5f);
float y = (prevPt.y + yBase + 0.5f);
I_patch[i][j] = read_imagef(I, sampler, (float2)(x, y)).x;
float4 dIdx = 3.0f * read_imagef(I, sampler, (float2)(x + 1, y - 1)).x + 10.0f * read_imagef(I, sampler, (float2)(x + 1, y)).x + 3.0f * read_imagef(I, sampler, (float2)(x + 1, y + 1)).x -
(3.0f * read_imagef(I, sampler, (float2)(x - 1, y - 1)).x + 10.0f * read_imagef(I, sampler, (float2)(x - 1, y)).x + 3.0f * read_imagef(I, sampler, (float2)(x - 1, y + 1)).x);
@@ -574,7 +574,7 @@ __kernel void lkSparse_C4_D5(image2d_t I, image2d_t J,
dIdx_patch[i][j] = dIdx;
dIdy_patch[i][j] = dIdy;
A11 += (dIdx * dIdx).x + (dIdx * dIdx).y + (dIdx * dIdx).z;
A12 += (dIdx * dIdy).x + (dIdx * dIdy).y + (dIdx * dIdy).z;
A22 += (dIdy * dIdy).x + (dIdy * dIdy).y + (dIdy * dIdy).z;
@@ -587,10 +587,10 @@ __kernel void lkSparse_C4_D5(image2d_t I, image2d_t J,
A11 = smem1[0];
A12 = smem2[0];
A22 = smem3[0];
float D = A11 * A22 - A12 * A12;
//if (calcErr && GET_MIN_EIGENVALS && tid == 0)
//if (calcErr && GET_MIN_EIGENVALS && tid == 0)
// err[get_group_id(0)] = minEig;
if (D < 1.192092896e-07f)
@@ -609,8 +609,8 @@ __kernel void lkSparse_C4_D5(image2d_t I, image2d_t J,
float2 nextPt = nextPts[get_group_id(0)];
nextPt.x *= 2.0f;
nextPt.y *= 2.0f;
nextPt.y *= 2.0f;
nextPt.x -= c_halfWin_x;
nextPt.y -= c_halfWin_y;
@@ -625,14 +625,14 @@ __kernel void lkSparse_C4_D5(image2d_t I, image2d_t J,
float b1 = 0;
float b2 = 0;
for (int y = get_local_id(1), i = 0; y < c_winSize_y; y += get_local_size(1), ++i)
{
for (int x = get_local_id(0), j = 0; x < c_winSize_x; x += get_local_size(0), ++j)
{
float a = (nextPt.x + x + 0.5f);
float b = (nextPt.y + y + 0.5f);
float a = (nextPt.x + x + 0.5f);
float b = (nextPt.y + y + 0.5f);
float4 I_val = I_patch[i][j];
float4 J_val = read_imagef(J, sampler, (float2)(a, b)).x;
@@ -642,7 +642,7 @@ __kernel void lkSparse_C4_D5(image2d_t I, image2d_t J,
b2 += (diff * dIdy_patch[i][j]).x + (diff * dIdy_patch[i][j]).y + (diff * dIdy_patch[i][j]).z;
}
}
reduce2(b1, b2, smem1, smem2, tid);
barrier(CLK_LOCAL_MEM_FENCE);
@@ -652,7 +652,7 @@ __kernel void lkSparse_C4_D5(image2d_t I, image2d_t J,
float2 delta;
delta.x = A12 * b2 - A22 * b1;
delta.y = A12 * b1 - A11 * b2;
nextPt.x += delta.x;
nextPt.y += delta.y;
@@ -667,9 +667,9 @@ __kernel void lkSparse_C4_D5(image2d_t I, image2d_t J,
{
for (int x = get_local_id(0), j = 0; x < c_winSize_x; x += get_local_size(0), ++j)
{
float a = (nextPt.x + x + 0.5f);
float b = (nextPt.y + y + 0.5f);
float a = (nextPt.x + x + 0.5f);
float b = (nextPt.y + y + 0.5f);
float4 I_val = I_patch[i][j];
float4 J_val = read_imagef(J, sampler, (float2)(a, b)).x;
@@ -694,11 +694,11 @@ __kernel void lkSparse_C4_D5(image2d_t I, image2d_t J,
}
}
__kernel void lkDense_C1_D0(image2d_t I, image2d_t J, __global float* u, int uStep, __global float* v, int vStep, __global const float* prevU, int prevUStep, __global const float* prevV, int prevVStep,
__kernel void lkDense_C1_D0(image2d_t I, image2d_t J, __global float* u, int uStep, __global float* v, int vStep, __global const float* prevU, int prevUStep, __global const float* prevV, int prevVStep,
const int rows, const int cols, /*__global float* err, int errStep, int cn,*/ int c_winSize_x, int c_winSize_y, int c_iters, char calcErr)
{
int c_halfWin_x = (c_winSize_x - 1) / 2;
int c_halfWin_y = (c_winSize_y - 1) / 2;
int c_halfWin_x = (c_winSize_x - 1) / 2;
int c_halfWin_y = (c_winSize_y - 1) / 2;
const int patchWidth = get_local_size(0) + 2 * c_halfWin_x;
const int patchHeight = get_local_size(1) + 2 * c_halfWin_y;
@@ -712,8 +712,8 @@ __kernel void lkDense_C1_D0(image2d_t I, image2d_t J, __global float* u, int uSt
const int xBase = get_group_id(0) * get_local_size(0);
const int yBase = get_group_id(1) * get_local_size(1);
sampler_t sampleri = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;
sampler_t sampleri = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;
for (int i = get_local_id(1); i < patchHeight; i += get_local_size(1))
{
for (int j = get_local_id(0); j < patchWidth; j += get_local_size(0))
@@ -735,7 +735,7 @@ __kernel void lkDense_C1_D0(image2d_t I, image2d_t J, __global float* u, int uSt
barrier(CLK_LOCAL_MEM_FENCE);
// extract the patch from the first image, compute covariation matrix of derivatives
const int x = get_global_id(0);
const int y = get_global_id(1);
@@ -747,24 +747,24 @@ __kernel void lkDense_C1_D0(image2d_t I, image2d_t J, __global float* u, int uSt
int A22i = 0;
for (int i = 0; i < c_winSize_y; ++i)
{
{
for (int j = 0; j < c_winSize_x; ++j)
{
int dIdx = dIdx_patch[(get_local_id(1) + i) * patchWidth + (get_local_id(0) + j)];
int dIdy = dIdy_patch[(get_local_id(1) + i) * patchWidth + (get_local_id(0) + j)];
A11i += dIdx * dIdx;
A12i += dIdx * dIdy;
A22i += dIdy * dIdy;
}
}
float A11 = A11i;
float A12 = A12i;
float A22 = A22i;
float D = A11 * A22 - A12 * A12;
//if (calcErr && GET_MIN_EIGENVALS)
// (err + y * errStep)[x] = minEig;
@@ -819,7 +819,7 @@ __kernel void lkDense_C1_D0(image2d_t I, image2d_t J, __global float* u, int uSt
float2 delta;
delta.x = A12 * b2 - A22 * b1;
delta.y = A12 * b1 - A11 * b2;
nextPt.x += delta.x;
nextPt.y += delta.y;

View File

@@ -51,9 +51,9 @@
////////////vector fuction name format: split_vector_C(channels number)_D(data type depth)//////
////////////////////////////////////////////////////////////////////////////////////////////////
__kernel void split_vector_C4_D0 (__global uchar *mat_src, int src_step, int src_offset,
__global uchar *mat_dst0, int dst0_step, int dst0_offset,
__global uchar *mat_dst1, int dst1_step, int dst1_offset,
__global uchar *mat_dst2, int dst2_step, int dst2_offset,
__global uchar *mat_dst0, int dst0_step, int dst0_offset,
__global uchar *mat_dst1, int dst1_step, int dst1_offset,
__global uchar *mat_dst2, int dst2_step, int dst2_offset,
__global uchar *mat_dst3, int dst3_step, int dst3_offset,
int rows, int cols, int dst_step1)
@@ -61,37 +61,37 @@ __kernel void split_vector_C4_D0 (__global uchar *mat_src, int src_step, int s
int x = get_global_id(0);
int y = get_global_id(1);
if((x < cols) && (y < rows))
if((x < cols) && (y < rows))
{
x = x << 2;
int src_idx = mad24(y, src_step, src_offset + (x << 2));
int src_idx = mad24(y, src_step, src_offset + (x << 2));
int dst0_start = mad24(y, dst0_step, dst0_offset);
int dst0_start = mad24(y, dst0_step, dst0_offset);
int dst0_end = mad24(y, dst0_step, dst0_offset + dst_step1);
int dst0_idx = mad24(y, dst0_step, dst0_offset + x) & (int)0xfffffffc;
int dst1_start = mad24(y, dst1_step, dst1_offset);
int dst1_start = mad24(y, dst1_step, dst1_offset);
int dst1_end = mad24(y, dst1_step, dst1_offset + dst_step1);
int dst1_idx = mad24(y, dst1_step, dst1_offset + x) & (int)0xfffffffc;
int dst2_start = mad24(y, dst2_step, dst2_offset);
int dst2_start = mad24(y, dst2_step, dst2_offset);
int dst2_end = mad24(y, dst2_step, dst2_offset + dst_step1);
int dst2_idx = mad24(y, dst2_step, dst2_offset + x) & (int)0xfffffffc;
int dst3_start = mad24(y, dst3_step, dst3_offset);
int dst3_start = mad24(y, dst3_step, dst3_offset);
int dst3_end = mad24(y, dst3_step, dst3_offset + dst_step1);
int dst3_idx = mad24(y, dst3_step, dst3_offset + x) & (int)0xfffffffc;
uchar4 data_0 = *((global uchar4 *)(mat_src + (src_idx - 12 >= 0 ? src_idx - 12 : src_idx)));
uchar4 data_1 = *((global uchar4 *)(mat_src + (src_idx - 8 >= 0 ? src_idx - 8 : src_idx)));
uchar4 data_2 = *((global uchar4 *)(mat_src + (src_idx - 4 >= 0 ? src_idx - 4 : src_idx)));
uchar4 data_3 = *((global uchar4 *)(mat_src + src_idx + 0 ));
int total_bytes = src_offset + rows * src_step;
uchar4 data_4 = *((global uchar4 *)(mat_src + (src_idx + 4 < total_bytes ? src_idx + 4 : src_idx)));
uchar4 data_5 = *((global uchar4 *)(mat_src + (src_idx + 8 < total_bytes ? src_idx + 8 : src_idx)));
uchar4 data_6 = *((global uchar4 *)(mat_src + (src_idx + 12 < total_bytes ? src_idx + 12 : src_idx)));
uchar4 data_0 = *((global uchar4 *)(mat_src + (src_idx - 12 >= 0 ? src_idx - 12 : src_idx)));
uchar4 data_1 = *((global uchar4 *)(mat_src + (src_idx - 8 >= 0 ? src_idx - 8 : src_idx)));
uchar4 data_2 = *((global uchar4 *)(mat_src + (src_idx - 4 >= 0 ? src_idx - 4 : src_idx)));
uchar4 data_3 = *((global uchar4 *)(mat_src + src_idx + 0 ));
int total_bytes = src_offset + rows * src_step;
uchar4 data_4 = *((global uchar4 *)(mat_src + (src_idx + 4 < total_bytes ? src_idx + 4 : src_idx)));
uchar4 data_5 = *((global uchar4 *)(mat_src + (src_idx + 8 < total_bytes ? src_idx + 8 : src_idx)));
uchar4 data_6 = *((global uchar4 *)(mat_src + (src_idx + 12 < total_bytes ? src_idx + 12 : src_idx)));
uchar4 tmp_data0=1, tmp_data1=2, tmp_data2, tmp_data3;
@@ -164,33 +164,33 @@ __kernel void split_vector_C4_D0 (__global uchar *mat_src, int src_step, int s
}
__kernel void split_vector_C3_D0 (__global uchar *mat_src, int src_step, int src_offset,
__global uchar *mat_dst0, int dst0_step, int dst0_offset,
__global uchar *mat_dst1, int dst1_step, int dst1_offset,
__global uchar *mat_dst2, int dst2_step, int dst2_offset,
__global uchar *mat_dst0, int dst0_step, int dst0_offset,
__global uchar *mat_dst1, int dst1_step, int dst1_offset,
__global uchar *mat_dst2, int dst2_step, int dst2_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if((x < cols) && (y < rows))
if((x < cols) && (y < rows))
{
x = x << 2;
int src_idx = mad24(y, src_step, src_offset);
int src_idx = mad24(y, src_step, src_offset);
int dst0_start = mad24(y, dst0_step, dst0_offset);
int dst0_start = mad24(y, dst0_step, dst0_offset);
int dst0_end = mad24(y, dst0_step, dst0_offset + dst_step1);
int dst0_idx = mad24(y, dst0_step, dst0_offset + x & (int)0xfffffffc);
int dst1_start = mad24(y, dst1_step, dst1_offset);
int dst1_start = mad24(y, dst1_step, dst1_offset);
int dst1_end = mad24(y, dst1_step, dst1_offset + dst_step1);
int dst1_idx = mad24(y, dst1_step, dst1_offset + x & (int)0xfffffffc);
int dst2_start = mad24(y, dst2_step, dst2_offset);
int dst2_start = mad24(y, dst2_step, dst2_offset);
int dst2_end = mad24(y, dst2_step, dst2_offset + dst_step1);
int dst2_idx = mad24(y, dst2_step, dst2_offset + x & (int)0xfffffffc);
uchar4 dst0_data = *((__global uchar4 *)(mat_dst0 + dst0_idx));
uchar4 dst1_data = *((__global uchar4 *)(mat_dst1 + dst1_idx));
uchar4 dst2_data = *((__global uchar4 *)(mat_dst2 + dst2_idx));
@@ -227,10 +227,10 @@ __kernel void split_vector_C3_D0 (__global uchar *mat_src, int src_step, int s
uchar data[7] = {src_data_0, src_data_3, src_data_6, src_data_9, src_data_12, src_data_15, src_data_18};
int index = 3 - dst0_offset & 3;
tmp_data0 = (uchar4)(data[index], data[index + 1], data[index + 2], data[index + 3]);
tmp_data0 = (uchar4)(data[index], data[index + 1], data[index + 2], data[index + 3]);
uchar4 data0, data1, data2;
data0 = (uchar4)(src_data_1, src_data_4, src_data_7, src_data_10);
data1 = (dst1_offset & 3) == 2 ? (uchar4)(src_data_4, src_data_7, src_data_10, src_data_13) : data0;
data2 = (dst1_offset & 3) == 1 ? (uchar4)(src_data_7, src_data_10, src_data_13, src_data_16) : data1;
@@ -263,31 +263,31 @@ __kernel void split_vector_C3_D0 (__global uchar *mat_src, int src_step, int s
}
__kernel void split_vector_C2_D0 (__global uchar *mat_src, int src_step, int src_offset,
__global uchar *mat_dst0, int dst0_step, int dst0_offset,
__global uchar *mat_dst1, int dst1_step, int dst1_offset,
__global uchar *mat_dst0, int dst0_step, int dst0_offset,
__global uchar *mat_dst1, int dst1_step, int dst1_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if((x < cols) && (y < rows))
if((x < cols) && (y < rows))
{
x = x << 2;
#define dst0_align ((dst0_offset & 3) << 1)
#define dst1_align ((dst1_offset & 3) << 1)
int src_idx_0 = mad24(y, src_step, src_offset - dst0_align + (x << 1));
int src_idx_1 = mad24(y, src_step, src_offset - dst1_align + (x << 1));
int src_idx_0 = mad24(y, src_step, src_offset - dst0_align + (x << 1));
int src_idx_1 = mad24(y, src_step, src_offset - dst1_align + (x << 1));
int dst0_start = mad24(y, dst0_step, dst0_offset);
int dst0_start = mad24(y, dst0_step, dst0_offset);
int dst0_end = mad24(y, dst0_step, dst0_offset + dst_step1);
int dst0_idx = mad24(y, dst0_step, dst0_offset + x & (int)0xfffffffc);
int dst1_start = mad24(y, dst1_step, dst1_offset);
int dst1_start = mad24(y, dst1_step, dst1_offset);
int dst1_end = mad24(y, dst1_step, dst1_offset + dst_step1);
int dst1_idx = mad24(y, dst1_step, dst1_offset + x & (int)0xfffffffc);
uchar8 src_data_0 = vload8(0, mat_src + src_idx_0);
uchar8 src_data_1 = vload8(0, mat_src + src_idx_1);
@@ -312,9 +312,9 @@ __kernel void split_vector_C2_D0 (__global uchar *mat_src, int src_step, int s
}
__kernel void split_vector_C4_D1 (__global char *mat_src, int src_step, int src_offset,
__global char *mat_dst0, int dst0_step, int dst0_offset,
__global char *mat_dst1, int dst1_step, int dst1_offset,
__global char *mat_dst2, int dst2_step, int dst2_offset,
__global char *mat_dst0, int dst0_step, int dst0_offset,
__global char *mat_dst1, int dst1_step, int dst1_offset,
__global char *mat_dst2, int dst2_step, int dst2_offset,
__global char *mat_dst3, int dst3_step, int dst3_offset,
int rows, int cols, int dst_step1)
@@ -322,35 +322,35 @@ __kernel void split_vector_C4_D1 (__global char *mat_src, int src_step, int sr
int x = get_global_id(0);
int y = get_global_id(1);
if((x < cols) && (y < rows))
if((x < cols) && (y < rows))
{
x = x << 2;
int src_idx = mad24(y, src_step, src_offset + (x << 2));
int src_idx = mad24(y, src_step, src_offset + (x << 2));
int dst0_start = mad24(y, dst0_step, dst0_offset);
int dst0_start = mad24(y, dst0_step, dst0_offset);
int dst0_end = mad24(y, dst0_step, dst0_offset + dst_step1);
int dst0_idx = mad24(y, dst0_step, dst0_offset + x & (int)0xfffffffc);
int dst1_start = mad24(y, dst1_step, dst1_offset);
int dst1_start = mad24(y, dst1_step, dst1_offset);
int dst1_end = mad24(y, dst1_step, dst1_offset + dst_step1);
int dst1_idx = mad24(y, dst1_step, dst1_offset + x & (int)0xfffffffc);
int dst2_start = mad24(y, dst2_step, dst2_offset);
int dst2_start = mad24(y, dst2_step, dst2_offset);
int dst2_end = mad24(y, dst2_step, dst2_offset + dst_step1);
int dst2_idx = mad24(y, dst2_step, dst2_offset + x & (int)0xfffffffc);
int dst3_start = mad24(y, dst3_step, dst3_offset);
int dst3_start = mad24(y, dst3_step, dst3_offset);
int dst3_end = mad24(y, dst3_step, dst3_offset + dst_step1);
int dst3_idx = mad24(y, dst3_step, dst3_offset + x & (int)0xfffffffc);
char4 data_0 = *((global char4 *)(mat_src + src_idx - 12));
char4 data_1 = *((global char4 *)(mat_src + src_idx - 8 ));
char4 data_2 = *((global char4 *)(mat_src + src_idx - 4 ));
char4 data_3 = *((global char4 *)(mat_src + src_idx + 0 ));
char4 data_4 = *((global char4 *)(mat_src + src_idx + 4 ));
char4 data_5 = *((global char4 *)(mat_src + src_idx + 8 ));
char4 data_6 = *((global char4 *)(mat_src + src_idx + 12));
char4 data_0 = *((global char4 *)(mat_src + src_idx - 12));
char4 data_1 = *((global char4 *)(mat_src + src_idx - 8 ));
char4 data_2 = *((global char4 *)(mat_src + src_idx - 4 ));
char4 data_3 = *((global char4 *)(mat_src + src_idx + 0 ));
char4 data_4 = *((global char4 *)(mat_src + src_idx + 4 ));
char4 data_5 = *((global char4 *)(mat_src + src_idx + 8 ));
char4 data_6 = *((global char4 *)(mat_src + src_idx + 12));
char4 tmp_data0=1, tmp_data1=2, tmp_data2, tmp_data3;
@@ -423,33 +423,33 @@ __kernel void split_vector_C4_D1 (__global char *mat_src, int src_step, int sr
}
__kernel void split_vector_C3_D1 (__global char *mat_src, int src_step, int src_offset,
__global char *mat_dst0, int dst0_step, int dst0_offset,
__global char *mat_dst1, int dst1_step, int dst1_offset,
__global char *mat_dst2, int dst2_step, int dst2_offset,
__global char *mat_dst0, int dst0_step, int dst0_offset,
__global char *mat_dst1, int dst1_step, int dst1_offset,
__global char *mat_dst2, int dst2_step, int dst2_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if((x < cols) && (y < rows))
if((x < cols) && (y < rows))
{
x = x << 2;
int src_idx = mad24(y, src_step, src_offset);
int src_idx = mad24(y, src_step, src_offset);
int dst0_start = mad24(y, dst0_step, dst0_offset);
int dst0_start = mad24(y, dst0_step, dst0_offset);
int dst0_end = mad24(y, dst0_step, dst0_offset + dst_step1);
int dst0_idx = mad24(y, dst0_step, dst0_offset + x & (int)0xfffffffc);
int dst1_start = mad24(y, dst1_step, dst1_offset);
int dst1_start = mad24(y, dst1_step, dst1_offset);
int dst1_end = mad24(y, dst1_step, dst1_offset + dst_step1);
int dst1_idx = mad24(y, dst1_step, dst1_offset + x & (int)0xfffffffc);
int dst2_start = mad24(y, dst2_step, dst2_offset);
int dst2_start = mad24(y, dst2_step, dst2_offset);
int dst2_end = mad24(y, dst2_step, dst2_offset + dst_step1);
int dst2_idx = mad24(y, dst2_step, dst2_offset + x & (int)0xfffffffc);
char4 dst0_data = *((__global char4 *)(mat_dst0 + dst0_idx));
char4 dst1_data = *((__global char4 *)(mat_dst1 + dst1_idx));
char4 dst2_data = *((__global char4 *)(mat_dst2 + dst2_idx));
@@ -486,10 +486,10 @@ __kernel void split_vector_C3_D1 (__global char *mat_src, int src_step, int sr
char data[7] = {src_data_0, src_data_3, src_data_6, src_data_9, src_data_12, src_data_15, src_data_18};
int index = 3 - dst0_offset & 3;
tmp_data0 = (char4)(data[index], data[index + 1], data[index + 2], data[index + 3]);
tmp_data0 = (char4)(data[index], data[index + 1], data[index + 2], data[index + 3]);
char4 data0, data1, data2;
data0 = (char4)(src_data_1, src_data_4, src_data_7, src_data_10);
data1 = (dst1_offset & 3) == 2 ? (char4)(src_data_4, src_data_7, src_data_10, src_data_13) : data0;
data2 = (dst1_offset & 3) == 1 ? (char4)(src_data_7, src_data_10, src_data_13, src_data_16) : data1;
@@ -522,31 +522,31 @@ __kernel void split_vector_C3_D1 (__global char *mat_src, int src_step, int sr
}
__kernel void split_vector_C2_D1 (__global char *mat_src, int src_step, int src_offset,
__global char *mat_dst0, int dst0_step, int dst0_offset,
__global char *mat_dst1, int dst1_step, int dst1_offset,
__global char *mat_dst0, int dst0_step, int dst0_offset,
__global char *mat_dst1, int dst1_step, int dst1_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if((x < cols) && (y < rows))
if((x < cols) && (y < rows))
{
x = x << 2;
#define dst0_align ((dst0_offset & 3) << 1)
#define dst1_align ((dst1_offset & 3) << 1)
int src_idx_0 = mad24(y, src_step, src_offset - dst0_align + (x << 1));
int src_idx_1 = mad24(y, src_step, src_offset - dst1_align + (x << 1));
int src_idx_0 = mad24(y, src_step, src_offset - dst0_align + (x << 1));
int src_idx_1 = mad24(y, src_step, src_offset - dst1_align + (x << 1));
int dst0_start = mad24(y, dst0_step, dst0_offset);
int dst0_start = mad24(y, dst0_step, dst0_offset);
int dst0_end = mad24(y, dst0_step, dst0_offset + dst_step1);
int dst0_idx = mad24(y, dst0_step, dst0_offset + x & (int)0xfffffffc);
int dst1_start = mad24(y, dst1_step, dst1_offset);
int dst1_start = mad24(y, dst1_step, dst1_offset);
int dst1_end = mad24(y, dst1_step, dst1_offset + dst_step1);
int dst1_idx = mad24(y, dst1_step, dst1_offset + x & (int)0xfffffffc);
char8 src_data_0 = vload8(0, mat_src + src_idx_0);
char8 src_data_1 = vload8(0, mat_src + src_idx_1);
@@ -571,9 +571,9 @@ __kernel void split_vector_C2_D1 (__global char *mat_src, int src_step, int sr
}
__kernel void split_vector_C4_D2 (__global ushort *mat_src, int src_step, int src_offset,
__global ushort *mat_dst0, int dst0_step, int dst0_offset,
__global ushort *mat_dst1, int dst1_step, int dst1_offset,
__global ushort *mat_dst2, int dst2_step, int dst2_offset,
__global ushort *mat_dst0, int dst0_step, int dst0_offset,
__global ushort *mat_dst1, int dst1_step, int dst1_offset,
__global ushort *mat_dst2, int dst2_step, int dst2_offset,
__global ushort *mat_dst3, int dst3_step, int dst3_offset,
int rows, int cols, int dst_step1)
@@ -581,29 +581,29 @@ __kernel void split_vector_C4_D2 (__global ushort *mat_src, int src_step, int
int x = get_global_id(0);
int y = get_global_id(1);
if((x < cols) && (y < rows))
if((x < cols) && (y < rows))
{
x = x << 1;
int src_idx_0 = mad24(y, src_step, src_offset + (x << 3) - 8);
int src_idx_1 = mad24(y, src_step, src_offset + (x << 3) + 8);
int src_idx_0 = mad24(y, src_step, src_offset + (x << 3) - 8);
int src_idx_1 = mad24(y, src_step, src_offset + (x << 3) + 8);
int dst0_start = mad24(y, dst0_step, dst0_offset);
int dst0_start = mad24(y, dst0_step, dst0_offset);
int dst0_end = mad24(y, dst0_step, dst0_offset + dst_step1);
int dst0_idx = mad24(y, dst0_step, dst0_offset + (x << 1) & (int)0xfffffffc);
int dst1_start = mad24(y, dst1_step, dst1_offset);
int dst1_start = mad24(y, dst1_step, dst1_offset);
int dst1_end = mad24(y, dst1_step, dst1_offset + dst_step1);
int dst1_idx = mad24(y, dst1_step, dst1_offset + (x << 1) & (int)0xfffffffc);
int dst2_start = mad24(y, dst2_step, dst2_offset);
int dst2_start = mad24(y, dst2_step, dst2_offset);
int dst2_end = mad24(y, dst2_step, dst2_offset + dst_step1);
int dst2_idx = mad24(y, dst2_step, dst2_offset + (x << 1) & (int)0xfffffffc);
int dst3_start = mad24(y, dst3_step, dst3_offset);
int dst3_start = mad24(y, dst3_step, dst3_offset);
int dst3_end = mad24(y, dst3_step, dst3_offset + dst_step1);
int dst3_idx = mad24(y, dst3_step, dst3_offset + (x << 1) & (int)0xfffffffc);
ushort8 src_data0 = vload8(0, (__global ushort *)((__global char *)mat_src + src_idx_0));
ushort4 src_data1 = *((__global ushort4 *)((__global char *)mat_src + src_idx_1));
@@ -639,33 +639,33 @@ __kernel void split_vector_C4_D2 (__global ushort *mat_src, int src_step, int
}
__kernel void split_vector_C3_D2 (__global ushort *mat_src, int src_step, int src_offset,
__global ushort *mat_dst0, int dst0_step, int dst0_offset,
__global ushort *mat_dst1, int dst1_step, int dst1_offset,
__global ushort *mat_dst2, int dst2_step, int dst2_offset,
__global ushort *mat_dst0, int dst0_step, int dst0_offset,
__global ushort *mat_dst1, int dst1_step, int dst1_offset,
__global ushort *mat_dst2, int dst2_step, int dst2_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if((x < cols) && (y < rows))
if((x < cols) && (y < rows))
{
x = x << 1;
int src_idx = mad24(y, src_step, src_offset);
int src_idx = mad24(y, src_step, src_offset);
int dst0_start = mad24(y, dst0_step, dst0_offset);
int dst0_start = mad24(y, dst0_step, dst0_offset);
int dst0_end = mad24(y, dst0_step, dst0_offset + dst_step1);
int dst0_idx = mad24(y, dst0_step, dst0_offset + (x << 1) & (int)0xfffffffc);
int dst1_start = mad24(y, dst1_step, dst1_offset);
int dst1_start = mad24(y, dst1_step, dst1_offset);
int dst1_end = mad24(y, dst1_step, dst1_offset + dst_step1);
int dst1_idx = mad24(y, dst1_step, dst1_offset + (x << 1) & (int)0xfffffffc);
int dst2_start = mad24(y, dst2_step, dst2_offset);
int dst2_start = mad24(y, dst2_step, dst2_offset);
int dst2_end = mad24(y, dst2_step, dst2_offset + dst_step1);
int dst2_idx = mad24(y, dst2_step, dst2_offset + (x << 1) & (int)0xfffffffc);
ushort2 dst0_data = *((__global ushort2 *)((__global char *)mat_dst0 + dst0_idx));
ushort2 dst1_data = *((__global ushort2 *)((__global char *)mat_dst1 + dst1_idx));
ushort2 dst2_data = *((__global ushort2 *)((__global char *)mat_dst2 + dst2_idx));
@@ -702,31 +702,31 @@ __kernel void split_vector_C3_D2 (__global ushort *mat_src, int src_step, int
}
__kernel void split_vector_C2_D2 (__global ushort *mat_src, int src_step, int src_offset,
__global ushort *mat_dst0, int dst0_step, int dst0_offset,
__global ushort *mat_dst1, int dst1_step, int dst1_offset,
__global ushort *mat_dst0, int dst0_step, int dst0_offset,
__global ushort *mat_dst1, int dst1_step, int dst1_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if((x < cols) && (y < rows))
if((x < cols) && (y < rows))
{
x = x << 1;
#define dst0_align ((dst0_offset & 3) << 1)
#define dst1_align ((dst1_offset & 3) << 1)
int src_idx_0 = mad24(y, src_step, src_offset - dst0_align + (x << 2));
int src_idx_1 = mad24(y, src_step, src_offset - dst1_align + (x << 2));
int src_idx_0 = mad24(y, src_step, src_offset - dst0_align + (x << 2));
int src_idx_1 = mad24(y, src_step, src_offset - dst1_align + (x << 2));
int dst0_start = mad24(y, dst0_step, dst0_offset);
int dst0_start = mad24(y, dst0_step, dst0_offset);
int dst0_end = mad24(y, dst0_step, dst0_offset + dst_step1);
int dst0_idx = mad24(y, dst0_step, dst0_offset + (x << 1) & (int)0xfffffffc);
int dst1_start = mad24(y, dst1_step, dst1_offset);
int dst1_start = mad24(y, dst1_step, dst1_offset);
int dst1_end = mad24(y, dst1_step, dst1_offset + dst_step1);
int dst1_idx = mad24(y, dst1_step, dst1_offset + (x << 1) & (int)0xfffffffc);
ushort4 src_data_0 = vload4(0, (__global ushort *)((__global char *)mat_src + src_idx_0));
ushort4 src_data_1 = vload4(0, (__global ushort *)((__global char *)mat_src + src_idx_1));
@@ -746,9 +746,9 @@ __kernel void split_vector_C2_D2 (__global ushort *mat_src, int src_step, int
}
}
__kernel void split_vector_C4_D3 (__global short *mat_src, int src_step, int src_offset,
__global short *mat_dst0, int dst0_step, int dst0_offset,
__global short *mat_dst1, int dst1_step, int dst1_offset,
__global short *mat_dst2, int dst2_step, int dst2_offset,
__global short *mat_dst0, int dst0_step, int dst0_offset,
__global short *mat_dst1, int dst1_step, int dst1_offset,
__global short *mat_dst2, int dst2_step, int dst2_offset,
__global short *mat_dst3, int dst3_step, int dst3_offset,
int rows, int cols, int dst_step1)
@@ -756,29 +756,29 @@ __kernel void split_vector_C4_D3 (__global short *mat_src, int src_step, int s
int x = get_global_id(0);
int y = get_global_id(1);
if((x < cols) && (y < rows))
if((x < cols) && (y < rows))
{
x = x << 1;
int src_idx_0 = mad24(y, src_step, src_offset + (x << 3) - 8);
int src_idx_1 = mad24(y, src_step, src_offset + (x << 3) + 8);
int src_idx_0 = mad24(y, src_step, src_offset + (x << 3) - 8);
int src_idx_1 = mad24(y, src_step, src_offset + (x << 3) + 8);
int dst0_start = mad24(y, dst0_step, dst0_offset);
int dst0_start = mad24(y, dst0_step, dst0_offset);
int dst0_end = mad24(y, dst0_step, dst0_offset + dst_step1);
int dst0_idx = mad24(y, dst0_step, dst0_offset + (x << 1) & (int)0xfffffffc);
int dst1_start = mad24(y, dst1_step, dst1_offset);
int dst1_start = mad24(y, dst1_step, dst1_offset);
int dst1_end = mad24(y, dst1_step, dst1_offset + dst_step1);
int dst1_idx = mad24(y, dst1_step, dst1_offset + (x << 1) & (int)0xfffffffc);
int dst2_start = mad24(y, dst2_step, dst2_offset);
int dst2_start = mad24(y, dst2_step, dst2_offset);
int dst2_end = mad24(y, dst2_step, dst2_offset + dst_step1);
int dst2_idx = mad24(y, dst2_step, dst2_offset + (x << 1) & (int)0xfffffffc);
int dst3_start = mad24(y, dst3_step, dst3_offset);
int dst3_start = mad24(y, dst3_step, dst3_offset);
int dst3_end = mad24(y, dst3_step, dst3_offset + dst_step1);
int dst3_idx = mad24(y, dst3_step, dst3_offset + (x << 1) & (int)0xfffffffc);
short8 src_data0 = vload8(0, (__global short *)((__global char *)mat_src + src_idx_0));
short4 src_data1 = *((__global short4 *)((__global char *)mat_src + src_idx_1));
@@ -813,33 +813,33 @@ __kernel void split_vector_C4_D3 (__global short *mat_src, int src_step, int s
}
}
__kernel void split_vector_C3_D3 (__global short *mat_src, int src_step, int src_offset,
__global short *mat_dst0, int dst0_step, int dst0_offset,
__global short *mat_dst1, int dst1_step, int dst1_offset,
__global short *mat_dst2, int dst2_step, int dst2_offset,
__global short *mat_dst0, int dst0_step, int dst0_offset,
__global short *mat_dst1, int dst1_step, int dst1_offset,
__global short *mat_dst2, int dst2_step, int dst2_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if((x < cols) && (y < rows))
if((x < cols) && (y < rows))
{
x = x << 1;
int src_idx = mad24(y, src_step, src_offset);
int src_idx = mad24(y, src_step, src_offset);
int dst0_start = mad24(y, dst0_step, dst0_offset);
int dst0_start = mad24(y, dst0_step, dst0_offset);
int dst0_end = mad24(y, dst0_step, dst0_offset + dst_step1);
int dst0_idx = mad24(y, dst0_step, dst0_offset + (x << 1) & (int)0xfffffffc);
int dst1_start = mad24(y, dst1_step, dst1_offset);
int dst1_start = mad24(y, dst1_step, dst1_offset);
int dst1_end = mad24(y, dst1_step, dst1_offset + dst_step1);
int dst1_idx = mad24(y, dst1_step, dst1_offset + (x << 1) & (int)0xfffffffc);
int dst2_start = mad24(y, dst2_step, dst2_offset);
int dst2_start = mad24(y, dst2_step, dst2_offset);
int dst2_end = mad24(y, dst2_step, dst2_offset + dst_step1);
int dst2_idx = mad24(y, dst2_step, dst2_offset + (x << 1) & (int)0xfffffffc);
short2 dst0_data = *((__global short2 *)((__global char *)mat_dst0 + dst0_idx));
short2 dst1_data = *((__global short2 *)((__global char *)mat_dst1 + dst1_idx));
short2 dst2_data = *((__global short2 *)((__global char *)mat_dst2 + dst2_idx));
@@ -877,31 +877,31 @@ __kernel void split_vector_C3_D3 (__global short *mat_src, int src_step, int s
__kernel void split_vector_C2_D3 (__global short *mat_src, int src_step, int src_offset,
__global short *mat_dst0, int dst0_step, int dst0_offset,
__global short *mat_dst1, int dst1_step, int dst1_offset,
__global short *mat_dst0, int dst0_step, int dst0_offset,
__global short *mat_dst1, int dst1_step, int dst1_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if((x < cols) && (y < rows))
if((x < cols) && (y < rows))
{
x = x << 1;
#define dst0_align ((dst0_offset & 3) << 1)
#define dst1_align ((dst1_offset & 3) << 1)
int src_idx_0 = mad24(y, src_step, src_offset - dst0_align + (x << 2));
int src_idx_1 = mad24(y, src_step, src_offset - dst1_align + (x << 2));
int src_idx_0 = mad24(y, src_step, src_offset - dst0_align + (x << 2));
int src_idx_1 = mad24(y, src_step, src_offset - dst1_align + (x << 2));
int dst0_start = mad24(y, dst0_step, dst0_offset);
int dst0_start = mad24(y, dst0_step, dst0_offset);
int dst0_end = mad24(y, dst0_step, dst0_offset + dst_step1);
int dst0_idx = mad24(y, dst0_step, dst0_offset + (x << 1) & (int)0xfffffffc);
int dst1_start = mad24(y, dst1_step, dst1_offset);
int dst1_start = mad24(y, dst1_step, dst1_offset);
int dst1_end = mad24(y, dst1_step, dst1_offset + dst_step1);
int dst1_idx = mad24(y, dst1_step, dst1_offset + (x << 1) & (int)0xfffffffc);
short4 src_data_0 = vload4(0, (__global short *)((__global char *)mat_src + src_idx_0));
short4 src_data_1 = vload4(0, (__global short *)((__global char *)mat_src + src_idx_1));
@@ -921,9 +921,9 @@ __kernel void split_vector_C2_D3 (__global short *mat_src, int src_step, int s
}
}
__kernel void split_vector_C4_D4 (__global int *mat_src, int src_step, int src_offset,
__global int *mat_dst0, int dst0_step, int dst0_offset,
__global int *mat_dst1, int dst1_step, int dst1_offset,
__global int *mat_dst2, int dst2_step, int dst2_offset,
__global int *mat_dst0, int dst0_step, int dst0_offset,
__global int *mat_dst1, int dst1_step, int dst1_offset,
__global int *mat_dst2, int dst2_step, int dst2_offset,
__global int *mat_dst3, int dst3_step, int dst3_offset,
int rows, int cols, int dst_step1)
@@ -931,14 +931,14 @@ __kernel void split_vector_C4_D4 (__global int *mat_src, int src_step, int src
int x = get_global_id(0);
int y = get_global_id(1);
if((x < cols) && (y < rows))
if((x < cols) && (y < rows))
{
int src_idx = mad24(y, src_step, src_offset);
int src_idx = mad24(y, src_step, src_offset);
int dst0_idx = mad24(y, dst0_step, dst0_offset);
int dst1_idx = mad24(y, dst1_step, dst1_offset);
int dst2_idx = mad24(y, dst2_step, dst2_offset);
int dst3_idx = mad24(y, dst3_step, dst3_offset);
int4 src_data = ((__global int4 *)((__global char *)mat_src + src_idx))[x];
((__global int *)((__global char *)mat_dst0 + dst0_idx))[x] = src_data.x;
@@ -948,18 +948,18 @@ __kernel void split_vector_C4_D4 (__global int *mat_src, int src_step, int src
}
}
__kernel void split_vector_C3_D4 (__global int *mat_src, int src_step, int src_offset,
__global int *mat_dst0, int dst0_step, int dst0_offset,
__global int *mat_dst1, int dst1_step, int dst1_offset,
__global int *mat_dst2, int dst2_step, int dst2_offset,
__global int *mat_dst0, int dst0_step, int dst0_offset,
__global int *mat_dst1, int dst1_step, int dst1_offset,
__global int *mat_dst2, int dst2_step, int dst2_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if((x < cols) && (y < rows))
if((x < cols) && (y < rows))
{
int src_idx = mad24(y, src_step, src_offset);
int src_idx = mad24(y, src_step, src_offset);
int dst0_idx = mad24(y, dst0_step, dst0_offset);
int dst1_idx = mad24(y, dst1_step, dst1_offset);
int dst2_idx = mad24(y, dst2_step, dst2_offset);
@@ -975,20 +975,20 @@ __kernel void split_vector_C3_D4 (__global int *mat_src, int src_step, int src
}
__kernel void split_vector_C2_D4 (__global int *mat_src, int src_step, int src_offset,
__global int *mat_dst0, int dst0_step, int dst0_offset,
__global int *mat_dst1, int dst1_step, int dst1_offset,
__global int *mat_dst0, int dst0_step, int dst0_offset,
__global int *mat_dst1, int dst1_step, int dst1_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if((x < cols) && (y < rows))
if((x < cols) && (y < rows))
{
int src_idx = mad24(y, src_step, src_offset);
int src_idx = mad24(y, src_step, src_offset);
int dst0_idx = mad24(y, dst0_step, dst0_offset);
int dst1_idx = mad24(y, dst1_step, dst1_offset);
int2 src_data = ((__global int2 *)((__global char *)mat_src + src_idx))[x];
((__global int *)((__global char *)mat_dst0 + dst0_idx))[x] = src_data.x;
@@ -997,9 +997,9 @@ __kernel void split_vector_C2_D4 (__global int *mat_src, int src_step, int src
}
__kernel void split_vector_C4_D5 (__global float *mat_src, int src_step, int src_offset,
__global float *mat_dst0, int dst0_step, int dst0_offset,
__global float *mat_dst1, int dst1_step, int dst1_offset,
__global float *mat_dst2, int dst2_step, int dst2_offset,
__global float *mat_dst0, int dst0_step, int dst0_offset,
__global float *mat_dst1, int dst1_step, int dst1_offset,
__global float *mat_dst2, int dst2_step, int dst2_offset,
__global float *mat_dst3, int dst3_step, int dst3_offset,
int rows, int cols, int dst_step1)
@@ -1007,14 +1007,14 @@ __kernel void split_vector_C4_D5 (__global float *mat_src, int src_step, int s
int x = get_global_id(0);
int y = get_global_id(1);
if((x < cols) && (y < rows))
if((x < cols) && (y < rows))
{
int src_idx = mad24(y, src_step, src_offset);
int src_idx = mad24(y, src_step, src_offset);
int dst0_idx = mad24(y, dst0_step, dst0_offset);
int dst1_idx = mad24(y, dst1_step, dst1_offset);
int dst2_idx = mad24(y, dst2_step, dst2_offset);
int dst3_idx = mad24(y, dst3_step, dst3_offset);
float4 src_data = ((__global float4 *)((__global char *)mat_src + src_idx))[x];
((__global float *)((__global char *)mat_dst0 + dst0_idx))[x] = src_data.x;
@@ -1025,18 +1025,18 @@ __kernel void split_vector_C4_D5 (__global float *mat_src, int src_step, int s
}
__kernel void split_vector_C3_D5 (__global float *mat_src, int src_step, int src_offset,
__global float *mat_dst0, int dst0_step, int dst0_offset,
__global float *mat_dst1, int dst1_step, int dst1_offset,
__global float *mat_dst2, int dst2_step, int dst2_offset,
__global float *mat_dst0, int dst0_step, int dst0_offset,
__global float *mat_dst1, int dst1_step, int dst1_offset,
__global float *mat_dst2, int dst2_step, int dst2_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if((x < cols) && (y < rows))
if((x < cols) && (y < rows))
{
int src_idx = mad24(y, src_step, src_offset);
int src_idx = mad24(y, src_step, src_offset);
int dst0_idx = mad24(y, dst0_step, dst0_offset);
int dst1_idx = mad24(y, dst1_step, dst1_offset);
int dst2_idx = mad24(y, dst2_step, dst2_offset);
@@ -1052,20 +1052,20 @@ __kernel void split_vector_C3_D5 (__global float *mat_src, int src_step, int s
}
__kernel void split_vector_C2_D5 (__global float *mat_src, int src_step, int src_offset,
__global float *mat_dst0, int dst0_step, int dst0_offset,
__global float *mat_dst1, int dst1_step, int dst1_offset,
__global float *mat_dst0, int dst0_step, int dst0_offset,
__global float *mat_dst1, int dst1_step, int dst1_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if((x < cols) && (y < rows))
if((x < cols) && (y < rows))
{
int src_idx = mad24(y, src_step, src_offset);
int src_idx = mad24(y, src_step, src_offset);
int dst0_idx = mad24(y, dst0_step, dst0_offset);
int dst1_idx = mad24(y, dst1_step, dst1_offset);
float2 src_data = ((__global float2 *)((__global char *)mat_src + src_idx))[x];
((__global float *)((__global char *)mat_dst0 + dst0_idx))[x] = src_data.x;
@@ -1075,9 +1075,9 @@ __kernel void split_vector_C2_D5 (__global float *mat_src, int src_step, int s
#if defined (DOUBLE_SUPPORT)
__kernel void split_vector_C4_D6 (__global double *mat_src, int src_step, int src_offset,
__global double *mat_dst0, int dst0_step, int dst0_offset,
__global double *mat_dst1, int dst1_step, int dst1_offset,
__global double *mat_dst2, int dst2_step, int dst2_offset,
__global double *mat_dst0, int dst0_step, int dst0_offset,
__global double *mat_dst1, int dst1_step, int dst1_offset,
__global double *mat_dst2, int dst2_step, int dst2_offset,
__global double *mat_dst3, int dst3_step, int dst3_offset,
int rows, int cols, int dst_step1)
@@ -1085,14 +1085,14 @@ __kernel void split_vector_C4_D6 (__global double *mat_src, int src_step, int
int x = get_global_id(0);
int y = get_global_id(1);
if((x < cols) && (y < rows))
if((x < cols) && (y < rows))
{
int src_idx = mad24(y, src_step, src_offset);
int src_idx = mad24(y, src_step, src_offset);
int dst0_idx = mad24(y, dst0_step, dst0_offset);
int dst1_idx = mad24(y, dst1_step, dst1_offset);
int dst2_idx = mad24(y, dst2_step, dst2_offset);
int dst3_idx = mad24(y, dst3_step, dst3_offset);
double4 src_data = ((__global double4 *)((__global char *)mat_src + src_idx))[x];
((__global double *)((__global char *)mat_dst0 + dst0_idx))[x] = src_data.x;
@@ -1103,18 +1103,18 @@ __kernel void split_vector_C4_D6 (__global double *mat_src, int src_step, int
}
__kernel void split_vector_C3_D6 (__global double *mat_src, int src_step, int src_offset,
__global double *mat_dst0, int dst0_step, int dst0_offset,
__global double *mat_dst1, int dst1_step, int dst1_offset,
__global double *mat_dst2, int dst2_step, int dst2_offset,
__global double *mat_dst0, int dst0_step, int dst0_offset,
__global double *mat_dst1, int dst1_step, int dst1_offset,
__global double *mat_dst2, int dst2_step, int dst2_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if((x < cols) && (y < rows))
if((x < cols) && (y < rows))
{
int src_idx = mad24(y, src_step, src_offset);
int src_idx = mad24(y, src_step, src_offset);
int dst0_idx = mad24(y, dst0_step, dst0_offset);
int dst1_idx = mad24(y, dst1_step, dst1_offset);
int dst2_idx = mad24(y, dst2_step, dst2_offset);
@@ -1130,20 +1130,20 @@ __kernel void split_vector_C3_D6 (__global double *mat_src, int src_step, int
}
__kernel void split_vector_C2_D6 (__global double *mat_src, int src_step, int src_offset,
__global double *mat_dst0, int dst0_step, int dst0_offset,
__global double *mat_dst1, int dst1_step, int dst1_offset,
__global double *mat_dst0, int dst0_step, int dst0_offset,
__global double *mat_dst1, int dst1_step, int dst1_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if((x < cols) && (y < rows))
if((x < cols) && (y < rows))
{
int src_idx = mad24(y, src_step, src_offset);
int src_idx = mad24(y, src_step, src_offset);
int dst0_idx = mad24(y, dst0_step, dst0_offset);
int dst1_idx = mad24(y, dst1_step, dst1_offset);
double2 src_data = ((__global double2 *)((__global char *)mat_src + src_idx))[x];
((__global double *)((__global char *)mat_dst0 + dst0_idx))[x] = src_data.x;