format files to ANSI C style with coolformat

change the download channels to oclchannles() fix bugs of arithm functions perf fix of bilateral bug fix of split test case add build_warps functions
2012-10-11 16:22:47 +08:00
parent 69fbc6102c
commit 97156897b2
78 changed files with 15433 additions and 12118 deletions
--- a/modules/ocl/src/kernels/arithm_absdiff.cl
+++ b/modules/ocl/src/kernels/arithm_absdiff.cl
@@ -70,9 +70,22 @@ __kernel void arithm_absdiff_D0 (__global uchar *src1, int src1_step, int src1_o
        int dst_start  = mad24(y, dst_step, dst_offset);
        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
        int dst_index  = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
-
-        uchar4 src1_data = vload4(0, src1 + src1_index);
-        uchar4 src2_data = vload4(0, src2 + src2_index);
+		int src1_index_fix = src1_index < 0 ? 0 : src1_index;
+		int src2_index_fix = src2_index < 0 ? 0 : src2_index;
+        uchar4 src1_data = vload4(0, src1 + src1_index_fix);
+        uchar4 src2_data = vload4(0, src2 + src2_index_fix);
+		if(src1_index < 0)
+		{
+			uchar4 tmp;
+			tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
+			src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
+		}
+		if(src2_index < 0)
+		{
+			uchar4 tmp;
+			tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
+			src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
+		}	

        uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
        uchar4 tmp_data = abs_diff(src1_data, src2_data);
@@ -242,9 +255,15 @@ __kernel void arithm_s_absdiff_C1_D0 (__global   uchar *src1, int src1_step, int
        int dst_start  = mad24(y, dst_step, dst_offset);
        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
        int dst_index  = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
-
-        uchar4 src1_data = vload4(0, src1 + src1_index);
+		int src1_index_fix = src1_index < 0 ? 0 : src1_index;
+        uchar4 src1_data = vload4(0, src1 + src1_index_fix);
        int4 src2_data = (int4)(src2.x, src2.x, src2.x, src2.x);
+		if(src1_index < 0)
+		{
+			uchar4 tmp;
+			tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
+			src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
+		}

        uchar4 data = *((__global uchar4 *)(dst + dst_index));
        uchar4 tmp_data = convert_uchar4_sat(abs_diff(convert_int4_sat(src1_data), src2_data));
--- a/modules/ocl/src/kernels/arithm_add.cl
+++ b/modules/ocl/src/kernels/arithm_add.cl
@@ -71,10 +71,22 @@ __kernel void arithm_add_D0 (__global uchar *src1, int src1_step, int src1_offse
        int dst_start  = mad24(y, dst_step, dst_offset);
        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
        int dst_index  = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
-
-        uchar4 src1_data = vload4(0, src1 + src1_index);
-        uchar4 src2_data = vload4(0, src2 + src2_index);
-
+		int src1_index_fix = src1_index < 0 ? 0 : src1_index;
+		int src2_index_fix = src2_index < 0 ? 0 : src2_index;
+        uchar4 src1_data = vload4(0, src1 + src1_index_fix);
+        uchar4 src2_data = vload4(0, src2 + src2_index_fix);
+		if(src1_index < 0)
+		{
+			uchar4 tmp;
+			tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
+			src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
+		}
+		if(src2_index < 0)
+		{
+			uchar4 tmp;
+			tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
+			src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
+		}		
        uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
        short4 tmp      = convert_short4_sat(src1_data) + convert_short4_sat(src2_data);
        uchar4 tmp_data = convert_uchar4_sat(tmp);
@@ -248,11 +260,31 @@ __kernel void arithm_add_with_mask_C1_D0 (__global uchar *src1, int src1_step, i
        int dst_start  = mad24(y, dst_step, dst_offset);
        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
        int dst_index  = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
-
-        uchar4 src1_data = vload4(0, src1 + src1_index);
-        uchar4 src2_data = vload4(0, src2 + src2_index);
-        uchar4 mask_data = vload4(0, mask + mask_index);
-
+		int src1_index_fix = src1_index < 0 ? 0 : src1_index;
+		int src2_index_fix = src2_index < 0 ? 0 : src2_index;
+		int mask_index_fix = mask_index < 0 ? 0 : mask_index;	
+        uchar4 src1_data = vload4(0, src1 + src1_index_fix);
+        uchar4 src2_data = vload4(0, src2 + src2_index_fix);
+        uchar4 mask_data = vload4(0, mask + mask_index_fix);		
+		if(src1_index < 0)
+		{
+			uchar4 tmp;
+			tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
+			src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
+		}
+		if(src2_index < 0)
+		{
+			uchar4 tmp;
+			tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
+			src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
+		}	
+		if(mask_index < 0)
+		{
+			uchar4 tmp;
+			tmp.xyzw = (mask_index == -2) ? mask_data.zwxy:mask_data.yzwx;
+			mask_data.xyzw = (mask_index == -1) ? mask_data.wxyz:tmp.xyzw;
+		}	
+		
        uchar4 data = *((__global uchar4 *)(dst + dst_index));
        short4 tmp = convert_short4_sat(src1_data) + convert_short4_sat(src2_data);
        uchar4 tmp_data = convert_uchar4_sat(tmp);
--- a/modules/ocl/src/kernels/arithm_add_scalar.cl
+++ b/modules/ocl/src/kernels/arithm_add_scalar.cl
@@ -65,10 +65,16 @@ __kernel void arithm_s_add_C1_D0 (__global   uchar *src1, int src1_step, int src
        int dst_start  = mad24(y, dst_step, dst_offset);
        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
        int dst_index  = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
-
-        uchar4 src1_data = vload4(0, src1 + src1_index);
+		int src1_index_fix = src1_index < 0 ? 0 : src1_index;
+        uchar4 src1_data = vload4(0, src1 + src1_index_fix);
        int4 src2_data = (int4)(src2.x, src2.x, src2.x, src2.x);
-
+		if(src1_index < 0)
+		{
+			uchar4 tmp;
+			tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
+			src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
+		}
+        
        uchar4 data = *((__global uchar4 *)(dst + dst_index));
        int4 tmp = convert_int4_sat(src1_data) + src2_data;
        uchar4 tmp_data = convert_uchar4_sat(tmp);
--- a/modules/ocl/src/kernels/arithm_add_scalar_mask.cl
+++ b/modules/ocl/src/kernels/arithm_add_scalar_mask.cl
@@ -68,10 +68,23 @@ __kernel void arithm_s_add_with_mask_C1_D0 (__global   uchar *src1, int src1_ste
        int dst_start  = mad24(y, dst_step, dst_offset);
        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
        int dst_index  = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
-
-        uchar4 src1_data = vload4(0, src1 + src1_index);
+		int src1_index_fix = src1_index < 0 ? 0 : src1_index;
+		int mask_index_fix = mask_index < 0 ? 0 : mask_index;	
+        uchar4 src1_data = vload4(0, src1 + src1_index_fix);
        int4 src2_data = (int4)(src2.x, src2.x, src2.x, src2.x);
-        uchar4 mask_data = vload4(0, mask + mask_index);
+        uchar4 mask_data = vload4(0, mask + mask_index_fix);		
+		if(src1_index < 0)
+		{
+			uchar4 tmp;
+			tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
+			src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
+		}
+		if(mask_index < 0)
+		{
+			uchar4 tmp;
+			tmp.xyzw = (mask_index == -2) ? mask_data.zwxy:mask_data.yzwx;
+			mask_data.xyzw = (mask_index == -1) ? mask_data.wxyz:tmp.xyzw;
+		}	

        uchar4 data = *((__global uchar4 *)(dst + dst_index));
        int4 tmp = convert_int4_sat(src1_data) + src2_data;
--- a/modules/ocl/src/kernels/arithm_flip.cl
+++ b/modules/ocl/src/kernels/arithm_flip.cl
@@ -71,9 +71,22 @@ __kernel void arithm_flip_rows_D0 (__global uchar *src, int src_step, int src_of
        int dst_end_1    = mad24(rows - y - 1, dst_step, dst_offset + dst_step1);
        int dst_index_0  = mad24(y,            dst_step, dst_offset + x & (int)0xfffffffc);
        int dst_index_1  = mad24(rows - y - 1, dst_step, dst_offset + x & (int)0xfffffffc);
-
-        uchar4 src_data_0 = vload4(0, src + src_index_0);
-        uchar4 src_data_1 = vload4(0, src + src_index_1);
+		int src1_index_fix = src_index_0 < 0 ? 0 : src_index_0;
+		int src2_index_fix = src_index_1 < 0 ? 0 : src_index_1;
+        uchar4 src_data_0 = vload4(0, src + src1_index_fix);
+        uchar4 src_data_1 = vload4(0, src + src2_index_fix);
+		if(src_index_0 < 0)
+		{
+			uchar4 tmp;
+			tmp.xyzw = (src_index_0 == -2) ? src_data_0.zwxy:src_data_0.yzwx;
+			src_data_0.xyzw = (src_index_0 == -1) ? src_data_0.wxyz:tmp.xyzw;
+		}
+		if(src_index_1 < 0)
+		{
+			uchar4 tmp;
+			tmp.xyzw = (src_index_1 == -2) ? src_data_1.zwxy:src_data_1.yzwx;
+			src_data_1.xyzw = (src_index_1 == -1) ? src_data_1.wxyz:tmp.xyzw;
+		}

        uchar4 dst_data_0 = *((__global uchar4 *)(dst + dst_index_0));
        uchar4 dst_data_1 = *((__global uchar4 *)(dst + dst_index_1));
--- a/modules/ocl/src/kernels/build_warps.cl
+++ b/modules/ocl/src/kernels/build_warps.cl
@@ -0,0 +1,237 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
+// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// @Authors
+//    Peng Xiao, pengxiao@multicorewareinc.com
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other oclMaterials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors as is and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+__kernel
+    void buildWarpPlaneMaps
+    (
+    __global float * map_x,
+    __global float * map_y,
+    __constant float * KRT,
+    int tl_u,
+    int tl_v,
+    int cols,
+    int rows,
+    int step_x,
+    int step_y,
+    float scale
+    )
+{
+    int du = get_global_id(0);
+    int dv = get_global_id(1);
+    step_x /= sizeof(float);
+    step_y /= sizeof(float);
+
+    __constant float * ck_rinv = KRT;
+    __constant float * ct      = KRT + 9;
+
+    if (du < cols && dv < rows)
+    {
+        float u = tl_u + du;
+        float v = tl_v + dv;
+        float x, y;
+
+        float x_ = u / scale - ct[0];
+        float y_ = v / scale - ct[1];
+
+        float z;
+        x = ck_rinv[0] * x_ + ck_rinv[1] * y_ + ck_rinv[2] * (1 - ct[2]);
+        y = ck_rinv[3] * x_ + ck_rinv[4] * y_ + ck_rinv[5] * (1 - ct[2]);
+        z = ck_rinv[6] * x_ + ck_rinv[7] * y_ + ck_rinv[8] * (1 - ct[2]);
+
+        x /= z;
+        y /= z;
+
+        map_x[dv * step_x + du] = x;
+        map_y[dv * step_y + du] = y;
+    }
+}
+
+__kernel
+    void buildWarpCylindricalMaps
+    (
+    __global float * map_x,
+    __global float * map_y,
+    __constant float * ck_rinv,
+    int tl_u,
+    int tl_v,
+    int cols,
+    int rows,
+    int step_x,
+    int step_y,
+    float scale
+    )
+{
+    int du = get_global_id(0);
+    int dv = get_global_id(1);
+    step_x /= sizeof(float);
+    step_y /= sizeof(float);
+
+    if (du < cols && dv < rows)
+    {
+        float u = tl_u + du;
+        float v = tl_v + dv;
+        float x, y;
+
+        u /= scale;
+        float x_ = sin(u);
+        float y_ = v / scale;
+        float z_ = cos(u);
+
+        float z;
+        x = ck_rinv[0] * x_ + ck_rinv[1] * y_ + ck_rinv[2] * z_;
+        y = ck_rinv[3] * x_ + ck_rinv[4] * y_ + ck_rinv[5] * z_;
+        z = ck_rinv[6] * x_ + ck_rinv[7] * y_ + ck_rinv[8] * z_;
+
+        if (z > 0) { x /= z; y /= z; }
+        else x = y = -1;
+
+        map_x[dv * step_x + du] = x;
+        map_y[dv * step_y + du] = y;
+    }
+}
+
+__kernel
+    void buildWarpSphericalMaps
+    (
+    __global float * map_x,
+    __global float * map_y,
+    __constant float * ck_rinv,
+    int tl_u,
+    int tl_v,
+    int cols,
+    int rows,
+    int step_x,
+    int step_y,
+    float scale
+    )
+{
+    int du = get_global_id(0);
+    int dv = get_global_id(1);
+    step_x /= sizeof(float);
+    step_y /= sizeof(float);
+
+    if (du < cols && dv < rows)
+    {
+        float u = tl_u + du;
+        float v = tl_v + dv;
+        float x, y;
+
+        v /= scale;
+        u /= scale;
+
+        float sinv = sin(v);
+        float x_ = sinv * sin(u);
+        float y_ = - cos(v);
+        float z_ = sinv * cos(u);
+
+        float z;
+        x = ck_rinv[0] * x_ + ck_rinv[1] * y_ + ck_rinv[2] * z_;
+        y = ck_rinv[3] * x_ + ck_rinv[4] * y_ + ck_rinv[5] * z_;
+        z = ck_rinv[6] * x_ + ck_rinv[7] * y_ + ck_rinv[8] * z_;
+
+        if (z > 0) { x /= z; y /= z; }
+        else x = y = -1;
+
+        map_x[dv * step_x + du] = x;
+        map_y[dv * step_y + du] = y;
+    }
+}
+
+__kernel
+    void buildWarpAffineMaps
+    (
+    __global float * xmap,
+    __global float * ymap,
+    __constant float * c_warpMat,
+    int cols,
+    int rows,
+    int step_x,
+    int step_y
+    )
+{
+    int x = get_global_id(0);
+    int y = get_global_id(1);
+    step_x /= sizeof(float);
+    step_y /= sizeof(float);
+
+    if (x < cols && y < rows)
+    {
+        const float xcoo = c_warpMat[0] * x + c_warpMat[1] * y + c_warpMat[2];
+        const float ycoo = c_warpMat[3] * x + c_warpMat[4] * y + c_warpMat[5];
+
+        map_x[y * step_x + x] = xcoo;
+        map_y[y * step_y + x] = ycoo;
+    }
+}
+
+__kernel
+    void buildWarpPerspectiveMaps
+    (
+    __global float * xmap,
+    __global float * ymap,
+    __constant float * c_warpMat,
+    int cols,
+    int rows,
+    int step_x,
+    int step_y
+    )
+{
+    int x = get_global_id(0);
+    int y = get_global_id(1);
+    step_x /= sizeof(float);
+    step_y /= sizeof(float);
+
+    if (x < cols && y < rows)
+    {
+        const float coeff = 1.0f / (c_warpMat[6] * x + c_warpMat[7] * y + c_warpMat[8]);
+
+        const float xcoo = coeff * (c_warpMat[0] * x + c_warpMat[1] * y + c_warpMat[2]);
+        const float ycoo = coeff * (c_warpMat[3] * x + c_warpMat[4] * y + c_warpMat[5]);
+
+        map_x[y * step_x + x] = xcoo;
+        map_y[y * step_y + x] = ycoo;
+    }
+}
+
--- a/modules/ocl/src/kernels/filtering_boxFilter.cl
+++ b/modules/ocl/src/kernels/filtering_boxFilter.cl
@@ -254,7 +254,8 @@ __kernel void boxFilter_C4_D0(__global const uchar4 * restrict src, __global uch
        //ss = convert_uint4(src[cur_addr]); 

        int cur_col = clamp(startX + col, 0, src_whole_cols);
-        ss = convert_uint4(src[(startY+i)*(src_step>>2) + cur_col]); 
+        if(con)
+          ss = convert_uint4(src[(startY+i)*(src_step>>2) + cur_col]); 

        data[i] = con ? ss : 0;
    }
@@ -269,6 +270,7 @@ __kernel void boxFilter_C4_D0(__global const uchar4 * restrict src, __global uch
          selected_col = ADDR_L(startX+col, 0, src_whole_cols);
          selected_col = ADDR_R(startX+col, src_whole_cols, selected_col);
          
+          
          data[i] = convert_uint4(src[selected_row * (src_step>>2) + selected_col]);
   }
    
@@ -334,11 +336,12 @@ __kernel void boxFilter_C1_D5(__global const float *restrict src, __global float
    for(int i=0; i < ksY+1; i++)
    {
        con = startX+col >= 0 && startX+col < src_whole_cols && startY+i >= 0 && startY+i < src_whole_rows;
-	    //	int cur_addr = clamp((startY+i)*(src_step>>2)+(startX+col),0,end_addr);		
-       // ss = src[cur_addr]; 
-
+	      //int cur_addr = clamp((startY+i)*(src_step>>2)+(startX+col),0,end_addr);		
+        //ss = src[cur_addr]; 
+         
        int cur_col = clamp(startX + col, 0, src_whole_cols);
-        ss = src[(startY+i)*(src_step>>2) + cur_col]; 
+        //ss = src[(startY+i)*(src_step>>2) + cur_col]; 
+        ss = (startY+i)<src_whole_rows&&(startY+i)>=0&&cur_col>=0&&cur_col<src_whole_cols?src[(startY+i)*(src_step>>2) + cur_col]:0;

        data[i] = con ? ss : 0.f;
    }
@@ -422,7 +425,8 @@ __kernel void boxFilter_C4_D5(__global const float4 *restrict src, __global floa
        //ss = src[cur_addr]; 

        int cur_col = clamp(startX + col, 0, src_whole_cols);
-        ss = src[(startY+i)*(src_step>>4) + cur_col]; 
+        //ss = src[(startY+i)*(src_step>>4) + cur_col]; 
+        ss = (startY+i)<src_whole_rows&&(startY+i)>=0&&cur_col>=0&&cur_col<src_whole_cols?src[(startY+i)*(src_step>>4) + cur_col]:0;

        data[i] = con ? ss : (float4)(0.0,0.0,0.0,0.0);
    }
--- a/modules/ocl/src/kernels/imgproc_bilateral.cl
+++ b/modules/ocl/src/kernels/imgproc_bilateral.cl
@@ -31,84 +31,8 @@
 // and on any theory of liability, whether in contract, strict liability,
 // or tort (including negligence or otherwise) arising in any way out of
 // the use of this software, even if advised of the possibility of such damage.
-//
-//

-
-//#pragma OPENCL EXTENSION cl_amd_printf :enable
-__kernel
-void bilateral4(__global uchar4 *dst,
-		__global uchar4 *src,
-		int rows,
-		int cols,
-		int channels,
-		int radius,
-		int wholerows,
-		int wholecols,
-		int src_step,
-		int dst_step,
-		int src_offset,
-		int dst_offset,
-		__constant float *sigClr,
-		__constant float *sigSpc)
-{
-	uint lidx = get_local_id(0);
-	uint lidy = get_local_id(1);
-	
-	uint gdx = get_global_id(0);
-	uint gdy = get_global_id(1);
-
-	uint gidx = gdx >=cols?cols-1:gdx;
-	uint gidy = gdy >=rows?rows-1:gdy;
-
-	uchar4 p,q,tmp;
-
-	float4 pf = 0,pq = 0,pd = 0;
-        float wt =0;
-
-	int r = radius;
-	int ij = 0;
-	int ct = 0;
-
-	uint index_src = src_offset/4 + gidy*src_step/4 + gidx;
-	uint index_dst = dst_offset/4 + gidy*dst_step/4 + gidx;
-
-	p = src[index_src];
-
-	uint gx,gy;
-	uint src_index,dst_index;
-
-	for(int ii = -r;ii<r+1;ii++)
-	{
-		for(int jj =-r;jj<r+1;jj++)
-			{
-					ij = ii*ii+jj*jj;
-					if(ij > mul24(radius,radius)) continue;
-					gx = gidx + jj;
-					gy = gidy + ii;
-
-					src_index = src_offset/4 + gy *	 src_step/4 + gx;
-					q = src[src_index];
-					
-
-					ct = abs(p.x-q.x)+abs(p.y-q.y)+abs(p.z-q.z);
-					wt =sigClr[ct]*sigSpc[(ii+radius)*(2*radius+1)+jj+radius];
-
-				        pf.x += q.x*wt;
-					pf.y += q.y*wt;
-					pf.z += q.z*wt;
-//					pf.w += q.w*wt;
-
-					pq += wt;
-
-			}
-	}
-
-	pd = pf/pq;
-	dst[index_dst] = convert_uchar4_rte(pd);
-}
-
-__kernel void bilateral(__global uchar *dst,
+__kernel void bilateral_C1_D0(__global uchar *dst,
 		__global const uchar *src,
 		const int dst_rows,
 		const int dst_cols,
@@ -128,8 +52,8 @@ __kernel void bilateral(__global uchar *dst,
 	if((gidy<dst_rows) && (gidx<dst_cols))
 	{
 		int src_addr = mad24(gidy+radius,src_step,gidx+radius);
-		int dst_addr = mad24(gidy,src_step,gidx+dst_offset);
-		float sum = 0, wsum = 0;
+		int dst_addr = mad24(gidy,dst_step,gidx+dst_offset);
+		float sum = 0.f, wsum = 0.f;

 		int val0 = (int)src[src_addr];
 		for(int k = 0; k < maxk; k++ )
@@ -142,4 +66,73 @@ __kernel void bilateral(__global uchar *dst,
 		dst[dst_addr] = convert_uchar_rtz(sum/wsum+0.5f);
 	}
 }
+__kernel void bilateral2_C1_D0(__global uchar *dst,
+		__global const uchar *src,
+		const int dst_rows,
+		const int dst_cols,
+		const int maxk,
+		const int radius,
+		const int dst_step,
+		const int dst_offset,
+		const int src_step,
+		const int src_rows,
+		const int src_cols,
+		__constant float *color_weight,
+		__constant float *space_weight,
+		__constant int *space_ofs)
+{	
+	int gidx = get_global_id(0)<<2;
+	int gidy = get_global_id(1);
+	if((gidy<dst_rows) && (gidx<dst_cols))
+	{
+		int src_addr = mad24(gidy+radius,src_step,gidx+radius);
+		int dst_addr = mad24(gidy,dst_step,gidx+dst_offset);
+		float4 sum = (float4)(0.f), wsum = (float4)(0.f);

+		int4 val0 = convert_int4(vload4(0,src+src_addr));
+		for(int k = 0; k < maxk; k++ )
+		{
+			int4 val = convert_int4(vload4(0,src+src_addr + space_ofs[k]));
+			float4 w = (float4)(space_weight[k])*(float4)(color_weight[abs(val.x - val0.x)],color_weight[abs(val.y - val0.y)],color_weight[abs(val.z - val0.z)],color_weight[abs(val.w - val0.w)]);
+			sum += convert_float4(val)*w;
+			wsum += w;
+		}
+		*(__global uchar4*)(dst+dst_addr) = convert_uchar4_rtz(sum/wsum+0.5f);
+	}
+}
+__kernel void bilateral_C4_D0(__global uchar4 *dst,
+		__global const uchar4 *src,
+		const int dst_rows,
+		const int dst_cols,
+		const int maxk,
+		const int radius,
+		const int dst_step,
+		const int dst_offset,
+		const int src_step,
+		const int src_rows,
+		const int src_cols,
+		__constant float *color_weight,
+		__constant float *space_weight,
+		__constant int *space_ofs)
+{	
+	int gidx = get_global_id(0);
+	int gidy = get_global_id(1);
+	if((gidy<dst_rows) && (gidx<dst_cols))
+	{
+		int src_addr = mad24(gidy+radius,src_step,gidx+radius);
+		int dst_addr = mad24(gidy,dst_step,gidx+dst_offset);
+		float4 sum = (float4)0.f;
+		float wsum = 0.f;
+
+		int4 val0 = convert_int4(src[src_addr]);
+		for(int k = 0; k < maxk; k++ )
+		{
+			int4 val = convert_int4(src[src_addr + space_ofs[k]]);
+			float w = space_weight[k]*color_weight[abs(val.x - val0.x)+abs(val.y - val0.y)+abs(val.z - val0.z)];
+			sum += convert_float4(val)*(float4)w;
+			wsum += w;
+		}
+		wsum=1.f/wsum;
+		dst[dst_addr] = convert_uchar4_rtz(sum*(float4)wsum+(float4)0.5f);
+	}
+}
--- a/modules/ocl/src/kernels/imgproc_histogram.cl
+++ b/modules/ocl/src/kernels/imgproc_histogram.cl
@@ -144,16 +144,18 @@ __kernel void __attribute__((reqd_work_group_size(1,HISTOGRAM256_BIN_COUNT,1)))c
        int rowIndex = mad24(gy, gn, gx);
 //        rowIndex &= (PARTIAL_HISTOGRAM256_COUNT - 1);

-        __local int subhist[HISTOGRAM256_LOCAL_MEM_SIZE + 1];
+        __local int subhist[HISTOGRAM256_LOCAL_MEM_SIZE];
        subhist[lidy] = 0;
        barrier(CLK_LOCAL_MEM_FENCE);

        gidx = ((gidx>=left_col) ? (gidx+cols) : gidx);
-        int src_index = src_offset + mad24(gidy, src_step, gidx);
-	barrier(CLK_LOCAL_MEM_FENCE);
-        int p = (int)src[src_index];
-	p = gidy >= rows ? HISTOGRAM256_LOCAL_MEM_SIZE : p;
-        atomic_inc(subhist + p);
+        if(gidy<rows)
+        {
+            int src_index = src_offset + mad24(gidy, src_step, gidx);
+            int p = (int)src[src_index];
+//	    p = gidy >= rows ? HISTOGRAM256_LOCAL_MEM_SIZE : p;
+            atomic_inc(subhist + p);
+        }
        barrier(CLK_LOCAL_MEM_FENCE);

        globalHist[mad24(rowIndex, hist_step, lidy)] += subhist[lidy];
--- a/modules/ocl/src/kernels/interpolate_frames.cl
+++ b/modules/ocl/src/kernels/interpolate_frames.cl
@@ -0,0 +1,252 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
+// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// @Authors
+//    Peng Xiao, pengxiao@multicorewareinc.com
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other oclMaterials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors as is and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable
+#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable
+
+// Image read mode
+__constant sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_LINEAR;
+
+// atomic add for 32bit floating point
+inline void atomic_addf(volatile __global float *source, const float operand) {
+    union {
+        unsigned int intVal;
+        float floatVal;
+    } newVal;
+    union {
+        unsigned int intVal;
+        float floatVal;
+    } prevVal;
+    do {
+        prevVal.floatVal = *source;
+        newVal.floatVal = prevVal.floatVal + operand;
+    } while (atomic_cmpxchg((volatile __global unsigned int *)source, prevVal.intVal, newVal.intVal) != prevVal.intVal);
+}
+
+__kernel void memsetKernel(
+    float val,
+    __global float * image,
+    int width,
+    int height,
+    int step, // in element
+    int offset
+    )
+{
+    if(get_global_id(0) >= width || get_global_id(1) >= height)
+    {
+        return;
+    }
+    image += offset;
+    image[get_global_id(0) + get_global_id(1) * step] = val;
+}
+
+__kernel void normalizeKernel(
+    __global float * buffer,
+    int width,
+    int height,
+    int step,
+    int f_offset,
+    int d_offset
+    )
+{
+    __global float * factors = buffer + f_offset;
+    __global float * dst     = buffer + d_offset;
+
+    int j = get_global_id(0);
+    int i = get_global_id(1);
+
+    if(j >= width || i >= height)
+    {
+        return;
+    }
+    float scale = factors[step * i + j];
+    float invScale = (scale == 0.0f) ? 1.0f : (1.0f / scale);
+
+    dst[step * i + j] *= invScale;
+}
+
+__kernel void forwardWarpKernel(
+    __global const float * src,
+    __global float * buffer,
+    __global const float * u,
+    __global const float * v,
+    const int w,
+    const int h,
+    const int flow_stride,
+    const int image_stride,
+    const int factor_offset,
+    const int dst_offset,
+    const float time_scale
+    )
+{
+    int j = get_global_id(0);
+    int i = get_global_id(1);
+
+    if (i >= h || j >= w) return;
+
+    volatile __global float * normalization_factor = (volatile __global float *) buffer + factor_offset;
+    volatile __global float * dst = (volatile __global float *)buffer + dst_offset;
+
+    int flow_row_offset  = i * flow_stride;
+    int image_row_offset = i * image_stride;
+
+    //bottom left corner of a target pixel
+    float cx = u[flow_row_offset + j] * time_scale + (float)j + 1.0f;
+    float cy = v[flow_row_offset + j] * time_scale + (float)i + 1.0f;
+    // pixel containing bottom left corner
+    float px;
+    float py;
+    float dx = modf(cx, &px);
+    float dy = modf(cy, &py);
+    // target pixel integer coords
+    int tx;
+    int ty;
+    tx = (int) px;
+    ty = (int) py;
+    float value = src[image_row_offset + j];
+    float weight;
+    // fill pixel containing bottom right corner
+    if (!((tx >= w) || (tx < 0) || (ty >= h) || (ty < 0)))
+    {
+        weight = dx * dy;
+        atomic_addf(dst + ty * image_stride + tx, value * weight);
+        atomic_addf(normalization_factor + ty * image_stride + tx, weight);
+    }
+
+    // fill pixel containing bottom left corner
+    tx -= 1;
+    if (!((tx >= w) || (tx < 0) || (ty >= h) || (ty < 0)))
+    {
+        weight = (1.0f - dx) * dy;
+        atomic_addf(dst + ty * image_stride + tx, value * weight);
+        atomic_addf(normalization_factor + ty * image_stride + tx, weight);
+    }
+
+    // fill pixel containing upper left corner
+    ty -= 1;
+    if (!((tx >= w) || (tx < 0) || (ty >= h) || (ty < 0)))
+    {
+        weight = (1.0f - dx) * (1.0f - dy);
+        atomic_addf(dst + ty * image_stride + tx, value * weight);
+        atomic_addf(normalization_factor + ty * image_stride + tx, weight);
+    }
+
+    // fill pixel containing upper right corner
+    tx += 1;
+    if (!((tx >= w) || (tx < 0) || (ty >= h) || (ty < 0)))
+    {
+        weight = dx * (1.0f - dy);
+        atomic_addf(dst + ty * image_stride + tx, value * weight);
+        atomic_addf(normalization_factor + ty * image_stride + tx, weight);
+    }
+}
+
+// define buffer offsets
+enum
+{
+    O0_OS = 0,
+    O1_OS,
+    U_OS,
+    V_OS,
+    UR_OS,
+    VR_OS
+};
+
+__kernel void blendFramesKernel(
+    image2d_t tex_src0,
+    image2d_t tex_src1,
+    __global float * buffer,
+    __global float * out,
+    int w,
+    int h,
+    int step,
+    float theta
+    )
+{
+    __global float * u  = buffer + h * step * U_OS;
+    __global float * v  = buffer + h * step * V_OS;
+    __global float * ur = buffer + h * step * UR_OS;
+    __global float * vr = buffer + h * step * VR_OS;
+    __global float * o0 = buffer + h * step * O0_OS;
+    __global float * o1 = buffer + h * step * O1_OS;
+
+    int ix = get_global_id(0);
+    int iy = get_global_id(1);
+
+    if(ix >= w || iy >= h) return;
+
+    int pos = ix + step * iy;
+
+    float _u  = u[pos];
+    float _v  = v[pos];
+
+    float _ur = ur[pos];
+    float _vr = vr[pos];
+
+    float x = (float)ix + 0.5f;
+    float y = (float)iy + 0.5f;
+    bool b0 = o0[pos] > 1e-4f;
+    bool b1 = o1[pos] > 1e-4f;
+
+    float2 coord0 = (float2)(x - _u * theta, y - _v * theta);
+    float2 coord1 = (float2)(x + _u * (1.0f - theta), y + _v * (1.0f - theta));
+
+    if (b0 && b1)
+    {
+        // pixel is visible on both frames
+        out[pos] = read_imagef(tex_src0, sampler, coord0).x * (1.0f - theta) + 
+            read_imagef(tex_src1, sampler, coord1).x * theta;
+    }
+    else if (b0)
+    {
+        // visible on the first frame only
+        out[pos] = read_imagef(tex_src0, sampler, coord0).x;
+    }
+    else
+    {
+        // visible on the second frame only
+        out[pos] = read_imagef(tex_src1, sampler, coord1).x;
+    }
+}