Add blend, columnsum, pyrUp/down, matchTemplate for ocl module

2012-08-06 15:00:27 +08:00
parent 064d022a4b
commit 0db69fbe19
16 changed files with 4085 additions and 1 deletions
--- a/modules/ocl/src/kernels/blend_linear.cl
+++ b/modules/ocl/src/kernels/blend_linear.cl
@@ -0,0 +1,196 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2010-2012, MulticoreWare Inc., all rights reserved.
+// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// @Authors
+//    Liu Liujun, liujun@multicorewareinc.com 
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other GpuMaterials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors as is and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+__kernel void BlendLinear_C1_D0(
+	__global uchar *dst,
+	__global uchar *img1,
+	__global uchar *img2,
+	__global float *weight1,
+	__global float *weight2,
+	int rows,
+	int cols,
+	int istep,
+	int wstep
+	)
+{
+	int idx = get_global_id(0);
+	int idy = get_global_id(1);
+	if (idx < cols && idy < rows)
+	{
+		int pos = idy * istep + idx;
+		int wpos = idy * (wstep /sizeof(float)) + idx;
+		float w1 = weight1[wpos];
+		float w2 = weight2[wpos];
+		dst[pos] = (img1[pos] * w1 + img2[pos] * w2) / (w1 + w2 + 1e-5f);
+
+	}
+}
+
+__kernel void BlendLinear_C3_D0(
+	__global uchar *dst,
+	__global uchar *img1,
+	__global uchar *img2,
+	__global float *weight1,
+	__global float *weight2,
+	int rows,
+	int cols,
+	int istep,
+	int wstep
+	)
+{
+	int idx = get_global_id(0);
+	int idy = get_global_id(1);
+	int x = idx / 3;
+	int y = idy;
+	if (x < cols && y < rows)
+	{
+		int pos = idy * istep + idx;
+		int wpos = idy * (wstep /sizeof(float)) + x;
+		float w1 = weight1[wpos];
+		float w2 = weight2[wpos];
+		dst[pos] = (img1[pos] * w1 + img2[pos] * w2) / (w1 + w2 + 1e-5f);
+	}
+}
+
+__kernel void BlendLinear_C4_D0(
+	__global uchar *dst,
+	__global uchar *img1,
+	__global uchar *img2,
+	__global float *weight1,
+	__global float *weight2,
+	int rows,
+	int cols,
+	int istep,
+	int wstep
+	)
+{
+	int idx = get_global_id(0);
+	int idy = get_global_id(1);
+	int x = idx / 4;
+	int y = idy;
+	if (x < cols && y < rows)
+	{
+		int pos = idy * istep + idx;
+		int wpos = idy * (wstep /sizeof(float)) + x;
+		float w1 = weight1[wpos];
+		float w2 = weight2[wpos];
+		dst[pos] = (img1[pos] * w1 + img2[pos] * w2) / (w1 + w2 + 1e-5f);
+	}
+}
+
+__kernel void BlendLinear_C1_D5(
+	__global float *dst,
+	__global float *img1,
+	__global float *img2,
+	__global float *weight1,
+	__global float *weight2,
+	int rows,
+	int cols,
+	int istep,
+	int wstep
+	)
+{
+	int idx = get_global_id(0);
+	int idy = get_global_id(1);
+	if (idx < cols && idy < rows)
+	{
+		int pos = idy * (istep / sizeof(float)) + idx;
+		int wpos = idy * (wstep /sizeof(float)) + idx;
+		float w1 = weight1[wpos];
+		float w2 = weight2[wpos];
+		dst[pos] = (img1[pos] * w1 + img2[pos] * w2) / (w1 + w2 + 1e-5f);
+	}
+}
+
+__kernel void BlendLinear_C3_D5(
+	__global float *dst,
+	__global float *img1,
+	__global float *img2,
+	__global float *weight1,
+	__global float *weight2,
+	int rows,
+	int cols,
+	int istep,
+	int wstep
+	)
+{
+	int idx = get_global_id(0);
+	int idy = get_global_id(1);
+	int x = idx / 3;
+	int y = idy;
+	if (x < cols && y < rows)
+	{
+		int pos = idy * (istep / sizeof(float)) + idx;
+		int wpos = idy * (wstep /sizeof(float)) + x;
+		float w1 = weight1[wpos];
+		float w2 = weight2[wpos];
+		dst[pos] = (img1[pos] * w1 + img2[pos] * w2) / (w1 + w2 + 1e-5f);
+	}
+}
+
+__kernel void BlendLinear_C4_D5(
+	__global float *dst,
+	__global float *img1,
+	__global float *img2,
+	__global float *weight1,
+	__global float *weight2,
+	int rows,
+	int cols,
+	int istep,
+	int wstep
+	)
+{
+	int idx = get_global_id(0);
+	int idy = get_global_id(1);
+	int x = idx / 4;
+	int y = idy;
+	if (x < cols && y < rows)
+	{
+		int pos = idy * (istep / sizeof(float)) + idx;
+		int wpos = idy * (wstep /sizeof(float)) + x;
+		float w1 = weight1[wpos];
+		float w2 = weight2[wpos];
+		dst[pos] = (img1[pos] * w1 + img2[pos] * w2) / (w1 + w2 + 1e-5f);
+	}
+}
--- a/modules/ocl/src/kernels/imgproc_columnsum.cl
+++ b/modules/ocl/src/kernels/imgproc_columnsum.cl
@@ -0,0 +1,80 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
+// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// @Authors
+//    Chunpeng Zhang chunpeng@multicorewareinc.com
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other oclMaterials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors as is and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#pragma OPENCL EXTENSION cl_amd_printf : enable
+#if defined (__ATI__)
+#pragma OPENCL EXTENSION cl_amd_fp64:enable
+
+#elif defined (__NVIDIA__)
+#pragma OPENCL EXTENSION cl_khr_fp64:enable
+#endif
+
+////////////////////////////////////////////////////////////////////
+///////////////////////// columnSum ////////////////////////////////
+////////////////////////////////////////////////////////////////////
+/// CV_32FC1
+__kernel void columnSum_C1_D5(__global float* src,__global float* dst,int srcCols,int srcRows,int srcStep,int dstStep)
+{
+	const int x = get_global_id(0);
+	
+	srcStep >>= 2;
+	dstStep >>= 2;
+
+	if (x < srcCols)
+    {
+		int srcIdx = x ;
+		int dstIdx = x ;
+
+        float sum = 0;
+		
+        for (int y = 0; y < srcRows; ++y)
+        {
+			sum += src[srcIdx];
+            dst[dstIdx] = sum;
+			srcIdx += srcStep;
+			dstIdx += dstStep;	
+        }
+	}
+}
--- a/modules/ocl/src/kernels/match_template.cl
+++ b/modules/ocl/src/kernels/match_template.cl
@@ -0,0 +1,824 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
+// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// @Authors
+//    Peng Xiao, pengxiao@multicorewareinc.com
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other oclMaterials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors as is and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#pragma OPENCL EXTENSION cl_amd_printf : enable
+
+#if defined (__ATI__)
+#pragma OPENCL EXTENSION cl_amd_fp64:enable
+
+#elif defined (__NVIDIA__)
+#pragma OPENCL EXTENSION cl_khr_fp64:enable
+#endif
+
+#if !defined(USE_SQR_INTEGRAL) && (defined (__ATI__) || defined (__NVIDIA__))
+#define TYPE_IMAGE_SQSUM double
+#else
+#define TYPE_IMAGE_SQSUM ulong
+#endif
+
+//////////////////////////////////////////////////
+// utilities
+#define SQSUMS_PTR(ox, oy) mad24(gidy + oy, img_sqsums_step, gidx + img_sqsums_offset + ox)
+#define SUMS_PTR(ox, oy) mad24(gidy + oy, img_sums_step, gidx + img_sums_offset + ox)
+// normAcc* are accurate normalization routines which make GPU matchTemplate
+// consistent with CPU one
+float normAcc(float num, float denum)
+{
+	if(fabs(num) < denum)
+	{
+		return num / denum;
+	}
+	if(fabs(num) < denum * 1.125f)
+	{
+		return num > 0 ? 1 : -1;
+	}
+	return 0;
+}
+
+float normAcc_SQDIFF(float num, float denum)
+{
+	if(fabs(num) < denum)
+	{
+		return num / denum;
+	}
+	if(fabs(num) < denum * 1.125f)
+	{
+		return num > 0 ? 1 : -1;
+	}
+	return 1;
+}
+//////////////////////////////////////////////////////////////////////
+// normalize
+
+__kernel 
+void normalizeKernel_C1_D0
+(
+	__global const TYPE_IMAGE_SQSUM * img_sqsums,
+	__global float * res,
+	ulong tpl_sqsum,
+	int res_rows,
+	int res_cols,
+	int tpl_rows,
+	int tpl_cols, 
+	int img_sqsums_offset,
+	int img_sqsums_step,
+	int res_offset,
+	int res_step
+)
+{
+	int gidx = get_global_id(0);
+	int gidy = get_global_id(1);
+
+	res_step   /= sizeof(*res);
+	res_offset /= sizeof(*res);
+	img_sqsums_step /= sizeof(*img_sqsums);
+	img_sqsums_offset /= sizeof(*img_sqsums);
+	int res_idx = mad24(gidy, res_step, res_offset + gidx);
+	if(gidx < res_cols && gidy < res_rows)
+	{
+		float image_sqsum_ = (float)(
+			(img_sqsums[SQSUMS_PTR(tpl_cols, tpl_rows)] - img_sqsums[SQSUMS_PTR(tpl_cols, 0)]) -
+			(img_sqsums[SQSUMS_PTR(0, tpl_rows)] - img_sqsums[SQSUMS_PTR(0, 0)]));
+		res[res_idx] = normAcc(res[res_idx], sqrt(image_sqsum_ * tpl_sqsum));
+	}
+}
+
+__kernel 
+void matchTemplate_Prepared_SQDIFF_C1_D0
+(
+	__global const TYPE_IMAGE_SQSUM * img_sqsums,
+	__global float * res,
+	ulong tpl_sqsum,
+	int res_rows,
+	int res_cols,
+	int tpl_rows,
+	int tpl_cols, 
+	int img_sqsums_offset,
+	int img_sqsums_step,
+	int res_offset,
+	int res_step
+)
+{
+	int gidx = get_global_id(0);
+	int gidy = get_global_id(1);
+
+	res_step   /= sizeof(*res);
+	res_offset /= sizeof(*res);
+	img_sqsums_step /= sizeof(*img_sqsums);
+	img_sqsums_offset /= sizeof(*img_sqsums);
+	int res_idx = mad24(gidy, res_step, res_offset + gidx);
+	if(gidx < res_cols && gidy < res_rows)
+	{
+		float image_sqsum_ = (float)(
+			(img_sqsums[SQSUMS_PTR(tpl_cols, tpl_rows)] - img_sqsums[SQSUMS_PTR(tpl_cols, 0)]) -
+			(img_sqsums[SQSUMS_PTR(0, tpl_rows)] - img_sqsums[SQSUMS_PTR(0, 0)]));
+		res[res_idx] = image_sqsum_ - 2.f * res[res_idx] + tpl_sqsum;
+	}
+}
+
+__kernel 
+void matchTemplate_Prepared_SQDIFF_NORMED_C1_D0
+(
+	__global const TYPE_IMAGE_SQSUM * img_sqsums,
+	__global float * res,
+	ulong tpl_sqsum,
+	int res_rows,
+	int res_cols,
+	int tpl_rows,
+	int tpl_cols, 
+	int img_sqsums_offset,
+	int img_sqsums_step,
+	int res_offset,
+	int res_step
+)
+{
+	int gidx = get_global_id(0);
+	int gidy = get_global_id(1);
+
+	res_step   /= sizeof(*res);
+	res_offset /= sizeof(*res);
+	img_sqsums_step /= sizeof(*img_sqsums);
+	img_sqsums_offset /= sizeof(*img_sqsums);
+	int res_idx = mad24(gidy, res_step, res_offset + gidx);
+	if(gidx < res_cols && gidy < res_rows)
+	{
+		float image_sqsum_ = (float)(
+			(img_sqsums[SQSUMS_PTR(tpl_cols, tpl_rows)] - img_sqsums[SQSUMS_PTR(tpl_cols, 0)]) -
+			(img_sqsums[SQSUMS_PTR(0, tpl_rows)] - img_sqsums[SQSUMS_PTR(0, 0)]));
+		res[res_idx] = normAcc_SQDIFF(image_sqsum_ - 2.f * res[res_idx] + tpl_sqsum,
+										sqrt(image_sqsum_ * tpl_sqsum));
+	}
+}
+
+//////////////////////////////////////////////////
+// SQDIFF
+__kernel 
+void matchTemplate_Naive_SQDIFF_C1_D0
+(
+	__global const uchar * img,
+	__global const uchar * tpl,
+	__global float * res,
+	int img_rows,
+	int img_cols,
+	int tpl_rows,
+	int tpl_cols, 
+	int res_rows,
+	int res_cols,
+	int img_offset,
+	int tpl_offset,
+	int res_offset,
+	int img_step,
+	int tpl_step,
+	int res_step
+)
+{
+	int gidx = get_global_id(0);
+	int gidy = get_global_id(1);
+	int i,j;
+	int delta;
+	int sum = 0;
+	res_step   /= sizeof(*res);
+	res_offset /= sizeof(*res);
+	int res_idx = mad24(gidy, res_step, res_offset + gidx);
+
+	if(gidx < res_cols && gidy < res_rows)
+	{
+		for(i = 0; i < tpl_rows; i ++)
+		{
+			// get specific rows of img data
+			__global const uchar * img_ptr = img + mad24(gidy + i, img_step, gidx + img_offset);
+			__global const uchar * tpl_ptr = tpl + mad24(i, tpl_step, tpl_offset);
+			for(j = 0; j < tpl_cols; j ++)
+			{
+				delta = img_ptr[j] - tpl_ptr[j];
+				sum   = mad24(delta, delta, sum);
+			}
+		}
+		res[res_idx] = sum;
+	}
+}
+
+__kernel 
+void matchTemplate_Naive_SQDIFF_C1_D5
+(
+	__global const float * img,
+	__global const float * tpl,
+	__global float * res,
+	int img_rows,
+	int img_cols,
+	int tpl_rows,
+	int tpl_cols, 
+	int res_rows,
+	int res_cols,
+	int img_offset,
+	int tpl_offset,
+	int res_offset,
+	int img_step,
+	int tpl_step,
+	int res_step
+)
+{
+	int gidx = get_global_id(0);
+	int gidy = get_global_id(1);
+	int i,j;
+	float delta;
+	float sum = 0;
+	img_step   /= sizeof(*img);
+	img_offset /= sizeof(*img);
+	tpl_step   /= sizeof(*tpl);
+	tpl_offset /= sizeof(*tpl);
+	res_step   /= sizeof(*res);
+	res_offset /= sizeof(*res);
+
+	int res_idx = mad24(gidy, res_step, res_offset + gidx);
+
+	if(gidx < res_cols && gidy < res_rows)
+	{
+		for(i = 0; i < tpl_rows; i ++)
+		{
+			// get specific rows of img data
+			__global const float * img_ptr = img + mad24(gidy + i, img_step, gidx + img_offset);
+			__global const float * tpl_ptr = tpl + mad24(i, tpl_step, tpl_offset);
+			for(j = 0; j < tpl_cols; j ++)
+			{
+				delta = img_ptr[j] - tpl_ptr[j];
+				sum   = mad(delta, delta, sum);
+			}
+		}
+		res[res_idx] = sum;
+	}
+}
+
+__kernel 
+void matchTemplate_Naive_SQDIFF_C4_D0
+(
+	__global const uchar4 * img,
+	__global const uchar4 * tpl,
+	__global float * res,
+	int img_rows,
+	int img_cols,
+	int tpl_rows,
+	int tpl_cols, 
+	int res_rows,
+	int res_cols,
+	int img_offset,
+	int tpl_offset,
+	int res_offset,
+	int img_step,
+	int tpl_step,
+	int res_step
+)
+{
+	int gidx = get_global_id(0);
+	int gidy = get_global_id(1);
+	int i,j;
+	int4 delta;
+	int4 sum = (int4)(0, 0, 0, 0);
+	img_step   /= sizeof(*img);
+	img_offset /= sizeof(*img);
+	tpl_step   /= sizeof(*tpl);
+	tpl_offset /= sizeof(*tpl);
+	res_step   /= sizeof(*res);
+	res_offset /= sizeof(*res);
+
+	int res_idx = mad24(gidy, res_step, res_offset + gidx);
+
+	if(gidx < res_cols && gidy < res_rows)
+	{
+		for(i = 0; i < tpl_rows; i ++)
+		{
+			// get specific rows of img data
+			__global const uchar4 * img_ptr = img + mad24(gidy + i, img_step, gidx + img_offset);
+			__global const uchar4 * tpl_ptr = tpl + mad24(i, tpl_step, tpl_offset);
+			for(j = 0; j < tpl_cols; j ++)
+			{
+				//delta = convert_int4(img_ptr[j] - tpl_ptr[j]); // this alternative is incorrect
+				delta.x = img_ptr[j].x - tpl_ptr[j].x;
+				delta.y = img_ptr[j].y - tpl_ptr[j].y;
+				delta.z = img_ptr[j].z - tpl_ptr[j].z;
+				delta.w = img_ptr[j].w - tpl_ptr[j].w;
+				sum   = mad24(delta, delta, sum);
+			}
+		}
+		res[res_idx] = sum.x + sum.y + sum.z + sum.w;
+	}
+}
+
+__kernel 
+void matchTemplate_Naive_SQDIFF_C4_D5
+(
+	__global const float4 * img,
+	__global const float4 * tpl,
+	__global float * res,
+	int img_rows,
+	int img_cols,
+	int tpl_rows,
+	int tpl_cols, 
+	int res_rows,
+	int res_cols,
+	int img_offset,
+	int tpl_offset,
+	int res_offset,
+	int img_step,
+	int tpl_step,
+	int res_step
+)
+{
+	int gidx = get_global_id(0);
+	int gidy = get_global_id(1);
+	int i,j;
+	float4 delta;
+	float4 sum = (float4)(0, 0, 0, 0);
+	img_step   /= sizeof(*img);
+	img_offset /= sizeof(*img);
+	tpl_step   /= sizeof(*tpl);
+	tpl_offset /= sizeof(*tpl);
+	res_step   /= sizeof(*res);
+	res_offset /= sizeof(*res);
+
+	int res_idx = mad24(gidy, res_step, res_offset + gidx);
+
+	if(gidx < res_cols && gidy < res_rows)
+	{
+		for(i = 0; i < tpl_rows; i ++)
+		{
+			// get specific rows of img data
+			__global const float4 * img_ptr = img + mad24(gidy + i, img_step, gidx + img_offset);
+			__global const float4 * tpl_ptr = tpl + mad24(i, tpl_step, tpl_offset);
+			for(j = 0; j < tpl_cols; j ++)
+			{
+				//delta = convert_int4(img_ptr[j] - tpl_ptr[j]); // this alternative is incorrect
+				delta.x = img_ptr[j].x - tpl_ptr[j].x;
+				delta.y = img_ptr[j].y - tpl_ptr[j].y;
+				delta.z = img_ptr[j].z - tpl_ptr[j].z;
+				delta.w = img_ptr[j].w - tpl_ptr[j].w;
+				sum   = mad(delta, delta, sum);
+			}
+		}
+		res[res_idx] = sum.x + sum.y + sum.z + sum.w;
+	}
+}
+
+//////////////////////////////////////////////////
+// CCORR
+__kernel 
+void matchTemplate_Naive_CCORR_C1_D0
+(
+	__global const uchar * img,
+	__global const uchar * tpl,
+	__global float * res,
+	int img_rows,
+	int img_cols,
+	int tpl_rows,
+	int tpl_cols, 
+	int res_rows,
+	int res_cols,
+	int img_offset,
+	int tpl_offset,
+	int res_offset,
+	int img_step,
+	int tpl_step,
+	int res_step
+)
+{
+	int gidx = get_global_id(0);
+	int gidy = get_global_id(1);
+	int i,j;
+	int sum = 0;
+	res_step   /= sizeof(*res);
+	res_offset /= sizeof(*res);
+
+	int res_idx = mad24(gidy, res_step, res_offset + gidx);
+
+	if(gidx < res_cols && gidy < res_rows)
+	{
+		for(i = 0; i < tpl_rows; i ++)
+		{
+			// get specific rows of img data
+			__global const uchar * img_ptr = img + mad24(gidy + i, img_step, gidx + img_offset);
+			__global const uchar * tpl_ptr = tpl + mad24(i, tpl_step, tpl_offset);
+			for(j = 0; j < tpl_cols; j ++)
+			{
+				sum = mad24(img_ptr[j], tpl_ptr[j], sum);
+			}
+		}
+		res[res_idx] = sum;
+	}
+}
+
+__kernel 
+void matchTemplate_Naive_CCORR_C1_D5
+(
+	__global const float * img,
+	__global const float * tpl,
+	__global float * res,
+	int img_rows,
+	int img_cols,
+	int tpl_rows,
+	int tpl_cols, 
+	int res_rows,
+	int res_cols,
+	int img_offset,
+	int tpl_offset,
+	int res_offset,
+	int img_step,
+	int tpl_step,
+	int res_step
+)
+{
+	int gidx = get_global_id(0);
+	int gidy = get_global_id(1);
+	int i,j;
+	float sum = 0;
+	img_step   /= sizeof(*img);
+	img_offset /= sizeof(*img);
+	tpl_step   /= sizeof(*tpl);
+	tpl_offset /= sizeof(*tpl);
+	res_step   /= sizeof(*res);
+	res_offset /= sizeof(*res);
+
+	int res_idx = mad24(gidy, res_step, res_offset + gidx);
+
+	if(gidx < res_cols && gidy < res_rows)
+	{
+		for(i = 0; i < tpl_rows; i ++)
+		{
+			// get specific rows of img data
+			__global const float * img_ptr = img + mad24(gidy + i, img_step, gidx + img_offset);
+			__global const float * tpl_ptr = tpl + mad24(i, tpl_step, tpl_offset);
+			for(j = 0; j < tpl_cols; j ++)
+			{
+				sum = mad(img_ptr[j], tpl_ptr[j], sum);
+			}
+		}
+		res[res_idx] = sum;
+	}
+}
+
+__kernel 
+void matchTemplate_Naive_CCORR_C4_D0
+(
+	__global const uchar4 * img,
+	__global const uchar4 * tpl,
+	__global float * res,
+	int img_rows,
+	int img_cols,
+	int tpl_rows,
+	int tpl_cols, 
+	int res_rows,
+	int res_cols,
+	int img_offset,
+	int tpl_offset,
+	int res_offset,
+	int img_step,
+	int tpl_step,
+	int res_step
+)
+{
+	int gidx = get_global_id(0);
+	int gidy = get_global_id(1);
+	int i,j;
+	int4 sum = (int4)(0, 0, 0, 0);
+	img_step   /= sizeof(*img);
+	img_offset /= sizeof(*img);
+	tpl_step   /= sizeof(*tpl);
+	tpl_offset /= sizeof(*tpl);
+	res_step   /= sizeof(*res);
+	res_offset /= sizeof(*res);
+
+	int res_idx = mad24(gidy, res_step, res_offset + gidx);
+
+	if(gidx < res_cols && gidy < res_rows)
+	{
+		for(i = 0; i < tpl_rows; i ++)
+		{
+			// get specific rows of img data
+			__global const uchar4 * img_ptr = img + mad24(gidy + i, img_step, gidx + img_offset);
+			__global const uchar4 * tpl_ptr = tpl + mad24(i, tpl_step, tpl_offset);
+			for(j = 0; j < tpl_cols; j ++)
+			{
+				sum   = mad24(convert_int4(img_ptr[j]), convert_int4(tpl_ptr[j]), sum);
+			}
+		}
+		res[res_idx] = sum.x + sum.y + sum.z + sum.w;
+	}
+}
+
+__kernel 
+void matchTemplate_Naive_CCORR_C4_D5
+(
+	__global const float4 * img,
+	__global const float4 * tpl,
+	__global float * res,
+	int img_rows,
+	int img_cols,
+	int tpl_rows,
+	int tpl_cols, 
+	int res_rows,
+	int res_cols,
+	int img_offset,
+	int tpl_offset,
+	int res_offset,
+	int img_step,
+	int tpl_step,
+	int res_step
+)
+{
+	int gidx = get_global_id(0);
+	int gidy = get_global_id(1);
+	int i,j;
+	float4 sum = (float4)(0, 0, 0, 0);
+	img_step   /= sizeof(*img);
+	img_offset /= sizeof(*img);
+	tpl_step   /= sizeof(*tpl);
+	tpl_offset /= sizeof(*tpl);
+	res_step   /= sizeof(*res);
+	res_offset /= sizeof(*res);
+
+	int res_idx = mad24(gidy, res_step, res_offset + gidx);
+
+	if(gidx < res_cols && gidy < res_rows)
+	{
+		for(i = 0; i < tpl_rows; i ++)
+		{
+			// get specific rows of img data
+			__global const float4 * img_ptr = img + mad24(gidy + i, img_step, gidx + img_offset);
+			__global const float4 * tpl_ptr = tpl + mad24(i, tpl_step, tpl_offset);
+			for(j = 0; j < tpl_cols; j ++)
+			{
+				sum = mad(convert_float4(img_ptr[j]), convert_float4(tpl_ptr[j]), sum);
+			}
+		}
+		res[res_idx] = sum.x + sum.y + sum.z + sum.w;
+	}
+}
+
+//////////////////////////////////////////////////
+// CCOFF
+__kernel 
+void matchTemplate_Prepared_CCOFF_C1_D0
+(
+	__global float * res,
+	int img_rows,
+	int img_cols,
+	int tpl_rows,
+	int tpl_cols, 
+	int res_rows,
+	int res_cols,
+	int res_offset,
+	int res_step,
+	__global const uint * img_sums,
+	int img_sums_offset,
+	int img_sums_step,
+	float tpl_sum
+)
+{
+	int gidx = get_global_id(0);
+	int gidy = get_global_id(1);
+	
+	img_sums_offset   /= sizeof(*img_sums);
+	img_sums_step     /= sizeof(*img_sums);
+	res_step   /= sizeof(*res);
+	res_offset /= sizeof(*res);
+	
+	int res_idx = mad24(gidy, res_step, res_offset + gidx);
+
+	if(gidx < res_cols && gidy < res_rows)
+	{
+		float sum = (float)(
+			(img_sums[SUMS_PTR(tpl_cols, tpl_rows)] - img_sums[SUMS_PTR(tpl_cols, 0)])
+		  - (img_sums[SUMS_PTR(0, tpl_rows)] - img_sums[SUMS_PTR(0, 0)]));
+		res[res_idx] -= sum * tpl_sum;
+	}
+}
+__kernel 
+void matchTemplate_Prepared_CCOFF_C4_D0
+(
+	__global float * res,
+	int img_rows,
+	int img_cols,
+	int tpl_rows,
+	int tpl_cols, 
+	int res_rows,
+	int res_cols,
+	int res_offset,
+	int res_step,
+	__global const uint * img_sums_c0,
+	__global const uint * img_sums_c1,
+	__global const uint * img_sums_c2,
+	__global const uint * img_sums_c3,
+	int img_sums_offset,
+	int img_sums_step,
+	float tpl_sum_c0,
+	float tpl_sum_c1,
+	float tpl_sum_c2,
+	float tpl_sum_c3
+)
+{
+	int gidx = get_global_id(0);
+	int gidy = get_global_id(1);
+	
+	img_sums_offset   /= sizeof(*img_sums_c0);
+	img_sums_step     /= sizeof(*img_sums_c0);
+	res_step   /= sizeof(*res);
+	res_offset /= sizeof(*res);
+	
+	int res_idx = mad24(gidy, res_step, res_offset + gidx);
+
+	if(gidx < res_cols && gidy < res_rows)
+	{
+		float ccorr = res[res_idx];
+		ccorr -= tpl_sum_c0*(float)(
+			(img_sums_c0[SUMS_PTR(tpl_cols, tpl_rows)] - img_sums_c0[SUMS_PTR(tpl_cols, 0)])
+		  - (img_sums_c0[SUMS_PTR(0, tpl_rows)] - img_sums_c0[SUMS_PTR(0, 0)]));
+		ccorr -= tpl_sum_c1*(float)(
+			(img_sums_c1[SUMS_PTR(tpl_cols, tpl_rows)] - img_sums_c1[SUMS_PTR(tpl_cols, 0)])
+		  - (img_sums_c1[SUMS_PTR(0, tpl_rows)] - img_sums_c1[SUMS_PTR(0, 0)]));
+		ccorr -= tpl_sum_c2*(float)(
+			(img_sums_c2[SUMS_PTR(tpl_cols, tpl_rows)] - img_sums_c2[SUMS_PTR(tpl_cols, 0)])
+		  - (img_sums_c2[SUMS_PTR(0, tpl_rows)] - img_sums_c2[SUMS_PTR(0, 0)]));
+		ccorr -= tpl_sum_c3*(float)(
+			(img_sums_c3[SUMS_PTR(tpl_cols, tpl_rows)] - img_sums_c3[SUMS_PTR(tpl_cols, 0)])
+		  - (img_sums_c3[SUMS_PTR(0, tpl_rows)] - img_sums_c3[SUMS_PTR(0, 0)]));
+		res[res_idx] = ccorr;
+	}
+}
+
+__kernel
+void matchTemplate_Prepared_CCOFF_NORMED_C1_D0
+(
+	__global float * res,
+	int img_rows,
+	int img_cols,
+	int tpl_rows,
+	int tpl_cols, 
+	int res_rows,
+	int res_cols,
+	int res_offset,
+	int res_step,
+	float weight,
+	__global const uint * img_sums,
+	int img_sums_offset,
+	int img_sums_step,
+	__global const TYPE_IMAGE_SQSUM * img_sqsums,
+	int img_sqsums_offset,
+	int img_sqsums_step,
+	float tpl_sum,
+	float tpl_sqsum
+)
+{
+	int gidx = get_global_id(0);
+	int gidy = get_global_id(1);
+	
+	img_sqsums_step   /= sizeof(*img_sqsums);
+	img_sqsums_offset /= sizeof(*img_sqsums);
+	img_sums_offset   /= sizeof(*img_sums);
+	img_sums_step     /= sizeof(*img_sums);
+	res_step   /= sizeof(*res);
+	res_offset /= sizeof(*res);
+
+	
+	int res_idx = mad24(gidy, res_step, res_offset + gidx);
+
+	if(gidx < res_cols && gidy < res_rows)
+	{
+		float image_sum_ =  (float)(
+			(img_sums[SUMS_PTR(tpl_cols, tpl_rows)] - img_sums[SUMS_PTR(tpl_cols, 0)])
+		  - (img_sums[SUMS_PTR(0, tpl_rows)] - img_sums[SUMS_PTR(0, 0)]));
+
+		float image_sqsum_ = (float)(
+			(img_sqsums[SQSUMS_PTR(tpl_cols, tpl_rows)] - img_sqsums[SQSUMS_PTR(tpl_cols, 0)]) -
+			(img_sqsums[SQSUMS_PTR(0, tpl_rows)] - img_sqsums[SQSUMS_PTR(0, 0)]));
+		res[res_idx] = normAcc(res[res_idx] - image_sum_ * tpl_sum,
+							   sqrt(tpl_sqsum * (image_sqsum_ - weight * image_sum_ * image_sum_)));
+	}
+}
+__kernel
+void matchTemplate_Prepared_CCOFF_NORMED_C4_D0
+(
+	__global float * res,
+	int img_rows,
+	int img_cols,
+	int tpl_rows,
+	int tpl_cols, 
+	int res_rows,
+	int res_cols,
+	int res_offset,
+	int res_step,
+	float weight,
+	__global const uint * img_sums_c0,
+	__global const uint * img_sums_c1,
+	__global const uint * img_sums_c2,
+	__global const uint * img_sums_c3,
+	int img_sums_offset,
+	int img_sums_step,
+	__global const TYPE_IMAGE_SQSUM * img_sqsums_c0,
+	__global const TYPE_IMAGE_SQSUM * img_sqsums_c1,
+	__global const TYPE_IMAGE_SQSUM * img_sqsums_c2,
+	__global const TYPE_IMAGE_SQSUM * img_sqsums_c3,
+	int img_sqsums_offset,
+	int img_sqsums_step,
+	float tpl_sum_c0,
+	float tpl_sum_c1,
+	float tpl_sum_c2,
+	float tpl_sum_c3,
+	float tpl_sqsum
+)
+{
+	int gidx = get_global_id(0);
+	int gidy = get_global_id(1);
+	
+	img_sqsums_step   /= sizeof(*img_sqsums_c0);
+	img_sqsums_offset /= sizeof(*img_sqsums_c0);
+	img_sums_offset   /= sizeof(*img_sums_c0);
+	img_sums_step     /= sizeof(*img_sums_c0);
+	res_step   /= sizeof(*res);
+	res_offset /= sizeof(*res);
+
+	int res_idx = mad24(gidy, res_step, res_offset + gidx);
+
+	if(gidx < res_cols && gidy < res_rows)
+	{
+		float image_sum_c0 =  (float)(
+			(img_sums_c0[SUMS_PTR(tpl_cols, tpl_rows)] - img_sums_c0[SUMS_PTR(tpl_cols, 0)])
+		  - (img_sums_c0[SUMS_PTR(0, tpl_rows)] - img_sums_c0[SUMS_PTR(0, 0)]));
+		float image_sum_c1 =  (float)(
+			(img_sums_c1[SUMS_PTR(tpl_cols, tpl_rows)] - img_sums_c1[SUMS_PTR(tpl_cols, 0)])
+		  - (img_sums_c1[SUMS_PTR(0, tpl_rows)] - img_sums_c1[SUMS_PTR(0, 0)]));
+		float image_sum_c2 =  (float)(
+			(img_sums_c2[SUMS_PTR(tpl_cols, tpl_rows)] - img_sums_c2[SUMS_PTR(tpl_cols, 0)])
+		  - (img_sums_c2[SUMS_PTR(0, tpl_rows)] - img_sums_c2[SUMS_PTR(0, 0)]));
+		float image_sum_c3 =  (float)(
+			(img_sums_c3[SUMS_PTR(tpl_cols, tpl_rows)] - img_sums_c3[SUMS_PTR(tpl_cols, 0)])
+		  - (img_sums_c3[SUMS_PTR(0, tpl_rows)] - img_sums_c3[SUMS_PTR(0, 0)]));
+
+		float image_sqsum_c0 = (float)(
+			(img_sqsums_c0[SQSUMS_PTR(tpl_cols, tpl_rows)] - img_sqsums_c0[SQSUMS_PTR(tpl_cols, 0)]) -
+			(img_sqsums_c0[SQSUMS_PTR(0, tpl_rows)] - img_sqsums_c0[SQSUMS_PTR(0, 0)]));
+		float image_sqsum_c1 = (float)(
+			(img_sqsums_c1[SQSUMS_PTR(tpl_cols, tpl_rows)] - img_sqsums_c1[SQSUMS_PTR(tpl_cols, 0)]) -
+			(img_sqsums_c1[SQSUMS_PTR(0, tpl_rows)] - img_sqsums_c1[SQSUMS_PTR(0, 0)]));
+		float image_sqsum_c2 = (float)(
+			(img_sqsums_c2[SQSUMS_PTR(tpl_cols, tpl_rows)] - img_sqsums_c2[SQSUMS_PTR(tpl_cols, 0)]) -
+			(img_sqsums_c2[SQSUMS_PTR(0, tpl_rows)] - img_sqsums_c2[SQSUMS_PTR(0, 0)]));
+		float image_sqsum_c3 = (float)(
+			(img_sqsums_c3[SQSUMS_PTR(tpl_cols, tpl_rows)] - img_sqsums_c3[SQSUMS_PTR(tpl_cols, 0)]) -
+			(img_sqsums_c3[SQSUMS_PTR(0, tpl_rows)] - img_sqsums_c3[SQSUMS_PTR(0, 0)]));
+
+		float num = res[res_idx] - 
+			image_sum_c0 * tpl_sum_c0 -
+			image_sum_c1 * tpl_sum_c1 -
+			image_sum_c2 * tpl_sum_c2 -
+			image_sum_c3 * tpl_sum_c3;
+		float denum = sqrt( tpl_sqsum * (
+			image_sqsum_c0 - weight * image_sum_c0 * image_sum_c0 +
+			image_sqsum_c1 - weight * image_sum_c1 * image_sum_c1 +
+			image_sqsum_c2 - weight * image_sum_c2 * image_sum_c2 +
+			image_sqsum_c3 - weight * image_sum_c0 * image_sum_c3)
+			);
+		res[res_idx] = normAcc(num, denum);
+	}
+}
+
--- a/modules/ocl/src/kernels/pyr_down.cl
+++ b/modules/ocl/src/kernels/pyr_down.cl
@@ -0,0 +1,500 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
+// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// @Authors
+//    Dachuan Zhao, dachuan@multicorewareinc.com
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other oclMaterials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors as is and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#pragma OPENCL EXTENSION cl_amd_printf : enable
+
+
+uchar round_uchar_uchar(uchar v)
+{ 
+	return v;
+}
+
+uchar round_uchar_int(int v)
+{ 
+    return (uchar)((uint)v <= 255 ? v : v > 0 ? 255 : 0); 
+}
+
+uchar round_uchar_float(float v)
+{ 
+	if(v - convert_int_sat_rte(v) > 1e-6 || v - convert_int_sat_rte(v) < -1e-6)
+	{
+		if(((int)v + 1) - (v + 0.5f) < 1e-6 && ((int)v + 1) - (v + 0.5f) > -1e-6)
+		{
+			v = (int)v + 0.51f;
+		}
+	}
+    int iv = convert_int_sat_rte(v);
+    return round_uchar_int(iv); 
+}
+
+uchar4 round_uchar4_uchar4(uchar4 v)
+{ 
+	return v;
+}
+
+uchar4 round_uchar4_int4(int4 v)
+{ 
+	uchar4 result;
+	result.x = (uchar)(v.x <= 255 ? v.x : v.x > 0 ? 255 : 0); 
+	result.y = (uchar)(v.y <= 255 ? v.y : v.y > 0 ? 255 : 0); 
+	result.z = (uchar)(v.z <= 255 ? v.z : v.z > 0 ? 255 : 0); 
+	result.w = (uchar)(v.w <= 255 ? v.w : v.w > 0 ? 255 : 0); 
+    return result; 
+}
+
+uchar4 round_uchar4_float4(float4 v)
+{ 
+	if(v.x - convert_int_sat_rte(v.x) > 1e-6 || v.x - convert_int_sat_rte(v.x) < -1e-6)
+	{
+		if(((int)(v.x) + 1) - (v.x + 0.5f) < 1e-6 && ((int)(v.x) + 1) - (v.x + 0.5f) > -1e-6)
+		{
+			v.x = (int)(v.x) + 0.51f;
+		}
+	}
+	if(v.y - convert_int_sat_rte(v.y) > 1e-6 || v.y - convert_int_sat_rte(v.y) < -1e-6)
+	{
+		if(((int)(v.y) + 1) - (v.y + 0.5f) < 1e-6 && ((int)(v.y) + 1) - (v.y + 0.5f) > -1e-6)
+		{
+			v.y = (int)(v.y) + 0.51f;
+		}
+	}
+	if(v.z - convert_int_sat_rte(v.z) > 1e-6 || v.z - convert_int_sat_rte(v.z) < -1e-6)
+	{
+		if(((int)(v.z) + 1) - (v.z + 0.5f) < 1e-6 && ((int)(v.z) + 1) - (v.z + 0.5f) > -1e-6)
+		{
+			v.z = (int)(v.z) + 0.51f;
+		}
+	}
+	if(v.w - convert_int_sat_rte(v.w) > 1e-6 || v.w - convert_int_sat_rte(v.w) < -1e-6)
+	{
+		if(((int)(v.w) + 1) - (v.w + 0.5f) < 1e-6 && ((int)(v.w) + 1) - (v.w + 0.5f) > -1e-6)
+		{
+			v.w = (int)(v.w) + 0.51f;
+		}
+	}
+    int4 iv = convert_int4_sat_rte(v);
+    return round_uchar4_int4(iv); 
+}
+
+
+
+
+int idx_row_low(int y, int last_row)
+{
+	if(y < 0)
+	{
+		y = -y;
+	}
+    return y % (last_row + 1);
+}
+
+int idx_row_high(int y, int last_row) 
+{
+	int i;
+	int j;
+	if(last_row - y < 0)
+	{
+		i = (y - last_row);
+	}
+	else
+	{
+		i = (last_row - y);
+	}
+	if(last_row - i < 0)
+	{
+		j = i - last_row;
+	}
+	else
+	{
+		j = last_row - i;
+	}
+    return j % (last_row + 1);
+}
+
+int idx_row(int y, int last_row)
+{
+    return idx_row_low(idx_row_high(y, last_row), last_row);
+}
+
+int idx_col_low(int x, int last_col)
+{
+	if(x < 0)
+	{
+		x = -x;
+	}
+    return x % (last_col + 1);
+}
+
+int idx_col_high(int x, int last_col) 
+{
+	int i;
+	int j;
+	if(last_col - x < 0)
+	{
+		i = (x - last_col);
+	}
+	else
+	{
+		i = (last_col - x);
+	}
+	if(last_col - i < 0)
+	{
+		j = i - last_col;
+	}
+	else
+	{
+		j = last_col - i;
+	}
+    return j % (last_col + 1);
+}
+
+int idx_col(int x, int last_col)
+{
+    return idx_col_low(idx_col_high(x, last_col), last_col);
+}
+
+__kernel void pyrDown_C1_D0(__global uchar * srcData, int srcStep, int srcOffset, int srcRows, int srcCols, __global uchar *dst, int dstStep, int dstOffset, int dstCols)
+{
+    const int x = get_group_id(0) * get_local_size(0) + get_local_id(0);
+    const int y = get_group_id(1);
+
+    __local float smem[256 + 4];
+
+    float sum;
+
+    const int src_y = 2*y;
+    const int last_row = srcRows - 1;
+    const int last_col = srcCols - 1;
+
+    sum = 0;
+
+    sum = sum + 0.0625f * round_uchar_uchar(((__global uchar*)((__global char*)srcData + idx_row(src_y - 2, last_row) * srcStep))[idx_col(x, last_col)]);
+    sum = sum + 0.25f   * round_uchar_uchar(((__global uchar*)((__global char*)srcData + idx_row(src_y - 1, last_row) * srcStep))[idx_col(x, last_col)]);
+    sum = sum + 0.375f  * round_uchar_uchar(((__global uchar*)((__global char*)srcData + idx_row(src_y    , last_row) * srcStep))[idx_col(x, last_col)]);
+    sum = sum + 0.25f   * round_uchar_uchar(((__global uchar*)((__global char*)srcData + idx_row(src_y + 1, last_row) * srcStep))[idx_col(x, last_col)]);
+    sum = sum + 0.0625f * round_uchar_uchar(((__global uchar*)((__global char*)srcData + idx_row(src_y + 2, last_row) * srcStep))[idx_col(x, last_col)]);
+
+    smem[2 + get_local_id(0)] = sum;
+
+    if (get_local_id(0) < 2)
+    {
+        const int left_x = x - 2;
+
+        sum = 0;
+
+        sum = sum + 0.0625f * round_uchar_uchar(((__global uchar*)((__global char*)srcData + idx_row(src_y - 2, last_row) * srcStep))[idx_col(left_x, last_col)]);
+		sum = sum + 0.25f   * round_uchar_uchar(((__global uchar*)((__global char*)srcData + idx_row(src_y - 1, last_row) * srcStep))[idx_col(left_x, last_col)]);
+		sum = sum + 0.375f  * round_uchar_uchar(((__global uchar*)((__global char*)srcData + idx_row(src_y    , last_row) * srcStep))[idx_col(left_x, last_col)]);
+		sum = sum + 0.25f   * round_uchar_uchar(((__global uchar*)((__global char*)srcData + idx_row(src_y + 1, last_row) * srcStep))[idx_col(left_x, last_col)]);
+		sum = sum + 0.0625f * round_uchar_uchar(((__global uchar*)((__global char*)srcData + idx_row(src_y + 2, last_row) * srcStep))[idx_col(left_x, last_col)]);
+
+        smem[get_local_id(0)] = sum;
+    }
+
+    if (get_local_id(0) > 253)
+    {
+        const int right_x = x + 2;
+
+        sum = 0;
+
+        sum = sum + 0.0625f * round_uchar_uchar(((__global uchar*)((__global char*)srcData + idx_row(src_y - 2, last_row) * srcStep))[idx_col(right_x, last_col)]);
+		sum = sum + 0.25f   * round_uchar_uchar(((__global uchar*)((__global char*)srcData + idx_row(src_y - 1, last_row) * srcStep))[idx_col(right_x, last_col)]);
+		sum = sum + 0.375f  * round_uchar_uchar(((__global uchar*)((__global char*)srcData + idx_row(src_y    , last_row) * srcStep))[idx_col(right_x, last_col)]);
+		sum = sum + 0.25f   * round_uchar_uchar(((__global uchar*)((__global char*)srcData + idx_row(src_y + 1, last_row) * srcStep))[idx_col(right_x, last_col)]);
+		sum = sum + 0.0625f * round_uchar_uchar(((__global uchar*)((__global char*)srcData + idx_row(src_y + 2, last_row) * srcStep))[idx_col(right_x, last_col)]);
+
+        smem[4 + get_local_id(0)] = sum;
+    }
+
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    if (get_local_id(0) < 128)
+    {
+        const int tid2 = get_local_id(0) * 2;
+
+        sum = 0;
+
+        sum = sum + 0.0625f * smem[2 + tid2 - 2];
+        sum = sum + 0.25f   * smem[2 + tid2 - 1];
+        sum = sum + 0.375f  * smem[2 + tid2    ];
+        sum = sum + 0.25f   * smem[2 + tid2 + 1];
+        sum = sum + 0.0625f * smem[2 + tid2 + 2];
+
+        const int dst_x = (get_group_id(0) * get_local_size(0) + tid2) / 2;
+
+        if (dst_x < dstCols)
+            dst[y * dstStep + dst_x] = round_uchar_float(sum);
+    }
+}
+
+__kernel void pyrDown_C4_D0(__global uchar4 * srcData, int srcStep, int srcOffset, int srcRows, int srcCols, __global uchar4 *dst, int dstStep, int dstOffset, int dstCols)
+{
+    const int x = get_group_id(0) * get_local_size(0) + get_local_id(0);
+    const int y = get_group_id(1);
+
+    __local float4 smem[256 + 4];
+
+    float4 sum;
+
+    const int src_y = 2*y;
+    const int last_row = srcRows - 1;
+    const int last_col = srcCols - 1;
+
+	float4 co1 = (float4)(0.375f, 0.375f, 0.375f, 0.375f);
+	float4 co2 = (float4)(0.25f, 0.25f, 0.25f, 0.25f);
+	float4 co3 = (float4)(0.0625f, 0.0625f, 0.0625f, 0.0625f);
+
+    sum = 0;
+
+	sum = sum + co3 * convert_float4(round_uchar4_uchar4(((__global uchar4*)((__global char4*)srcData + idx_row(src_y - 2, last_row) * srcStep / 4))[idx_col(x, last_col)]));
+	sum = sum + co2   * convert_float4(round_uchar4_uchar4(((__global uchar4*)((__global char4*)srcData + idx_row(src_y - 1, last_row) * srcStep / 4))[idx_col(x, last_col)]));
+	sum = sum + co1  * convert_float4(round_uchar4_uchar4(((__global uchar4*)((__global char4*)srcData + idx_row(src_y    , last_row) * srcStep / 4))[idx_col(x, last_col)]));
+	sum = sum + co2   * convert_float4(round_uchar4_uchar4(((__global uchar4*)((__global char4*)srcData + idx_row(src_y + 1, last_row) * srcStep / 4))[idx_col(x, last_col)]));
+	sum = sum + co3 * convert_float4(round_uchar4_uchar4(((__global uchar4*)((__global char4*)srcData + idx_row(src_y + 2, last_row) * srcStep / 4))[idx_col(x, last_col)]));
+
+	smem[2 + get_local_id(0)] = sum;
+
+	if (get_local_id(0) < 2)
+	{
+		const int left_x = x - 2;
+
+		sum = 0;
+
+		sum = sum + co3 * convert_float4(round_uchar4_uchar4(((__global uchar4*)((__global char4*)srcData + idx_row(src_y - 2, last_row) * srcStep / 4))[idx_col(left_x, last_col)]));
+		sum = sum + co2   * convert_float4(round_uchar4_uchar4(((__global uchar4*)((__global char4*)srcData + idx_row(src_y - 1, last_row) * srcStep / 4))[idx_col(left_x, last_col)]));
+		sum = sum + co1  * convert_float4(round_uchar4_uchar4(((__global uchar4*)((__global char4*)srcData + idx_row(src_y    , last_row) * srcStep / 4))[idx_col(left_x, last_col)]));
+		sum = sum + co2   * convert_float4(round_uchar4_uchar4(((__global uchar4*)((__global char4*)srcData + idx_row(src_y + 1, last_row) * srcStep / 4))[idx_col(left_x, last_col)]));
+		sum = sum + co3 * convert_float4(round_uchar4_uchar4(((__global uchar4*)((__global char4*)srcData + idx_row(src_y + 2, last_row) * srcStep / 4))[idx_col(left_x, last_col)]));
+
+		smem[get_local_id(0)] = sum;
+	}
+
+	if (get_local_id(0) > 253)
+	{
+		const int right_x = x + 2;
+
+		sum = 0;
+
+		sum = sum + co3 * convert_float4(round_uchar4_uchar4(((__global uchar4*)((__global char4*)srcData + idx_row(src_y - 2, last_row) * srcStep / 4))[idx_col(right_x, last_col)]));
+		sum = sum + co2   * convert_float4(round_uchar4_uchar4(((__global uchar4*)((__global char4*)srcData + idx_row(src_y - 1, last_row) * srcStep / 4))[idx_col(right_x, last_col)]));
+		sum = sum + co1  * convert_float4(round_uchar4_uchar4(((__global uchar4*)((__global char4*)srcData + idx_row(src_y    , last_row) * srcStep / 4))[idx_col(right_x, last_col)]));
+		sum = sum + co2   * convert_float4(round_uchar4_uchar4(((__global uchar4*)((__global char4*)srcData + idx_row(src_y + 1, last_row) * srcStep / 4))[idx_col(right_x, last_col)]));
+		sum = sum + co3 * convert_float4(round_uchar4_uchar4(((__global uchar4*)((__global char4*)srcData + idx_row(src_y + 2, last_row) * srcStep / 4))[idx_col(right_x, last_col)]));
+
+		smem[4 + get_local_id(0)] = sum;
+	}
+
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    if (get_local_id(0) < 128)
+    {
+        const int tid2 = get_local_id(0) * 2;
+
+        sum = 0;
+
+        sum = sum + co3 * smem[2 + tid2 - 2];
+        sum = sum + co2   * smem[2 + tid2 - 1];
+        sum = sum + co1  * smem[2 + tid2    ];
+        sum = sum + co2   * smem[2 + tid2 + 1];
+        sum = sum + co3 * smem[2 + tid2 + 2];
+
+        const int dst_x = (get_group_id(0) * get_local_size(0) + tid2) / 2;
+
+        if (dst_x < dstCols)
+            dst[y * dstStep / 4 + dst_x] = round_uchar4_float4(sum);
+    }
+}
+
+__kernel void pyrDown_C1_D5(__global float * srcData, int srcStep, int srcOffset, int srcRows, int srcCols, __global float *dst, int dstStep, int dstOffset, int dstCols)
+{
+    const int x = get_group_id(0) * get_local_size(0) + get_local_id(0);
+    const int y = get_group_id(1);
+
+    __local float smem[256 + 4];
+
+    float sum;
+
+    const int src_y = 2*y;
+    const int last_row = srcRows - 1;
+    const int last_col = srcCols - 1;
+
+    sum = 0;
+
+    sum = sum + 0.0625f * ((__global float*)((__global char*)srcData + idx_row(src_y - 2, last_row) * srcStep))[idx_col(x, last_col)];
+    sum = sum + 0.25f   * ((__global float*)((__global char*)srcData + idx_row(src_y - 1, last_row) * srcStep))[idx_col(x, last_col)];
+    sum = sum + 0.375f  * ((__global float*)((__global char*)srcData + idx_row(src_y    , last_row) * srcStep))[idx_col(x, last_col)];
+    sum = sum + 0.25f   * ((__global float*)((__global char*)srcData + idx_row(src_y + 1, last_row) * srcStep))[idx_col(x, last_col)];
+    sum = sum + 0.0625f * ((__global float*)((__global char*)srcData + idx_row(src_y + 2, last_row) * srcStep))[idx_col(x, last_col)];
+
+    smem[2 + get_local_id(0)] = sum;
+
+    if (get_local_id(0) < 2)
+    {
+        const int left_x = x - 2;
+
+        sum = 0;
+
+        sum = sum + 0.0625f * ((__global float*)((__global char*)srcData + idx_row(src_y - 2, last_row) * srcStep))[idx_col(left_x, last_col)];
+		sum = sum + 0.25f   * ((__global float*)((__global char*)srcData + idx_row(src_y - 1, last_row) * srcStep))[idx_col(left_x, last_col)];
+		sum = sum + 0.375f  * ((__global float*)((__global char*)srcData + idx_row(src_y    , last_row) * srcStep))[idx_col(left_x, last_col)];
+		sum = sum + 0.25f   * ((__global float*)((__global char*)srcData + idx_row(src_y + 1, last_row) * srcStep))[idx_col(left_x, last_col)];
+		sum = sum + 0.0625f * ((__global float*)((__global char*)srcData + idx_row(src_y + 2, last_row) * srcStep))[idx_col(left_x, last_col)];
+
+        smem[get_local_id(0)] = sum;
+    }
+
+    if (get_local_id(0) > 253)
+    {
+        const int right_x = x + 2;
+
+        sum = 0;
+
+        sum = sum + 0.0625f * ((__global float*)((__global char*)srcData + idx_row(src_y - 2, last_row) * srcStep))[idx_col(right_x, last_col)];
+		sum = sum + 0.25f   * ((__global float*)((__global char*)srcData + idx_row(src_y - 1, last_row) * srcStep))[idx_col(right_x, last_col)];
+		sum = sum + 0.375f  * ((__global float*)((__global char*)srcData + idx_row(src_y    , last_row) * srcStep))[idx_col(right_x, last_col)];
+		sum = sum + 0.25f   * ((__global float*)((__global char*)srcData + idx_row(src_y + 1, last_row) * srcStep))[idx_col(right_x, last_col)];
+		sum = sum + 0.0625f * ((__global float*)((__global char*)srcData + idx_row(src_y + 2, last_row) * srcStep))[idx_col(right_x, last_col)];
+
+        smem[4 + get_local_id(0)] = sum;
+    }
+
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    if (get_local_id(0) < 128)
+    {
+        const int tid2 = get_local_id(0) * 2;
+
+        sum = 0;
+
+        sum = sum + 0.0625f * smem[2 + tid2 - 2];
+        sum = sum + 0.25f   * smem[2 + tid2 - 1];
+        sum = sum + 0.375f  * smem[2 + tid2    ];
+        sum = sum + 0.25f   * smem[2 + tid2 + 1];
+        sum = sum + 0.0625f * smem[2 + tid2 + 2];
+
+        const int dst_x = (get_group_id(0) * get_local_size(0) + tid2) / 2;
+
+        if (dst_x < dstCols)
+            dst[y * dstStep / 4 + dst_x] = sum;
+    }
+}
+
+__kernel void pyrDown_C4_D5(__global float4 * srcData, int srcStep, int srcOffset, int srcRows, int srcCols, __global float4 *dst, int dstStep, int dstOffset, int dstCols)
+{
+    const int x = get_group_id(0) * get_local_size(0) + get_local_id(0);
+    const int y = get_group_id(1);
+
+    __local float4 smem[256 + 4];
+
+    float4 sum;
+
+    const int src_y = 2*y;
+    const int last_row = srcRows - 1;
+    const int last_col = srcCols - 1;
+
+	float4 co1 = (float4)(0.375f, 0.375f, 0.375f, 0.375f);
+	float4 co2 = (float4)(0.25f, 0.25f, 0.25f, 0.25f);
+	float4 co3 = (float4)(0.0625f, 0.0625f, 0.0625f, 0.0625f);
+
+    sum = 0;
+
+	sum = sum + co3 * ((__global float4*)((__global char4*)srcData + idx_row(src_y - 2, last_row) * srcStep / 4))[idx_col(x, last_col)];
+	sum = sum + co2   * ((__global float4*)((__global char4*)srcData + idx_row(src_y - 1, last_row) * srcStep / 4))[idx_col(x, last_col)];
+	sum = sum + co1  * ((__global float4*)((__global char4*)srcData + idx_row(src_y    , last_row) * srcStep / 4))[idx_col(x, last_col)];
+	sum = sum + co2   * ((__global float4*)((__global char4*)srcData + idx_row(src_y + 1, last_row) * srcStep / 4))[idx_col(x, last_col)];
+	sum = sum + co3 * ((__global float4*)((__global char4*)srcData + idx_row(src_y + 2, last_row) * srcStep / 4))[idx_col(x, last_col)];
+
+	smem[2 + get_local_id(0)] = sum;
+
+	if (get_local_id(0) < 2)
+	{
+		const int left_x = x - 2;
+
+		sum = 0;
+
+		sum = sum + co3 * ((__global float4*)((__global char4*)srcData + idx_row(src_y - 2, last_row) * srcStep / 4))[idx_col(left_x, last_col)];
+		sum = sum + co2   * ((__global float4*)((__global char4*)srcData + idx_row(src_y - 1, last_row) * srcStep / 4))[idx_col(left_x, last_col)];
+		sum = sum + co1  * ((__global float4*)((__global char4*)srcData + idx_row(src_y    , last_row) * srcStep / 4))[idx_col(left_x, last_col)];
+		sum = sum + co2   * ((__global float4*)((__global char4*)srcData + idx_row(src_y + 1, last_row) * srcStep / 4))[idx_col(left_x, last_col)];
+		sum = sum + co3 * ((__global float4*)((__global char4*)srcData + idx_row(src_y + 2, last_row) * srcStep / 4))[idx_col(left_x, last_col)];
+
+		smem[get_local_id(0)] = sum;
+	}
+
+	if (get_local_id(0) > 253)
+	{
+		const int right_x = x + 2;
+
+		sum = 0;
+
+		sum = sum + co3 * ((__global float4*)((__global char4*)srcData + idx_row(src_y - 2, last_row) * srcStep / 4))[idx_col(right_x, last_col)];
+		sum = sum + co2   * ((__global float4*)((__global char4*)srcData + idx_row(src_y - 1, last_row) * srcStep / 4))[idx_col(right_x, last_col)];
+		sum = sum + co1  * ((__global float4*)((__global char4*)srcData + idx_row(src_y    , last_row) * srcStep / 4))[idx_col(right_x, last_col)];
+		sum = sum + co2   * ((__global float4*)((__global char4*)srcData + idx_row(src_y + 1, last_row) * srcStep / 4))[idx_col(right_x, last_col)];
+		sum = sum + co3 * ((__global float4*)((__global char4*)srcData + idx_row(src_y + 2, last_row) * srcStep / 4))[idx_col(right_x, last_col)];
+
+		smem[4 + get_local_id(0)] = sum;
+	}
+
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    if (get_local_id(0) < 128)
+    {
+        const int tid2 = get_local_id(0) * 2;
+
+        sum = 0;
+
+        sum = sum + co3 * smem[2 + tid2 - 2];
+        sum = sum + co2   * smem[2 + tid2 - 1];
+        sum = sum + co1  * smem[2 + tid2    ];
+        sum = sum + co2   * smem[2 + tid2 + 1];
+        sum = sum + co3 * smem[2 + tid2 + 2];
+
+        const int dst_x = (get_group_id(0) * get_local_size(0) + tid2) / 2;
+
+        if (dst_x < dstCols)
+            dst[y * dstStep / 16 + dst_x] = sum;
+    }
+}
--- a/modules/ocl/src/kernels/pyr_up.cl
+++ b/modules/ocl/src/kernels/pyr_up.cl
@@ -0,0 +1,750 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
+// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// @Authors
+//    Zhang Chunpeng	chunpeng@multicorewareinc.com
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other oclMaterials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors as is and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+//#pragma OPENCL EXTENSION cl_amd_printf : enable
+
+uchar get_valid_uchar(uchar data)
+{
+	return (uchar)(data <= 255 ? data : data > 0 ? 255 : 0);
+}
+///////////////////////////////////////////////////////////////////////
+//////////////////////////  CV_8UC1  //////////////////////////////////
+///////////////////////////////////////////////////////////////////////
+__kernel void pyrUp_C1_D0(__global uchar* src,__global uchar* dst,
+				int srcRows,int dstRows,int srcCols,int dstCols,
+				int srcOffset,int dstOffset,int srcStep,int dstStep)
+{
+	const int x = get_global_id(0);
+	const int y = get_global_id(1);
+	
+	__local float s_srcPatch[10][10];
+	__local float s_dstPatch[20][16];
+	
+
+	if( get_local_id(0) < 10 && get_local_id(1) < 10 )
+	{
+		int srcx = (int)(get_group_id(0) * get_local_size(0) / 2 + get_local_id(0)) - 1;
+		int srcy = (int)(get_group_id(1) * get_local_size(1) / 2 + get_local_id(1)) - 1;
+		
+		srcx = abs(srcx);
+		srcx = min(srcCols - 1,srcx);
+		
+		srcy = abs(srcy);
+		srcy = min(srcRows -1 ,srcy);
+		
+		s_srcPatch[get_local_id(1)][get_local_id(0)] = (float)(src[srcx + srcy * srcStep]);
+		
+	}
+	
+	barrier(CLK_LOCAL_MEM_FENCE);
+	
+	float sum = 0;
+	const int evenFlag = (int)((get_local_id(0) & 1) == 0);
+	const int oddFlag = (int)((get_local_id(0) & 1) != 0);
+	const bool  eveny = ((get_local_id(1) & 1) == 0);
+	const int tidx = get_local_id(0);
+	
+	if(eveny)
+	{
+		sum = sum + (evenFlag * 0.0625f) * s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx - 2) >> 1)];
+        sum = sum + ( oddFlag * 0.25f  ) * s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx - 1) >> 1)];
+        sum = sum + (evenFlag * 0.375f ) * s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx    ) >> 1)];
+        sum = sum + ( oddFlag * 0.25f  ) * s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx + 1) >> 1)];
+        sum = sum + (evenFlag * 0.0625f) * s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx + 2) >> 1)];	
+  	}
+	
+	s_dstPatch[2 + get_local_id(1)][get_local_id(0)] = sum;
+	
+	if (get_local_id(1) < 2)
+    {
+		sum = 0;
+
+        if (eveny)
+        {
+			sum = sum + (evenFlag * 0.0625f) * s_srcPatch[0][1 + ((tidx - 2) >> 1)];
+            sum = sum + ( oddFlag * 0.25f  ) * s_srcPatch[0][1 + ((tidx - 1) >> 1)];
+            sum = sum + (evenFlag * 0.375f ) * s_srcPatch[0][1 + ((tidx    ) >> 1)];
+            sum = sum + ( oddFlag * 0.25f  ) * s_srcPatch[0][1 + ((tidx + 1) >> 1)];
+            sum = sum + (evenFlag * 0.0625f) * s_srcPatch[0][1 + ((tidx + 2) >> 1)];
+         }
+
+         s_dstPatch[get_local_id(1)][get_local_id(0)] = sum;
+     }
+     
+     if (get_local_id(1) > 13)
+     {
+		sum = 0;
+
+		if (eveny)
+		{
+			sum = sum + (evenFlag * 0.0625f) * s_srcPatch[9][1 + ((tidx - 2) >> 1)];
+			sum = sum + ( oddFlag * 0.25f  ) * s_srcPatch[9][1 + ((tidx - 1) >> 1)];
+			sum = sum + (evenFlag * 0.375f ) * s_srcPatch[9][1 + ((tidx    ) >> 1)];
+			sum = sum + ( oddFlag * 0.25f  ) * s_srcPatch[9][1 + ((tidx + 1) >> 1)];
+			sum = sum + (evenFlag * 0.0625f) * s_srcPatch[9][1 + ((tidx + 2) >> 1)];
+		}
+		s_dstPatch[4 + get_local_id(1)][get_local_id(0)] = sum;
+     }
+
+	barrier(CLK_LOCAL_MEM_FENCE);
+	
+	sum = 0;
+
+    const int tidy = get_local_id(1);
+
+    sum = sum + 0.0625f * s_dstPatch[2 + tidy - 2][get_local_id(0)];
+    sum = sum + 0.25f   * s_dstPatch[2 + tidy - 1][get_local_id(0)];
+    sum = sum + 0.375f  * s_dstPatch[2 + tidy    ][get_local_id(0)];
+    sum = sum + 0.25f   * s_dstPatch[2 + tidy + 1][get_local_id(0)];
+    sum = sum + 0.0625f * s_dstPatch[2 + tidy + 2][get_local_id(0)];
+
+    if ((x < dstCols) && (y < dstRows))
+		dst[x + y * dstStep] = (float)(4.0f * sum);
+
+}
+
+///////////////////////////////////////////////////////////////////////
+//////////////////////////  CV_16UC1  /////////////////////////////////
+///////////////////////////////////////////////////////////////////////
+__kernel void pyrUp_C1_D2(__global ushort* src,__global ushort* dst,
+				int srcRows,int dstRows,int srcCols,int dstCols,
+				int srcOffset,int dstOffset,int srcStep,int dstStep)
+{
+	const int x = get_global_id(0);
+	const int y = get_global_id(1);
+	
+	__local float s_srcPatch[10][10];
+	__local float s_dstPatch[20][16];
+	
+	srcStep = srcStep >> 1;
+	dstStep = dstStep >> 1;
+	srcOffset = srcOffset >> 1;
+	dstOffset = dstOffset >> 1;
+	
+
+	if( get_local_id(0) < 10 && get_local_id(1) < 10 )
+	{
+		int srcx = (int)(get_group_id(0) * get_local_size(0) / 2 + get_local_id(0)) - 1;
+		int srcy = (int)(get_group_id(1) * get_local_size(1) / 2 + get_local_id(1)) - 1;
+		
+		srcx = abs(srcx);
+		srcx = min(srcCols - 1,srcx);
+		
+		srcy = abs(srcy);
+		srcy = min(srcRows -1 ,srcy);
+		
+		s_srcPatch[get_local_id(1)][get_local_id(0)] = (float)(src[srcx + srcy * srcStep]);
+		
+	}
+	
+	barrier(CLK_LOCAL_MEM_FENCE);
+	
+	float sum = 0;
+	
+	const int evenFlag = (int)((get_local_id(0) & 1) == 0);
+	const int oddFlag = (int)((get_local_id(0) & 1) != 0);
+	const bool  eveny = ((get_local_id(1) & 1) == 0);
+	const int tidx = get_local_id(0);
+	
+	if(eveny)
+	{
+		sum = sum + (evenFlag * 0.0625f) * s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx - 2) >> 1)];
+        sum = sum + ( oddFlag * 0.25f  ) * s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx - 1) >> 1)];
+        sum = sum + (evenFlag * 0.375f ) * s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx    ) >> 1)];
+        sum = sum + ( oddFlag * 0.25f  ) * s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx + 1) >> 1)];
+        sum = sum + (evenFlag * 0.0625f) * s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx + 2) >> 1)];	
+  	}
+	
+	s_dstPatch[2 + get_local_id(1)][get_local_id(0)] = sum;
+	
+	if (get_local_id(1) < 2)
+    {
+		sum = 0;
+
+        if (eveny)
+        {
+			sum = sum + (evenFlag * 0.0625f) * s_srcPatch[0][1 + ((tidx - 2) >> 1)];
+            sum = sum + ( oddFlag * 0.25f  ) * s_srcPatch[0][1 + ((tidx - 1) >> 1)];
+            sum = sum + (evenFlag * 0.375f ) * s_srcPatch[0][1 + ((tidx    ) >> 1)];
+            sum = sum + ( oddFlag * 0.25f  ) * s_srcPatch[0][1 + ((tidx + 1) >> 1)];
+            sum = sum + (evenFlag * 0.0625f) * s_srcPatch[0][1 + ((tidx + 2) >> 1)];
+         }
+
+         s_dstPatch[get_local_id(1)][get_local_id(0)] = sum;
+     }
+     
+     if (get_local_id(1) > 13)
+     {
+		sum = 0;
+
+		if (eveny)
+		{
+			sum = sum + (evenFlag * 0.0625f) * s_srcPatch[9][1 + ((tidx - 2) >> 1)];
+			sum = sum + ( oddFlag * 0.25f  ) * s_srcPatch[9][1 + ((tidx - 1) >> 1)];
+			sum = sum + (evenFlag * 0.375f ) * s_srcPatch[9][1 + ((tidx    ) >> 1)];
+			sum = sum + ( oddFlag * 0.25f  ) * s_srcPatch[9][1 + ((tidx + 1) >> 1)];
+			sum = sum + (evenFlag * 0.0625f) * s_srcPatch[9][1 + ((tidx + 2) >> 1)];
+		}
+		s_dstPatch[4 + get_local_id(1)][get_local_id(0)] = sum;
+     }
+
+	barrier(CLK_LOCAL_MEM_FENCE);
+	
+	sum = 0;
+
+    const int tidy = get_local_id(1);
+
+    sum = sum + 0.0625f * s_dstPatch[2 + tidy - 2][get_local_id(0)];
+    sum = sum + 0.25f   * s_dstPatch[2 + tidy - 1][get_local_id(0)];
+    sum = sum + 0.375f  * s_dstPatch[2 + tidy    ][get_local_id(0)];
+    sum = sum + 0.25f   * s_dstPatch[2 + tidy + 1][get_local_id(0)];
+    sum = sum + 0.0625f * s_dstPatch[2 + tidy + 2][get_local_id(0)];
+
+    if ((x < dstCols) && (y < dstRows))
+		dst[x + y * dstStep] = (float)(4.0f * sum);
+
+}
+
+///////////////////////////////////////////////////////////////////////
+//////////////////////////  CV_32FC1  /////////////////////////////////
+///////////////////////////////////////////////////////////////////////
+__kernel void pyrUp_C1_D5(__global float* src,__global float* dst,
+				int srcRows,int dstRows,int srcCols,int dstCols,
+				int srcOffset,int dstOffset,int srcStep,int dstStep)
+{
+	const int x = get_global_id(0);
+	const int y = get_global_id(1);
+	
+	__local float s_srcPatch[10][10];
+	__local float s_dstPatch[20][16];
+	
+	srcOffset = srcOffset >> 2;
+	dstOffset = dstOffset >> 2;
+	srcStep = srcStep >> 2;
+	dstStep = dstStep >> 2;
+	
+
+	if( get_local_id(0) < 10 && get_local_id(1) < 10 )
+	{
+		int srcx = (int)(get_group_id(0) * get_local_size(0) / 2 + get_local_id(0)) - 1;
+		int srcy = (int)(get_group_id(1) * get_local_size(1) / 2 + get_local_id(1)) - 1;
+		
+		srcx = abs(srcx);
+		srcx = min(srcCols - 1,srcx);
+		
+		srcy = abs(srcy);
+		srcy = min(srcRows -1 ,srcy);
+		
+		s_srcPatch[get_local_id(1)][get_local_id(0)] = (float)(src[srcx + srcy * srcStep]);
+		
+	}
+	
+	barrier(CLK_LOCAL_MEM_FENCE);
+	
+	float sum = 0;
+	const int evenFlag = (int)((get_local_id(0) & 1) == 0);
+	const int oddFlag = (int)((get_local_id(0) & 1) != 0);
+	const bool  eveny = ((get_local_id(1) & 1) == 0);
+	const int tidx = get_local_id(0);
+	
+	if(eveny)
+	{
+		sum = sum + (evenFlag * 0.0625f) * s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx - 2) >> 1)];
+        sum = sum + ( oddFlag * 0.25f  ) * s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx - 1) >> 1)];
+        sum = sum + (evenFlag * 0.375f ) * s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx    ) >> 1)];
+        sum = sum + ( oddFlag * 0.25f  ) * s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx + 1) >> 1)];
+        sum = sum + (evenFlag * 0.0625f) * s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx + 2) >> 1)];	
+  	}
+	
+	s_dstPatch[2 + get_local_id(1)][get_local_id(0)] = sum;
+	
+	if (get_local_id(1) < 2)
+    {
+		sum = 0;
+
+        if (eveny)
+        {
+			sum = sum + (evenFlag * 0.0625f) * s_srcPatch[0][1 + ((tidx - 2) >> 1)];
+            sum = sum + ( oddFlag * 0.25f  ) * s_srcPatch[0][1 + ((tidx - 1) >> 1)];
+            sum = sum + (evenFlag * 0.375f ) * s_srcPatch[0][1 + ((tidx    ) >> 1)];
+            sum = sum + ( oddFlag * 0.25f  ) * s_srcPatch[0][1 + ((tidx + 1) >> 1)];
+            sum = sum + (evenFlag * 0.0625f) * s_srcPatch[0][1 + ((tidx + 2) >> 1)];
+         }
+
+         s_dstPatch[get_local_id(1)][get_local_id(0)] = sum;
+     }
+     
+     if (get_local_id(1) > 13)
+     {
+		sum = 0;
+
+		if (eveny)
+		{
+			sum = sum + (evenFlag * 0.0625f) * s_srcPatch[9][1 + ((tidx - 2) >> 1)];
+			sum = sum + ( oddFlag * 0.25f  ) * s_srcPatch[9][1 + ((tidx - 1) >> 1)];
+			sum = sum + (evenFlag * 0.375f ) * s_srcPatch[9][1 + ((tidx    ) >> 1)];
+			sum = sum + ( oddFlag * 0.25f  ) * s_srcPatch[9][1 + ((tidx + 1) >> 1)];
+			sum = sum + (evenFlag * 0.0625f) * s_srcPatch[9][1 + ((tidx + 2) >> 1)];
+		}
+		s_dstPatch[4 + get_local_id(1)][get_local_id(0)] = sum;
+     }
+
+	barrier(CLK_LOCAL_MEM_FENCE);
+	
+	sum = 0;
+
+    const int tidy = get_local_id(1);
+
+    sum = sum + 0.0625f * s_dstPatch[2 + tidy - 2][get_local_id(0)];
+    sum = sum + 0.25f   * s_dstPatch[2 + tidy - 1][get_local_id(0)];
+    sum = sum + 0.375f  * s_dstPatch[2 + tidy    ][get_local_id(0)];
+    sum = sum + 0.25f   * s_dstPatch[2 + tidy + 1][get_local_id(0)];
+    sum = sum + 0.0625f * s_dstPatch[2 + tidy + 2][get_local_id(0)];
+
+    if ((x < dstCols) && (y < dstRows))
+		dst[x + y * dstStep] = (float)(4.0f * sum);
+
+}
+
+///////////////////////////////////////////////////////////////////////
+//////////////////////////  CV_8UC4  //////////////////////////////////
+///////////////////////////////////////////////////////////////////////
+float4 covert_uchar4_to_float4(uchar4 data)
+{
+	float4 f4Data = {0,0,0,0};
+	
+	f4Data.x = (float)data.x;
+	f4Data.y = (float)data.y;
+	f4Data.z = (float)data.z;
+	f4Data.w = (float)data.w;
+	
+	return f4Data;
+}
+
+
+uchar4 convert_float4_to_uchar4(float4 data)
+{
+	uchar4 u4Data;
+	
+	u4Data.x = get_valid_uchar(data.x);
+	u4Data.y = get_valid_uchar(data.y);
+	u4Data.z = get_valid_uchar(data.z);
+	u4Data.w = get_valid_uchar(data.w);
+	
+	return u4Data;
+}
+
+float4 int_x_float4(int leftOpr,float4 rightOpr)
+{
+	float4 result = {0,0,0,0};
+	
+	result.x = rightOpr.x * leftOpr;
+	result.y = rightOpr.y * leftOpr;
+	result.z = rightOpr.z * leftOpr;
+	result.w = rightOpr.w * leftOpr;
+	
+	return result;
+}
+
+float4 float4_x_float4(float4 leftOpr,float4 rightOpr)
+{
+	float4 result;
+	
+	result.x = leftOpr.x * rightOpr.x;
+	result.y = leftOpr.y * rightOpr.y;
+	result.z = leftOpr.z * rightOpr.z;
+	result.w = leftOpr.w * rightOpr.w;
+	
+	return result;
+}
+
+__kernel void pyrUp_C4_D0(__global uchar4* src,__global uchar4* dst,
+				int srcRows,int dstRows,int srcCols,int dstCols,
+				int srcOffset,int dstOffset,int srcStep,int dstStep)
+{
+	const int x = get_global_id(0);
+	const int y = get_global_id(1);
+	
+	__local float4 s_srcPatch[10][10];
+	__local float4 s_dstPatch[20][16];
+	
+	srcOffset >>= 2;
+	dstOffset >>= 2;
+	srcStep >>= 2;
+	dstStep >>= 2;
+	
+
+	if( get_local_id(0) < 10 && get_local_id(1) < 10 )
+	{
+		int srcx = (int)(get_group_id(0) * get_local_size(0) / 2 + get_local_id(0)) - 1;
+		int srcy = (int)(get_group_id(1) * get_local_size(1) / 2 + get_local_id(1)) - 1;
+		
+		srcx = abs(srcx);
+		srcx = min(srcCols - 1,srcx);
+		
+		srcy = abs(srcy);
+		srcy = min(srcRows -1 ,srcy);
+		
+		s_srcPatch[get_local_id(1)][get_local_id(0)] = covert_uchar4_to_float4(src[srcx + srcy * srcStep]);
+	}
+	
+	barrier(CLK_LOCAL_MEM_FENCE);
+	
+	float4 sum = (float4)(0,0,0,0);
+	
+	const int evenFlag = (int)((get_local_id(0) & 1) == 0);
+	const int oddFlag = (int)((get_local_id(0) & 1) != 0);
+	const bool  eveny = ((get_local_id(1) & 1) == 0);
+	const int tidx = get_local_id(0);
+	
+	float4 co1 = (float4)(0.375f, 0.375f, 0.375f, 0.375f);
+	float4 co2 = (float4)(0.25f, 0.25f, 0.25f, 0.25f);
+	float4 co3 = (float4)(0.0625f, 0.0625f, 0.0625f, 0.0625f);
+
+	
+	if(eveny)
+	{
+		sum = sum + float4_x_float4(int_x_float4( evenFlag, co3 ) , s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx - 2) >> 1)]);
+        sum = sum + float4_x_float4(int_x_float4( oddFlag , co2 ) , s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx - 1) >> 1)]);
+        sum = sum + float4_x_float4(int_x_float4( evenFlag, co1 ) , s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx    ) >> 1)]);
+        sum = sum + float4_x_float4(int_x_float4( oddFlag , co2 ) , s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx + 1) >> 1)]);
+        sum = sum + float4_x_float4(int_x_float4( evenFlag, co3 ) , s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx + 2) >> 1)]);	
+                
+  	}
+	
+	s_dstPatch[2 + get_local_id(1)][get_local_id(0)] = sum;
+	
+	if (get_local_id(1) < 2)
+    {
+		sum = 0;
+
+        if (eveny)
+        {
+			sum = sum + float4_x_float4(int_x_float4(evenFlag , co3) , s_srcPatch[0][1 + ((tidx - 2) >> 1)]);
+            sum = sum + float4_x_float4(int_x_float4( oddFlag , co2  ) , s_srcPatch[0][1 + ((tidx - 1) >> 1)]);
+            sum = sum + float4_x_float4(int_x_float4(evenFlag , co1 ) , s_srcPatch[0][1 + ((tidx    ) >> 1)]);
+            sum = sum + float4_x_float4(int_x_float4( oddFlag , co2  ) , s_srcPatch[0][1 + ((tidx + 1) >> 1)]);
+            sum = sum + float4_x_float4(int_x_float4(evenFlag , co3) , s_srcPatch[0][1 + ((tidx + 2) >> 1)]);
+         }
+
+         s_dstPatch[get_local_id(1)][get_local_id(0)] = sum;
+     }
+     
+     if (get_local_id(1) > 13)
+     {
+		sum = 0;
+
+		if (eveny)
+		{
+			sum = sum + float4_x_float4(int_x_float4(evenFlag , co3) , s_srcPatch[9][1 + ((tidx - 2) >> 1)]);
+			sum = sum + float4_x_float4(int_x_float4( oddFlag , co2) , s_srcPatch[9][1 + ((tidx - 1) >> 1)]);
+			sum = sum + float4_x_float4(int_x_float4(evenFlag , co1) , s_srcPatch[9][1 + ((tidx    ) >> 1)]);
+			sum = sum + float4_x_float4(int_x_float4( oddFlag , co2) , s_srcPatch[9][1 + ((tidx + 1) >> 1)]);
+			sum = sum + float4_x_float4(int_x_float4(evenFlag , co3) , s_srcPatch[9][1 + ((tidx + 2) >> 1)]);
+			
+		}
+		s_dstPatch[4 + get_local_id(1)][get_local_id(0)] = sum;
+     }
+
+	barrier(CLK_LOCAL_MEM_FENCE);
+	
+	sum = 0;
+
+    const int tidy = get_local_id(1);
+
+    sum = sum + float4_x_float4(co3 , s_dstPatch[2 + tidy - 2][get_local_id(0)]);
+    sum = sum + float4_x_float4(co2 , s_dstPatch[2 + tidy - 1][get_local_id(0)]);
+    sum = sum + float4_x_float4(co1 , s_dstPatch[2 + tidy    ][get_local_id(0)]);
+    sum = sum + float4_x_float4(co2 , s_dstPatch[2 + tidy + 1][get_local_id(0)]);
+    sum = sum + float4_x_float4(co3 , s_dstPatch[2 + tidy + 2][get_local_id(0)]);
+
+    if ((x < dstCols) && (y < dstRows))
+    {
+		dst[x + y * dstStep] = convert_float4_to_uchar4(int_x_float4(4.0f,sum));
+	}
+} 
+///////////////////////////////////////////////////////////////////////
+//////////////////////////  CV_16UC4 //////////////////////////////////
+///////////////////////////////////////////////////////////////////////
+float4 covert_ushort4_to_float4(ushort4 data)
+{
+	float4 f4Data = {0,0,0,0};
+	
+	f4Data.x = (float)data.x;
+	f4Data.y = (float)data.y;
+	f4Data.z = (float)data.z;
+	f4Data.w = (float)data.w;
+	
+	return f4Data;
+}
+
+
+ushort4 convert_float4_to_ushort4(float4 data)
+{
+	ushort4 u4Data;
+	
+	u4Data.x = (float)data.x;
+	u4Data.y = (float)data.y;
+	u4Data.z = (float)data.z;
+	u4Data.w = (float)data.w;
+	
+	return u4Data;
+}
+
+
+__kernel void pyrUp_C4_D2(__global ushort4* src,__global ushort4* dst,
+				int srcRows,int dstRows,int srcCols,int dstCols,
+				int srcOffset,int dstOffset,int srcStep,int dstStep)
+{
+	const int x = get_global_id(0);
+	const int y = get_global_id(1);
+	
+	__local float4 s_srcPatch[10][10];
+	__local float4 s_dstPatch[20][16];
+	
+	srcOffset >>= 3;
+	dstOffset >>= 3;
+	srcStep >>= 3;
+	dstStep >>= 3;
+	
+
+	if( get_local_id(0) < 10 && get_local_id(1) < 10 )
+	{
+		int srcx = (int)(get_group_id(0) * get_local_size(0) / 2 + get_local_id(0)) - 1;
+		int srcy = (int)(get_group_id(1) * get_local_size(1) / 2 + get_local_id(1)) - 1;
+		
+		srcx = abs(srcx);
+		srcx = min(srcCols - 1,srcx);
+		
+		srcy = abs(srcy);
+		srcy = min(srcRows -1 ,srcy);
+		
+		s_srcPatch[get_local_id(1)][get_local_id(0)] = covert_ushort4_to_float4(src[srcx + srcy * srcStep]);
+	}
+	
+	barrier(CLK_LOCAL_MEM_FENCE);
+	
+	float4 sum = (float4)(0,0,0,0);
+	
+	const int evenFlag = (int)((get_local_id(0) & 1) == 0);
+	const int oddFlag = (int)((get_local_id(0) & 1) != 0);
+	const bool  eveny = ((get_local_id(1) & 1) == 0);
+	const int tidx = get_local_id(0);
+	
+	float4 co1 = (float4)(0.375f, 0.375f, 0.375f, 0.375f);
+	float4 co2 = (float4)(0.25f, 0.25f, 0.25f, 0.25f);
+	float4 co3 = (float4)(0.0625f, 0.0625f, 0.0625f, 0.0625f);
+
+	
+	if(eveny)
+	{
+		sum = sum + float4_x_float4(int_x_float4( evenFlag, co3 ) , s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx - 2) >> 1)]);
+        sum = sum + float4_x_float4(int_x_float4( oddFlag , co2 ) , s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx - 1) >> 1)]);
+        sum = sum + float4_x_float4(int_x_float4( evenFlag, co1 ) , s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx    ) >> 1)]);
+        sum = sum + float4_x_float4(int_x_float4( oddFlag , co2 ) , s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx + 1) >> 1)]);
+        sum = sum + float4_x_float4(int_x_float4( evenFlag, co3 ) , s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx + 2) >> 1)]);	
+                
+  	}
+	
+	s_dstPatch[2 + get_local_id(1)][get_local_id(0)] = sum;
+	
+	if (get_local_id(1) < 2)
+    {
+		sum = 0;
+
+        if (eveny)
+        {
+			sum = sum + float4_x_float4(int_x_float4(evenFlag , co3) , s_srcPatch[0][1 + ((tidx - 2) >> 1)]);
+            sum = sum + float4_x_float4(int_x_float4( oddFlag , co2  ) , s_srcPatch[0][1 + ((tidx - 1) >> 1)]);
+            sum = sum + float4_x_float4(int_x_float4(evenFlag , co1 ) , s_srcPatch[0][1 + ((tidx    ) >> 1)]);
+            sum = sum + float4_x_float4(int_x_float4( oddFlag , co2  ) , s_srcPatch[0][1 + ((tidx + 1) >> 1)]);
+            sum = sum + float4_x_float4(int_x_float4(evenFlag , co3) , s_srcPatch[0][1 + ((tidx + 2) >> 1)]);
+         }
+
+         s_dstPatch[get_local_id(1)][get_local_id(0)] = sum;
+     }
+     
+     if (get_local_id(1) > 13)
+     {
+		sum = 0;
+
+		if (eveny)
+		{
+			sum = sum + float4_x_float4(int_x_float4(evenFlag , co3) , s_srcPatch[9][1 + ((tidx - 2) >> 1)]);
+			sum = sum + float4_x_float4(int_x_float4( oddFlag , co2) , s_srcPatch[9][1 + ((tidx - 1) >> 1)]);
+			sum = sum + float4_x_float4(int_x_float4(evenFlag , co1) , s_srcPatch[9][1 + ((tidx    ) >> 1)]);
+			sum = sum + float4_x_float4(int_x_float4( oddFlag , co2) , s_srcPatch[9][1 + ((tidx + 1) >> 1)]);
+			sum = sum + float4_x_float4(int_x_float4(evenFlag , co3) , s_srcPatch[9][1 + ((tidx + 2) >> 1)]);
+			
+		}
+		s_dstPatch[4 + get_local_id(1)][get_local_id(0)] = sum;
+     }
+
+	barrier(CLK_LOCAL_MEM_FENCE);
+	
+	sum = 0;
+
+    const int tidy = get_local_id(1);
+
+    sum = sum + float4_x_float4(co3 , s_dstPatch[2 + tidy - 2][get_local_id(0)]);
+    sum = sum + float4_x_float4(co2 , s_dstPatch[2 + tidy - 1][get_local_id(0)]);
+    sum = sum + float4_x_float4(co1 , s_dstPatch[2 + tidy    ][get_local_id(0)]);
+    sum = sum + float4_x_float4(co2 , s_dstPatch[2 + tidy + 1][get_local_id(0)]);
+    sum = sum + float4_x_float4(co3 , s_dstPatch[2 + tidy + 2][get_local_id(0)]);
+
+    if ((x < dstCols) && (y < dstRows))
+    {
+		dst[x + y * dstStep] = convert_float4_to_ushort4(int_x_float4(4.0f,sum));
+	}
+}
+
+///////////////////////////////////////////////////////////////////////
+//////////////////////////  CV_32FC4 //////////////////////////////////
+///////////////////////////////////////////////////////////////////////
+__kernel void pyrUp_C4_D5(__global float4* src,__global float4* dst,
+				int srcRows,int dstRows,int srcCols,int dstCols,
+				int srcOffset,int dstOffset,int srcStep,int dstStep)
+{
+	const int x = get_global_id(0);
+	const int y = get_global_id(1);
+	
+	__local float4 s_srcPatch[10][10];
+	__local float4 s_dstPatch[20][16];
+	
+	srcOffset >>= 4;
+	dstOffset >>= 4;
+	srcStep >>= 4;
+	dstStep >>= 4;
+	
+	
+	if( get_local_id(0) < 10 && get_local_id(1) < 10 )
+	{
+		int srcx = (int)(get_group_id(0) * get_local_size(0) / 2 + get_local_id(0)) - 1;
+		int srcy = (int)(get_group_id(1) * get_local_size(1) / 2 + get_local_id(1)) - 1;
+		
+		srcx = abs(srcx);
+		srcx = min(srcCols - 1,srcx);
+		
+		srcy = abs(srcy);
+		srcy = min(srcRows -1 ,srcy);
+		
+		s_srcPatch[get_local_id(1)][get_local_id(0)] = (float4)(src[srcx + srcy * srcStep]);
+	}
+	
+	barrier(CLK_LOCAL_MEM_FENCE);
+	
+	float4 sum = (float4)(0,0,0,0);
+	
+	const int evenFlag = (int)((get_local_id(0) & 1) == 0);
+	const int oddFlag = (int)((get_local_id(0) & 1) != 0);
+	const bool  eveny = ((get_local_id(1) & 1) == 0);
+	const int tidx = get_local_id(0);
+	
+	float4 co1 = (float4)(0.375f, 0.375f, 0.375f, 0.375f);
+	float4 co2 = (float4)(0.25f, 0.25f, 0.25f, 0.25f);
+	float4 co3 = (float4)(0.0625f, 0.0625f, 0.0625f, 0.0625f);
+
+	
+	if(eveny)
+	{
+		sum = sum + float4_x_float4(int_x_float4( evenFlag, co3 ) , s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx - 2) >> 1)]);
+        sum = sum + float4_x_float4(int_x_float4( oddFlag , co2 ) , s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx - 1) >> 1)]);
+        sum = sum + float4_x_float4(int_x_float4( evenFlag, co1 ) , s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx    ) >> 1)]);
+        sum = sum + float4_x_float4(int_x_float4( oddFlag , co2 ) , s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx + 1) >> 1)]);
+        sum = sum + float4_x_float4(int_x_float4( evenFlag, co3 ) , s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx + 2) >> 1)]);	
+                
+  	}
+	
+	s_dstPatch[2 + get_local_id(1)][get_local_id(0)] = sum;
+	
+	if (get_local_id(1) < 2)
+    {
+		sum = 0;
+
+        if (eveny)
+        {
+			sum = sum + float4_x_float4(int_x_float4(evenFlag , co3) , s_srcPatch[0][1 + ((tidx - 2) >> 1)]);
+            sum = sum + float4_x_float4(int_x_float4( oddFlag , co2  ) , s_srcPatch[0][1 + ((tidx - 1) >> 1)]);
+            sum = sum + float4_x_float4(int_x_float4(evenFlag , co1 ) , s_srcPatch[0][1 + ((tidx    ) >> 1)]);
+            sum = sum + float4_x_float4(int_x_float4( oddFlag , co2  ) , s_srcPatch[0][1 + ((tidx + 1) >> 1)]);
+            sum = sum + float4_x_float4(int_x_float4(evenFlag , co3) , s_srcPatch[0][1 + ((tidx + 2) >> 1)]);
+         }
+
+         s_dstPatch[get_local_id(1)][get_local_id(0)] = sum;
+     }
+     
+     if (get_local_id(1) > 13)
+     {
+		sum = 0;
+
+		if (eveny)
+		{
+			sum = sum + float4_x_float4(int_x_float4(evenFlag , co3) , s_srcPatch[9][1 + ((tidx - 2) >> 1)]);
+			sum = sum + float4_x_float4(int_x_float4( oddFlag , co2) , s_srcPatch[9][1 + ((tidx - 1) >> 1)]);
+			sum = sum + float4_x_float4(int_x_float4(evenFlag , co1) , s_srcPatch[9][1 + ((tidx    ) >> 1)]);
+			sum = sum + float4_x_float4(int_x_float4( oddFlag , co2) , s_srcPatch[9][1 + ((tidx + 1) >> 1)]);
+			sum = sum + float4_x_float4(int_x_float4(evenFlag , co3) , s_srcPatch[9][1 + ((tidx + 2) >> 1)]);
+			
+		}
+		s_dstPatch[4 + get_local_id(1)][get_local_id(0)] = sum;
+     }
+
+	barrier(CLK_LOCAL_MEM_FENCE);
+	
+	sum = 0;
+
+    const int tidy = get_local_id(1);
+
+    sum = sum + float4_x_float4(co3 , s_dstPatch[2 + tidy - 2][get_local_id(0)]);
+    sum = sum + float4_x_float4(co2 , s_dstPatch[2 + tidy - 1][get_local_id(0)]);
+    sum = sum + float4_x_float4(co1 , s_dstPatch[2 + tidy    ][get_local_id(0)]);
+    sum = sum + float4_x_float4(co2 , s_dstPatch[2 + tidy + 1][get_local_id(0)]);
+    sum = sum + float4_x_float4(co3 , s_dstPatch[2 + tidy + 2][get_local_id(0)]);
+
+    if ((x < dstCols) && (y < dstRows))
+    {
+		dst[x + y * dstStep] = 4.0f * sum;
+	}
+}