Merge pull request #1428 from ilya-lavrenov:ocl_pyr

This commit is contained in:
Roman Donchenko 2013-09-12 12:48:44 +04:00 committed by OpenCV Buildbot
commit ac9bc6423a
5 changed files with 791 additions and 138 deletions

View File

@ -43,37 +43,6 @@
//
//M*/
//#pragma OPENCL EXTENSION cl_amd_printf : enable
uchar round_uchar_int(int v)
{
return (uchar)((uint)v <= 255 ? v : v > 0 ? 255 : 0);
}
uchar round_uchar_float(float v)
{
return round_uchar_int(convert_int_sat_rte(v));
}
uchar4 round_uchar4_int4(int4 v)
{
uchar4 result;
result.x = (uchar)(v.x <= 255 ? v.x : v.x > 0 ? 255 : 0);
result.y = (uchar)(v.y <= 255 ? v.y : v.y > 0 ? 255 : 0);
result.z = (uchar)(v.z <= 255 ? v.z : v.z > 0 ? 255 : 0);
result.w = (uchar)(v.w <= 255 ? v.w : v.w > 0 ? 255 : 0);
return result;
}
uchar4 round_uchar4_float4(float4 v)
{
return round_uchar4_int4(convert_int4_sat_rte(v));
}
int idx_row_low(int y, int last_row)
{
return abs(y) % (last_row + 1);
@ -104,6 +73,10 @@ int idx_col(int x, int last_col)
return idx_col_low(idx_col_high(x, last_col), last_col);
}
///////////////////////////////////////////////////////////////////////
////////////////////////// CV_8UC1 ///////////////////////////////////
///////////////////////////////////////////////////////////////////////
__kernel void pyrDown_C1_D0(__global uchar * srcData, int srcStep, int srcRows, int srcCols, __global uchar *dst, int dstStep, int dstCols)
{
const int x = get_global_id(0);
@ -211,10 +184,14 @@ __kernel void pyrDown_C1_D0(__global uchar * srcData, int srcStep, int srcRows,
const int dst_x = (get_group_id(0) * get_local_size(0) + tid2) / 2;
if (dst_x < dstCols)
dst[y * dstStep + dst_x] = round_uchar_float(sum);
dst[y * dstStep + dst_x] = convert_uchar_sat_rte(sum);
}
}
///////////////////////////////////////////////////////////////////////
////////////////////////// CV_8UC4 ///////////////////////////////////
///////////////////////////////////////////////////////////////////////
__kernel void pyrDown_C4_D0(__global uchar4 * srcData, int srcStep, int srcRows, int srcCols, __global uchar4 *dst, int dstStep, int dstCols)
{
const int x = get_global_id(0);
@ -228,16 +205,16 @@ __kernel void pyrDown_C4_D0(__global uchar4 * srcData, int srcStep, int srcRows,
const int last_row = srcRows - 1;
const int last_col = srcCols - 1;
float4 co1 = 0.375f;//(float4)(0.375f, 0.375f, 0.375f, 0.375f);
float4 co2 = 0.25f;//(float4)(0.25f, 0.25f, 0.25f, 0.25f);
float4 co3 = 0.0625f;//(float4)(0.0625f, 0.0625f, 0.0625f, 0.0625f);
float4 co1 = 0.375f;
float4 co2 = 0.25f;
float4 co3 = 0.0625f;
if (src_y >= 2 && src_y < srcRows - 2 && x >= 2 && x < srcCols - 2)
{
sum = co3 * convert_float4((((srcData + (src_y - 2) * srcStep / 4))[x]));
sum = sum + co2 * convert_float4((((srcData + (src_y - 1) * srcStep / 4))[x]));
sum = sum + co1 * convert_float4((((srcData + (src_y ) * srcStep / 4))[x]));
sum = sum + co2 * convert_float4((((srcData + (src_y + 1) * srcStep / 4))[x]));
sum = sum + co2 * convert_float4((((srcData + (src_y - 1) * srcStep / 4))[x]));
sum = sum + co1 * convert_float4((((srcData + (src_y ) * srcStep / 4))[x]));
sum = sum + co2 * convert_float4((((srcData + (src_y + 1) * srcStep / 4))[x]));
sum = sum + co3 * convert_float4((((srcData + (src_y + 2) * srcStep / 4))[x]));
smem[2 + get_local_id(0)] = sum;
@ -247,9 +224,9 @@ __kernel void pyrDown_C4_D0(__global uchar4 * srcData, int srcStep, int srcRows,
const int left_x = x - 2;
sum = co3 * convert_float4((((srcData + (src_y - 2) * srcStep / 4))[left_x]));
sum = sum + co2 * convert_float4((((srcData + (src_y - 1) * srcStep / 4))[left_x]));
sum = sum + co1 * convert_float4((((srcData + (src_y ) * srcStep / 4))[left_x]));
sum = sum + co2 * convert_float4((((srcData + (src_y + 1) * srcStep / 4))[left_x]));
sum = sum + co2 * convert_float4((((srcData + (src_y - 1) * srcStep / 4))[left_x]));
sum = sum + co1 * convert_float4((((srcData + (src_y ) * srcStep / 4))[left_x]));
sum = sum + co2 * convert_float4((((srcData + (src_y + 1) * srcStep / 4))[left_x]));
sum = sum + co3 * convert_float4((((srcData + (src_y + 2) * srcStep / 4))[left_x]));
smem[get_local_id(0)] = sum;
@ -260,9 +237,9 @@ __kernel void pyrDown_C4_D0(__global uchar4 * srcData, int srcStep, int srcRows,
const int right_x = x + 2;
sum = co3 * convert_float4((((srcData + (src_y - 2) * srcStep / 4))[right_x]));
sum = sum + co2 * convert_float4((((srcData + (src_y - 1) * srcStep / 4))[right_x]));
sum = sum + co1 * convert_float4((((srcData + (src_y ) * srcStep / 4))[right_x]));
sum = sum + co2 * convert_float4((((srcData + (src_y + 1) * srcStep / 4))[right_x]));
sum = sum + co2 * convert_float4((((srcData + (src_y - 1) * srcStep / 4))[right_x]));
sum = sum + co1 * convert_float4((((srcData + (src_y ) * srcStep / 4))[right_x]));
sum = sum + co2 * convert_float4((((srcData + (src_y + 1) * srcStep / 4))[right_x]));
sum = sum + co3 * convert_float4((((srcData + (src_y + 2) * srcStep / 4))[right_x]));
smem[4 + get_local_id(0)] = sum;
@ -273,9 +250,9 @@ __kernel void pyrDown_C4_D0(__global uchar4 * srcData, int srcStep, int srcRows,
int col = idx_col(x, last_col);
sum = co3 * convert_float4((((srcData + idx_row(src_y - 2, last_row) * srcStep / 4))[col]));
sum = sum + co2 * convert_float4((((srcData + idx_row(src_y - 1, last_row) * srcStep / 4))[col]));
sum = sum + co1 * convert_float4((((srcData + idx_row(src_y , last_row) * srcStep / 4))[col]));
sum = sum + co2 * convert_float4((((srcData + idx_row(src_y + 1, last_row) * srcStep / 4))[col]));
sum = sum + co2 * convert_float4((((srcData + idx_row(src_y - 1, last_row) * srcStep / 4))[col]));
sum = sum + co1 * convert_float4((((srcData + idx_row(src_y , last_row) * srcStep / 4))[col]));
sum = sum + co2 * convert_float4((((srcData + idx_row(src_y + 1, last_row) * srcStep / 4))[col]));
sum = sum + co3 * convert_float4((((srcData + idx_row(src_y + 2, last_row) * srcStep / 4))[col]));
smem[2 + get_local_id(0)] = sum;
@ -287,9 +264,9 @@ __kernel void pyrDown_C4_D0(__global uchar4 * srcData, int srcStep, int srcRows,
col = idx_col(left_x, last_col);
sum = co3 * convert_float4((((srcData + idx_row(src_y - 2, last_row) * srcStep / 4))[col]));
sum = sum + co2 * convert_float4((((srcData + idx_row(src_y - 1, last_row) * srcStep / 4))[col]));
sum = sum + co1 * convert_float4((((srcData + idx_row(src_y , last_row) * srcStep / 4))[col]));
sum = sum + co2 * convert_float4((((srcData + idx_row(src_y + 1, last_row) * srcStep / 4))[col]));
sum = sum + co2 * convert_float4((((srcData + idx_row(src_y - 1, last_row) * srcStep / 4))[col]));
sum = sum + co1 * convert_float4((((srcData + idx_row(src_y , last_row) * srcStep / 4))[col]));
sum = sum + co2 * convert_float4((((srcData + idx_row(src_y + 1, last_row) * srcStep / 4))[col]));
sum = sum + co3 * convert_float4((((srcData + idx_row(src_y + 2, last_row) * srcStep / 4))[col]));
smem[get_local_id(0)] = sum;
@ -302,9 +279,9 @@ __kernel void pyrDown_C4_D0(__global uchar4 * srcData, int srcStep, int srcRows,
col = idx_col(right_x, last_col);
sum = co3 * convert_float4((((srcData + idx_row(src_y - 2, last_row) * srcStep / 4))[col]));
sum = sum + co2 * convert_float4((((srcData + idx_row(src_y - 1, last_row) * srcStep / 4))[col]));
sum = sum + co1 * convert_float4((((srcData + idx_row(src_y , last_row) * srcStep / 4))[col]));
sum = sum + co2 * convert_float4((((srcData + idx_row(src_y + 1, last_row) * srcStep / 4))[col]));
sum = sum + co2 * convert_float4((((srcData + idx_row(src_y - 1, last_row) * srcStep / 4))[col]));
sum = sum + co1 * convert_float4((((srcData + idx_row(src_y , last_row) * srcStep / 4))[col]));
sum = sum + co2 * convert_float4((((srcData + idx_row(src_y + 1, last_row) * srcStep / 4))[col]));
sum = sum + co3 * convert_float4((((srcData + idx_row(src_y + 2, last_row) * srcStep / 4))[col]));
smem[4 + get_local_id(0)] = sum;
@ -318,18 +295,490 @@ __kernel void pyrDown_C4_D0(__global uchar4 * srcData, int srcStep, int srcRows,
const int tid2 = get_local_id(0) * 2;
sum = co3 * smem[2 + tid2 - 2];
sum = sum + co2 * smem[2 + tid2 - 1];
sum = sum + co1 * smem[2 + tid2 ];
sum = sum + co2 * smem[2 + tid2 + 1];
sum = sum + co2 * smem[2 + tid2 - 1];
sum = sum + co1 * smem[2 + tid2 ];
sum = sum + co2 * smem[2 + tid2 + 1];
sum = sum + co3 * smem[2 + tid2 + 2];
const int dst_x = (get_group_id(0) * get_local_size(0) + tid2) / 2;
if (dst_x < dstCols)
dst[y * dstStep / 4 + dst_x] = round_uchar4_float4(sum);
dst[y * dstStep / 4 + dst_x] = convert_uchar4_sat_rte(sum);
}
}
///////////////////////////////////////////////////////////////////////
////////////////////////// CV_16UC1 //////////////////////////////////
///////////////////////////////////////////////////////////////////////
__kernel void pyrDown_C1_D2(__global ushort * srcData, int srcStep, int srcRows, int srcCols, __global ushort *dst, int dstStep, int dstCols)
{
const int x = get_global_id(0);
const int y = get_group_id(1);
__local float smem[256 + 4];
float sum;
const int src_y = 2*y;
const int last_row = srcRows - 1;
const int last_col = srcCols - 1;
if (src_y >= 2 && src_y < srcRows - 2 && x >= 2 && x < srcCols - 2)
{
sum = 0.0625f * ((__global ushort*)((__global char*)srcData + (src_y - 2) * srcStep))[x];
sum = sum + 0.25f * ((__global ushort*)((__global char*)srcData + (src_y - 1) * srcStep))[x];
sum = sum + 0.375f * ((__global ushort*)((__global char*)srcData + (src_y ) * srcStep))[x];
sum = sum + 0.25f * ((__global ushort*)((__global char*)srcData + (src_y + 1) * srcStep))[x];
sum = sum + 0.0625f * ((__global ushort*)((__global char*)srcData + (src_y + 2) * srcStep))[x];
smem[2 + get_local_id(0)] = sum;
if (get_local_id(0) < 2)
{
const int left_x = x - 2;
sum = 0.0625f * ((__global ushort*)((__global char*)srcData + (src_y - 2) * srcStep))[left_x];
sum = sum + 0.25f * ((__global ushort*)((__global char*)srcData + (src_y - 1) * srcStep))[left_x];
sum = sum + 0.375f * ((__global ushort*)((__global char*)srcData + (src_y ) * srcStep))[left_x];
sum = sum + 0.25f * ((__global ushort*)((__global char*)srcData + (src_y + 1) * srcStep))[left_x];
sum = sum + 0.0625f * ((__global ushort*)((__global char*)srcData + (src_y + 2) * srcStep))[left_x];
smem[get_local_id(0)] = sum;
}
if (get_local_id(0) > 253)
{
const int right_x = x + 2;
sum = 0.0625f * ((__global ushort*)((__global char*)srcData + (src_y - 2) * srcStep))[right_x];
sum = sum + 0.25f * ((__global ushort*)((__global char*)srcData + (src_y - 1) * srcStep))[right_x];
sum = sum + 0.375f * ((__global ushort*)((__global char*)srcData + (src_y ) * srcStep))[right_x];
sum = sum + 0.25f * ((__global ushort*)((__global char*)srcData + (src_y + 1) * srcStep))[right_x];
sum = sum + 0.0625f * ((__global ushort*)((__global char*)srcData + (src_y + 2) * srcStep))[right_x];
smem[4 + get_local_id(0)] = sum;
}
}
else
{
int col = idx_col(x, last_col);
sum = 0.0625f * ((__global ushort*)((__global char*)srcData + idx_row(src_y - 2, last_row) * srcStep))[col];
sum = sum + 0.25f * ((__global ushort*)((__global char*)srcData + idx_row(src_y - 1, last_row) * srcStep))[col];
sum = sum + 0.375f * ((__global ushort*)((__global char*)srcData + idx_row(src_y , last_row) * srcStep))[col];
sum = sum + 0.25f * ((__global ushort*)((__global char*)srcData + idx_row(src_y + 1, last_row) * srcStep))[col];
sum = sum + 0.0625f * ((__global ushort*)((__global char*)srcData + idx_row(src_y + 2, last_row) * srcStep))[col];
smem[2 + get_local_id(0)] = sum;
if (get_local_id(0) < 2)
{
const int left_x = x - 2;
col = idx_col(left_x, last_col);
sum = 0.0625f * ((__global ushort*)((__global char*)srcData + idx_row(src_y - 2, last_row) * srcStep))[col];
sum = sum + 0.25f * ((__global ushort*)((__global char*)srcData + idx_row(src_y - 1, last_row) * srcStep))[col];
sum = sum + 0.375f * ((__global ushort*)((__global char*)srcData + idx_row(src_y , last_row) * srcStep))[col];
sum = sum + 0.25f * ((__global ushort*)((__global char*)srcData + idx_row(src_y + 1, last_row) * srcStep))[col];
sum = sum + 0.0625f * ((__global ushort*)((__global char*)srcData + idx_row(src_y + 2, last_row) * srcStep))[col];
smem[get_local_id(0)] = sum;
}
if (get_local_id(0) > 253)
{
const int right_x = x + 2;
col = idx_col(right_x, last_col);
sum = 0.0625f * ((__global ushort*)((__global char*)srcData + idx_row(src_y - 2, last_row) * srcStep))[col];
sum = sum + 0.25f * ((__global ushort*)((__global char*)srcData + idx_row(src_y - 1, last_row) * srcStep))[col];
sum = sum + 0.375f * ((__global ushort*)((__global char*)srcData + idx_row(src_y , last_row) * srcStep))[col];
sum = sum + 0.25f * ((__global ushort*)((__global char*)srcData + idx_row(src_y + 1, last_row) * srcStep))[col];
sum = sum + 0.0625f * ((__global ushort*)((__global char*)srcData + idx_row(src_y + 2, last_row) * srcStep))[col];
smem[4 + get_local_id(0)] = sum;
}
}
barrier(CLK_LOCAL_MEM_FENCE);
if (get_local_id(0) < 128)
{
const int tid2 = get_local_id(0) * 2;
sum = 0.0625f * smem[2 + tid2 - 2];
sum = sum + 0.25f * smem[2 + tid2 - 1];
sum = sum + 0.375f * smem[2 + tid2 ];
sum = sum + 0.25f * smem[2 + tid2 + 1];
sum = sum + 0.0625f * smem[2 + tid2 + 2];
const int dst_x = (get_group_id(0) * get_local_size(0) + tid2) / 2;
if (dst_x < dstCols)
dst[y * dstStep / 2 + dst_x] = convert_ushort_sat_rte(sum);
}
}
///////////////////////////////////////////////////////////////////////
////////////////////////// CV_16UC4 //////////////////////////////////
///////////////////////////////////////////////////////////////////////
__kernel void pyrDown_C4_D2(__global ushort4 * srcData, int srcStep, int srcRows, int srcCols, __global ushort4 *dst, int dstStep, int dstCols)
{
const int x = get_global_id(0);
const int y = get_group_id(1);
__local float4 smem[256 + 4];
float4 sum;
const int src_y = 2*y;
const int last_row = srcRows - 1;
const int last_col = srcCols - 1;
float4 co1 = 0.375f;
float4 co2 = 0.25f;
float4 co3 = 0.0625f;
if (src_y >= 2 && src_y < srcRows - 2 && x >= 2 && x < srcCols - 2)
{
sum = co3 * convert_float4(((__global ushort4*)((__global char4*)srcData + (src_y - 2) * srcStep / 4))[x]);
sum = sum + co2 * convert_float4(((__global ushort4*)((__global char4*)srcData + (src_y - 1) * srcStep / 4))[x]);
sum = sum + co1 * convert_float4(((__global ushort4*)((__global char4*)srcData + (src_y ) * srcStep / 4))[x]);
sum = sum + co2 * convert_float4(((__global ushort4*)((__global char4*)srcData + (src_y + 1) * srcStep / 4))[x]);
sum = sum + co3 * convert_float4(((__global ushort4*)((__global char4*)srcData + (src_y + 2) * srcStep / 4))[x]);
smem[2 + get_local_id(0)] = sum;
if (get_local_id(0) < 2)
{
const int left_x = x - 2;
sum = co3 * convert_float4(((__global ushort4*)((__global char4*)srcData + (src_y - 2) * srcStep / 4))[left_x]);
sum = sum + co2 * convert_float4(((__global ushort4*)((__global char4*)srcData + (src_y - 1) * srcStep / 4))[left_x]);
sum = sum + co1 * convert_float4(((__global ushort4*)((__global char4*)srcData + (src_y ) * srcStep / 4))[left_x]);
sum = sum + co2 * convert_float4(((__global ushort4*)((__global char4*)srcData + (src_y + 1) * srcStep / 4))[left_x]);
sum = sum + co3 * convert_float4(((__global ushort4*)((__global char4*)srcData + (src_y + 2) * srcStep / 4))[left_x]);
smem[get_local_id(0)] = sum;
}
if (get_local_id(0) > 253)
{
const int right_x = x + 2;
sum = co3 * convert_float4(((__global ushort4*)((__global char4*)srcData + (src_y - 2) * srcStep / 4))[right_x]);
sum = sum + co2 * convert_float4(((__global ushort4*)((__global char4*)srcData + (src_y - 1) * srcStep / 4))[right_x]);
sum = sum + co1 * convert_float4(((__global ushort4*)((__global char4*)srcData + (src_y ) * srcStep / 4))[right_x]);
sum = sum + co2 * convert_float4(((__global ushort4*)((__global char4*)srcData + (src_y + 1) * srcStep / 4))[right_x]);
sum = sum + co3 * convert_float4(((__global ushort4*)((__global char4*)srcData + (src_y + 2) * srcStep / 4))[right_x]);
smem[4 + get_local_id(0)] = sum;
}
}
else
{
int col = idx_col(x, last_col);
sum = co3 * convert_float4(((__global ushort4*)((__global char4*)srcData + idx_row(src_y - 2, last_row) * srcStep / 4))[col]);
sum = sum + co2 * convert_float4(((__global ushort4*)((__global char4*)srcData + idx_row(src_y - 1, last_row) * srcStep / 4))[col]);
sum = sum + co1 * convert_float4(((__global ushort4*)((__global char4*)srcData + idx_row(src_y , last_row) * srcStep / 4))[col]);
sum = sum + co2 * convert_float4(((__global ushort4*)((__global char4*)srcData + idx_row(src_y + 1, last_row) * srcStep / 4))[col]);
sum = sum + co3 * convert_float4(((__global ushort4*)((__global char4*)srcData + idx_row(src_y + 2, last_row) * srcStep / 4))[col]);
smem[2 + get_local_id(0)] = sum;
if (get_local_id(0) < 2)
{
const int left_x = x - 2;
col = idx_col(left_x, last_col);
sum = co3 * convert_float4(((__global ushort4*)((__global char4*)srcData + idx_row(src_y - 2, last_row) * srcStep / 4))[col]);
sum = sum + co2 * convert_float4(((__global ushort4*)((__global char4*)srcData + idx_row(src_y - 1, last_row) * srcStep / 4))[col]);
sum = sum + co1 * convert_float4(((__global ushort4*)((__global char4*)srcData + idx_row(src_y , last_row) * srcStep / 4))[col]);
sum = sum + co2 * convert_float4(((__global ushort4*)((__global char4*)srcData + idx_row(src_y + 1, last_row) * srcStep / 4))[col]);
sum = sum + co3 * convert_float4(((__global ushort4*)((__global char4*)srcData + idx_row(src_y + 2, last_row) * srcStep / 4))[col]);
smem[get_local_id(0)] = sum;
}
if (get_local_id(0) > 253)
{
const int right_x = x + 2;
col = idx_col(right_x, last_col);
sum = co3 * convert_float4(((__global ushort4*)((__global char4*)srcData + idx_row(src_y - 2, last_row) * srcStep / 4))[col]);
sum = sum + co2 * convert_float4(((__global ushort4*)((__global char4*)srcData + idx_row(src_y - 1, last_row) * srcStep / 4))[col]);
sum = sum + co1 * convert_float4(((__global ushort4*)((__global char4*)srcData + idx_row(src_y , last_row) * srcStep / 4))[col]);
sum = sum + co2 * convert_float4(((__global ushort4*)((__global char4*)srcData + idx_row(src_y + 1, last_row) * srcStep / 4))[col]);
sum = sum + co3 * convert_float4(((__global ushort4*)((__global char4*)srcData + idx_row(src_y + 2, last_row) * srcStep / 4))[col]);
smem[4 + get_local_id(0)] = sum;
}
}
barrier(CLK_LOCAL_MEM_FENCE);
if (get_local_id(0) < 128)
{
const int tid2 = get_local_id(0) * 2;
sum = co3 * smem[2 + tid2 - 2];
sum = sum + co2 * smem[2 + tid2 - 1];
sum = sum + co1 * smem[2 + tid2 ];
sum = sum + co2 * smem[2 + tid2 + 1];
sum = sum + co3 * smem[2 + tid2 + 2];
const int dst_x = (get_group_id(0) * get_local_size(0) + tid2) / 2;
if (dst_x < dstCols)
dst[y * dstStep / 8 + dst_x] = convert_ushort4_sat_rte(sum);
}
}
///////////////////////////////////////////////////////////////////////
////////////////////////// CV_16SC1 //////////////////////////////////
///////////////////////////////////////////////////////////////////////
__kernel void pyrDown_C1_D3(__global short * srcData, int srcStep, int srcRows, int srcCols, __global short *dst, int dstStep, int dstCols)
{
const int x = get_global_id(0);
const int y = get_group_id(1);
__local float smem[256 + 4];
float sum;
const int src_y = 2*y;
const int last_row = srcRows - 1;
const int last_col = srcCols - 1;
if (src_y >= 2 && src_y < srcRows - 2 && x >= 2 && x < srcCols - 2)
{
sum = 0.0625f * ((__global short*)((__global char*)srcData + (src_y - 2) * srcStep))[x];
sum = sum + 0.25f * ((__global short*)((__global char*)srcData + (src_y - 1) * srcStep))[x];
sum = sum + 0.375f * ((__global short*)((__global char*)srcData + (src_y ) * srcStep))[x];
sum = sum + 0.25f * ((__global short*)((__global char*)srcData + (src_y + 1) * srcStep))[x];
sum = sum + 0.0625f * ((__global short*)((__global char*)srcData + (src_y + 2) * srcStep))[x];
smem[2 + get_local_id(0)] = sum;
if (get_local_id(0) < 2)
{
const int left_x = x - 2;
sum = 0.0625f * ((__global short*)((__global char*)srcData + (src_y - 2) * srcStep))[left_x];
sum = sum + 0.25f * ((__global short*)((__global char*)srcData + (src_y - 1) * srcStep))[left_x];
sum = sum + 0.375f * ((__global short*)((__global char*)srcData + (src_y ) * srcStep))[left_x];
sum = sum + 0.25f * ((__global short*)((__global char*)srcData + (src_y + 1) * srcStep))[left_x];
sum = sum + 0.0625f * ((__global short*)((__global char*)srcData + (src_y + 2) * srcStep))[left_x];
smem[get_local_id(0)] = sum;
}
if (get_local_id(0) > 253)
{
const int right_x = x + 2;
sum = 0.0625f * ((__global short*)((__global char*)srcData + (src_y - 2) * srcStep))[right_x];
sum = sum + 0.25f * ((__global short*)((__global char*)srcData + (src_y - 1) * srcStep))[right_x];
sum = sum + 0.375f * ((__global short*)((__global char*)srcData + (src_y ) * srcStep))[right_x];
sum = sum + 0.25f * ((__global short*)((__global char*)srcData + (src_y + 1) * srcStep))[right_x];
sum = sum + 0.0625f * ((__global short*)((__global char*)srcData + (src_y + 2) * srcStep))[right_x];
smem[4 + get_local_id(0)] = sum;
}
}
else
{
int col = idx_col(x, last_col);
sum = 0.0625f * ((__global short*)((__global char*)srcData + idx_row(src_y - 2, last_row) * srcStep))[col];
sum = sum + 0.25f * ((__global short*)((__global char*)srcData + idx_row(src_y - 1, last_row) * srcStep))[col];
sum = sum + 0.375f * ((__global short*)((__global char*)srcData + idx_row(src_y , last_row) * srcStep))[col];
sum = sum + 0.25f * ((__global short*)((__global char*)srcData + idx_row(src_y + 1, last_row) * srcStep))[col];
sum = sum + 0.0625f * ((__global short*)((__global char*)srcData + idx_row(src_y + 2, last_row) * srcStep))[col];
smem[2 + get_local_id(0)] = sum;
if (get_local_id(0) < 2)
{
const int left_x = x - 2;
col = idx_col(left_x, last_col);
sum = 0.0625f * ((__global short*)((__global char*)srcData + idx_row(src_y - 2, last_row) * srcStep))[col];
sum = sum + 0.25f * ((__global short*)((__global char*)srcData + idx_row(src_y - 1, last_row) * srcStep))[col];
sum = sum + 0.375f * ((__global short*)((__global char*)srcData + idx_row(src_y , last_row) * srcStep))[col];
sum = sum + 0.25f * ((__global short*)((__global char*)srcData + idx_row(src_y + 1, last_row) * srcStep))[col];
sum = sum + 0.0625f * ((__global short*)((__global char*)srcData + idx_row(src_y + 2, last_row) * srcStep))[col];
smem[get_local_id(0)] = sum;
}
if (get_local_id(0) > 253)
{
const int right_x = x + 2;
col = idx_col(right_x, last_col);
sum = 0.0625f * ((__global short*)((__global char*)srcData + idx_row(src_y - 2, last_row) * srcStep))[col];
sum = sum + 0.25f * ((__global short*)((__global char*)srcData + idx_row(src_y - 1, last_row) * srcStep))[col];
sum = sum + 0.375f * ((__global short*)((__global char*)srcData + idx_row(src_y , last_row) * srcStep))[col];
sum = sum + 0.25f * ((__global short*)((__global char*)srcData + idx_row(src_y + 1, last_row) * srcStep))[col];
sum = sum + 0.0625f * ((__global short*)((__global char*)srcData + idx_row(src_y + 2, last_row) * srcStep))[col];
smem[4 + get_local_id(0)] = sum;
}
}
barrier(CLK_LOCAL_MEM_FENCE);
if (get_local_id(0) < 128)
{
const int tid2 = get_local_id(0) * 2;
sum = 0.0625f * smem[2 + tid2 - 2];
sum = sum + 0.25f * smem[2 + tid2 - 1];
sum = sum + 0.375f * smem[2 + tid2 ];
sum = sum + 0.25f * smem[2 + tid2 + 1];
sum = sum + 0.0625f * smem[2 + tid2 + 2];
const int dst_x = (get_group_id(0) * get_local_size(0) + tid2) / 2;
if (dst_x < dstCols)
dst[y * dstStep / 2 + dst_x] = convert_short_sat_rte(sum);
}
}
///////////////////////////////////////////////////////////////////////
////////////////////////// CV_16SC4 //////////////////////////////////
///////////////////////////////////////////////////////////////////////
__kernel void pyrDown_C4_D3(__global short4 * srcData, int srcStep, int srcRows, int srcCols, __global short4 *dst, int dstStep, int dstCols)
{
const int x = get_global_id(0);
const int y = get_group_id(1);
__local float4 smem[256 + 4];
float4 sum;
const int src_y = 2*y;
const int last_row = srcRows - 1;
const int last_col = srcCols - 1;
float4 co1 = 0.375f;
float4 co2 = 0.25f;
float4 co3 = 0.0625f;
if (src_y >= 2 && src_y < srcRows - 2 && x >= 2 && x < srcCols - 2)
{
sum = co3 * convert_float4(((__global short4*)((__global char4*)srcData + (src_y - 2) * srcStep / 4))[x]);
sum = sum + co2 * convert_float4(((__global short4*)((__global char4*)srcData + (src_y - 1) * srcStep / 4))[x]);
sum = sum + co1 * convert_float4(((__global short4*)((__global char4*)srcData + (src_y ) * srcStep / 4))[x]);
sum = sum + co2 * convert_float4(((__global short4*)((__global char4*)srcData + (src_y + 1) * srcStep / 4))[x]);
sum = sum + co3 * convert_float4(((__global short4*)((__global char4*)srcData + (src_y + 2) * srcStep / 4))[x]);
smem[2 + get_local_id(0)] = sum;
if (get_local_id(0) < 2)
{
const int left_x = x - 2;
sum = co3 * convert_float4(((__global short4*)((__global char4*)srcData + (src_y - 2) * srcStep / 4))[left_x]);
sum = sum + co2 * convert_float4(((__global short4*)((__global char4*)srcData + (src_y - 1) * srcStep / 4))[left_x]);
sum = sum + co1 * convert_float4(((__global short4*)((__global char4*)srcData + (src_y ) * srcStep / 4))[left_x]);
sum = sum + co2 * convert_float4(((__global short4*)((__global char4*)srcData + (src_y + 1) * srcStep / 4))[left_x]);
sum = sum + co3 * convert_float4(((__global short4*)((__global char4*)srcData + (src_y + 2) * srcStep / 4))[left_x]);
smem[get_local_id(0)] = sum;
}
if (get_local_id(0) > 253)
{
const int right_x = x + 2;
sum = co3 * convert_float4(((__global short4*)((__global char4*)srcData + (src_y - 2) * srcStep / 4))[right_x]);
sum = sum + co2 * convert_float4(((__global short4*)((__global char4*)srcData + (src_y - 1) * srcStep / 4))[right_x]);
sum = sum + co1 * convert_float4(((__global short4*)((__global char4*)srcData + (src_y ) * srcStep / 4))[right_x]);
sum = sum + co2 * convert_float4(((__global short4*)((__global char4*)srcData + (src_y + 1) * srcStep / 4))[right_x]);
sum = sum + co3 * convert_float4(((__global short4*)((__global char4*)srcData + (src_y + 2) * srcStep / 4))[right_x]);
smem[4 + get_local_id(0)] = sum;
}
}
else
{
int col = idx_col(x, last_col);
sum = co3 * convert_float4(((__global short4*)((__global char4*)srcData + idx_row(src_y - 2, last_row) * srcStep / 4))[col]);
sum = sum + co2 * convert_float4(((__global short4*)((__global char4*)srcData + idx_row(src_y - 1, last_row) * srcStep / 4))[col]);
sum = sum + co1 * convert_float4(((__global short4*)((__global char4*)srcData + idx_row(src_y , last_row) * srcStep / 4))[col]);
sum = sum + co2 * convert_float4(((__global short4*)((__global char4*)srcData + idx_row(src_y + 1, last_row) * srcStep / 4))[col]);
sum = sum + co3 * convert_float4(((__global short4*)((__global char4*)srcData + idx_row(src_y + 2, last_row) * srcStep / 4))[col]);
smem[2 + get_local_id(0)] = sum;
if (get_local_id(0) < 2)
{
const int left_x = x - 2;
col = idx_col(left_x, last_col);
sum = co3 * convert_float4(((__global short4*)((__global char4*)srcData + idx_row(src_y - 2, last_row) * srcStep / 4))[col]);
sum = sum + co2 * convert_float4(((__global short4*)((__global char4*)srcData + idx_row(src_y - 1, last_row) * srcStep / 4))[col]);
sum = sum + co1 * convert_float4(((__global short4*)((__global char4*)srcData + idx_row(src_y , last_row) * srcStep / 4))[col]);
sum = sum + co2 * convert_float4(((__global short4*)((__global char4*)srcData + idx_row(src_y + 1, last_row) * srcStep / 4))[col]);
sum = sum + co3 * convert_float4(((__global short4*)((__global char4*)srcData + idx_row(src_y + 2, last_row) * srcStep / 4))[col]);
smem[get_local_id(0)] = sum;
}
if (get_local_id(0) > 253)
{
const int right_x = x + 2;
col = idx_col(right_x, last_col);
sum = co3 * convert_float4(((__global short4*)((__global char4*)srcData + idx_row(src_y - 2, last_row) * srcStep / 4))[col]);
sum = sum + co2 * convert_float4(((__global short4*)((__global char4*)srcData + idx_row(src_y - 1, last_row) * srcStep / 4))[col]);
sum = sum + co1 * convert_float4(((__global short4*)((__global char4*)srcData + idx_row(src_y , last_row) * srcStep / 4))[col]);
sum = sum + co2 * convert_float4(((__global short4*)((__global char4*)srcData + idx_row(src_y + 1, last_row) * srcStep / 4))[col]);
sum = sum + co3 * convert_float4(((__global short4*)((__global char4*)srcData + idx_row(src_y + 2, last_row) * srcStep / 4))[col]);
smem[4 + get_local_id(0)] = sum;
}
}
barrier(CLK_LOCAL_MEM_FENCE);
if (get_local_id(0) < 128)
{
const int tid2 = get_local_id(0) * 2;
sum = co3 * smem[2 + tid2 - 2];
sum = sum + co2 * smem[2 + tid2 - 1];
sum = sum + co1 * smem[2 + tid2 ];
sum = sum + co2 * smem[2 + tid2 + 1];
sum = sum + co3 * smem[2 + tid2 + 2];
const int dst_x = (get_group_id(0) * get_local_size(0) + tid2) / 2;
if (dst_x < dstCols)
dst[y * dstStep / 8 + dst_x] = convert_short4_sat_rte(sum);
}
}
///////////////////////////////////////////////////////////////////////
////////////////////////// CV_32FC1 //////////////////////////////////
///////////////////////////////////////////////////////////////////////
__kernel void pyrDown_C1_D5(__global float * srcData, int srcStep, int srcRows, int srcCols, __global float *dst, int dstStep, int dstCols)
{
const int x = get_global_id(0);
@ -441,6 +890,10 @@ __kernel void pyrDown_C1_D5(__global float * srcData, int srcStep, int srcRows,
}
}
///////////////////////////////////////////////////////////////////////
////////////////////////// CV_32FC4 //////////////////////////////////
///////////////////////////////////////////////////////////////////////
__kernel void pyrDown_C4_D5(__global float4 * srcData, int srcStep, int srcRows, int srcCols, __global float4 *dst, int dstStep, int dstCols)
{
const int x = get_global_id(0);
@ -454,16 +907,16 @@ __kernel void pyrDown_C4_D5(__global float4 * srcData, int srcStep, int srcRows,
const int last_row = srcRows - 1;
const int last_col = srcCols - 1;
float4 co1 = 0.375f;//(float4)(0.375f, 0.375f, 0.375f, 0.375f);
float4 co2 = 0.25f;//(float4)(0.25f, 0.25f, 0.25f, 0.25f);
float4 co3 = 0.0625f;//(float4)(0.0625f, 0.0625f, 0.0625f, 0.0625f);
float4 co1 = 0.375f;
float4 co2 = 0.25f;
float4 co3 = 0.0625f;
if (src_y >= 2 && src_y < srcRows - 2 && x >= 2 && x < srcCols - 2)
{
sum = co3 * ((__global float4*)((__global char4*)srcData + (src_y - 2) * srcStep / 4))[x];
sum = sum + co2 * ((__global float4*)((__global char4*)srcData + (src_y - 1) * srcStep / 4))[x];
sum = sum + co1 * ((__global float4*)((__global char4*)srcData + (src_y ) * srcStep / 4))[x];
sum = sum + co2 * ((__global float4*)((__global char4*)srcData + (src_y + 1) * srcStep / 4))[x];
sum = sum + co2 * ((__global float4*)((__global char4*)srcData + (src_y - 1) * srcStep / 4))[x];
sum = sum + co1 * ((__global float4*)((__global char4*)srcData + (src_y ) * srcStep / 4))[x];
sum = sum + co2 * ((__global float4*)((__global char4*)srcData + (src_y + 1) * srcStep / 4))[x];
sum = sum + co3 * ((__global float4*)((__global char4*)srcData + (src_y + 2) * srcStep / 4))[x];
smem[2 + get_local_id(0)] = sum;
@ -473,9 +926,9 @@ __kernel void pyrDown_C4_D5(__global float4 * srcData, int srcStep, int srcRows,
const int left_x = x - 2;
sum = co3 * ((__global float4*)((__global char4*)srcData + (src_y - 2) * srcStep / 4))[left_x];
sum = sum + co2 * ((__global float4*)((__global char4*)srcData + (src_y - 1) * srcStep / 4))[left_x];
sum = sum + co1 * ((__global float4*)((__global char4*)srcData + (src_y ) * srcStep / 4))[left_x];
sum = sum + co2 * ((__global float4*)((__global char4*)srcData + (src_y + 1) * srcStep / 4))[left_x];
sum = sum + co2 * ((__global float4*)((__global char4*)srcData + (src_y - 1) * srcStep / 4))[left_x];
sum = sum + co1 * ((__global float4*)((__global char4*)srcData + (src_y ) * srcStep / 4))[left_x];
sum = sum + co2 * ((__global float4*)((__global char4*)srcData + (src_y + 1) * srcStep / 4))[left_x];
sum = sum + co3 * ((__global float4*)((__global char4*)srcData + (src_y + 2) * srcStep / 4))[left_x];
smem[get_local_id(0)] = sum;
@ -486,9 +939,9 @@ __kernel void pyrDown_C4_D5(__global float4 * srcData, int srcStep, int srcRows,
const int right_x = x + 2;
sum = co3 * ((__global float4*)((__global char4*)srcData + (src_y - 2) * srcStep / 4))[right_x];
sum = sum + co2 * ((__global float4*)((__global char4*)srcData + (src_y - 1) * srcStep / 4))[right_x];
sum = sum + co1 * ((__global float4*)((__global char4*)srcData + (src_y ) * srcStep / 4))[right_x];
sum = sum + co2 * ((__global float4*)((__global char4*)srcData + (src_y + 1) * srcStep / 4))[right_x];
sum = sum + co2 * ((__global float4*)((__global char4*)srcData + (src_y - 1) * srcStep / 4))[right_x];
sum = sum + co1 * ((__global float4*)((__global char4*)srcData + (src_y ) * srcStep / 4))[right_x];
sum = sum + co2 * ((__global float4*)((__global char4*)srcData + (src_y + 1) * srcStep / 4))[right_x];
sum = sum + co3 * ((__global float4*)((__global char4*)srcData + (src_y + 2) * srcStep / 4))[right_x];
smem[4 + get_local_id(0)] = sum;
@ -499,9 +952,9 @@ __kernel void pyrDown_C4_D5(__global float4 * srcData, int srcStep, int srcRows,
int col = idx_col(x, last_col);
sum = co3 * ((__global float4*)((__global char4*)srcData + idx_row(src_y - 2, last_row) * srcStep / 4))[col];
sum = sum + co2 * ((__global float4*)((__global char4*)srcData + idx_row(src_y - 1, last_row) * srcStep / 4))[col];
sum = sum + co1 * ((__global float4*)((__global char4*)srcData + idx_row(src_y , last_row) * srcStep / 4))[col];
sum = sum + co2 * ((__global float4*)((__global char4*)srcData + idx_row(src_y + 1, last_row) * srcStep / 4))[col];
sum = sum + co2 * ((__global float4*)((__global char4*)srcData + idx_row(src_y - 1, last_row) * srcStep / 4))[col];
sum = sum + co1 * ((__global float4*)((__global char4*)srcData + idx_row(src_y , last_row) * srcStep / 4))[col];
sum = sum + co2 * ((__global float4*)((__global char4*)srcData + idx_row(src_y + 1, last_row) * srcStep / 4))[col];
sum = sum + co3 * ((__global float4*)((__global char4*)srcData + idx_row(src_y + 2, last_row) * srcStep / 4))[col];
smem[2 + get_local_id(0)] = sum;
@ -513,9 +966,9 @@ __kernel void pyrDown_C4_D5(__global float4 * srcData, int srcStep, int srcRows,
col = idx_col(left_x, last_col);
sum = co3 * ((__global float4*)((__global char4*)srcData + idx_row(src_y - 2, last_row) * srcStep / 4))[col];
sum = sum + co2 * ((__global float4*)((__global char4*)srcData + idx_row(src_y - 1, last_row) * srcStep / 4))[col];
sum = sum + co1 * ((__global float4*)((__global char4*)srcData + idx_row(src_y , last_row) * srcStep / 4))[col];
sum = sum + co2 * ((__global float4*)((__global char4*)srcData + idx_row(src_y + 1, last_row) * srcStep / 4))[col];
sum = sum + co2 * ((__global float4*)((__global char4*)srcData + idx_row(src_y - 1, last_row) * srcStep / 4))[col];
sum = sum + co1 * ((__global float4*)((__global char4*)srcData + idx_row(src_y , last_row) * srcStep / 4))[col];
sum = sum + co2 * ((__global float4*)((__global char4*)srcData + idx_row(src_y + 1, last_row) * srcStep / 4))[col];
sum = sum + co3 * ((__global float4*)((__global char4*)srcData + idx_row(src_y + 2, last_row) * srcStep / 4))[col];
smem[get_local_id(0)] = sum;
@ -528,9 +981,9 @@ __kernel void pyrDown_C4_D5(__global float4 * srcData, int srcStep, int srcRows,
col = idx_col(right_x, last_col);
sum = co3 * ((__global float4*)((__global char4*)srcData + idx_row(src_y - 2, last_row) * srcStep / 4))[col];
sum = sum + co2 * ((__global float4*)((__global char4*)srcData + idx_row(src_y - 1, last_row) * srcStep / 4))[col];
sum = sum + co1 * ((__global float4*)((__global char4*)srcData + idx_row(src_y , last_row) * srcStep / 4))[col];
sum = sum + co2 * ((__global float4*)((__global char4*)srcData + idx_row(src_y + 1, last_row) * srcStep / 4))[col];
sum = sum + co2 * ((__global float4*)((__global char4*)srcData + idx_row(src_y - 1, last_row) * srcStep / 4))[col];
sum = sum + co1 * ((__global float4*)((__global char4*)srcData + idx_row(src_y , last_row) * srcStep / 4))[col];
sum = sum + co2 * ((__global float4*)((__global char4*)srcData + idx_row(src_y + 1, last_row) * srcStep / 4))[col];
sum = sum + co3 * ((__global float4*)((__global char4*)srcData + idx_row(src_y + 2, last_row) * srcStep / 4))[col];
smem[4 + get_local_id(0)] = sum;
@ -544,9 +997,9 @@ __kernel void pyrDown_C4_D5(__global float4 * srcData, int srcStep, int srcRows,
const int tid2 = get_local_id(0) * 2;
sum = co3 * smem[2 + tid2 - 2];
sum = sum + co2 * smem[2 + tid2 - 1];
sum = sum + co1 * smem[2 + tid2 ];
sum = sum + co2 * smem[2 + tid2 + 1];
sum = sum + co2 * smem[2 + tid2 - 1];
sum = sum + co1 * smem[2 + tid2 ];
sum = sum + co2 * smem[2 + tid2 + 1];
sum = sum + co3 * smem[2 + tid2 + 2];
const int dst_x = (get_group_id(0) * get_local_size(0) + tid2) / 2;

View File

@ -46,18 +46,18 @@
//
//M*/
//#pragma OPENCL EXTENSION cl_amd_printf : enable
uchar get_valid_uchar(float data)
{
return (uchar)(data <= 255 ? data : data > 0 ? 255 : 0);
}
///////////////////////////////////////////////////////////////////////
////////////////////////// CV_8UC1 //////////////////////////////////
///////////////////////////////////////////////////////////////////////
__kernel void pyrUp_C1_D0(__global uchar* src,__global uchar* dst,
int srcRows,int dstRows,int srcCols,int dstCols,
int srcOffset,int dstOffset,int srcStep,int dstStep)
__kernel void pyrUp_C1_D0(__global uchar* src, __global uchar* dst,
int srcRows, int dstRows, int srcCols, int dstCols,
int srcOffset, int dstOffset, int srcStep, int dstStep)
{
const int x = get_global_id(0);
const int y = get_global_id(1);
@ -144,15 +144,15 @@ __kernel void pyrUp_C1_D0(__global uchar* src,__global uchar* dst,
if ((x < dstCols) && (y < dstRows))
dst[x + y * dstStep] = convert_uchar_sat_rte(4.0f * sum);
}
///////////////////////////////////////////////////////////////////////
////////////////////////// CV_16UC1 /////////////////////////////////
///////////////////////////////////////////////////////////////////////
__kernel void pyrUp_C1_D2(__global ushort* src,__global ushort* dst,
int srcRows,int dstRows,int srcCols,int dstCols,
int srcOffset,int dstOffset,int srcStep,int dstStep)
__kernel void pyrUp_C1_D2(__global ushort* src, __global ushort* dst,
int srcRows, int dstRows, int srcCols, int dstCols,
int srcOffset, int dstOffset, int srcStep, int dstStep)
{
const int x = get_global_id(0);
const int y = get_global_id(1);
@ -245,16 +245,116 @@ __kernel void pyrUp_C1_D2(__global ushort* src,__global ushort* dst,
sum = sum + 0.0625f * s_dstPatch[2 + tidy + 2][get_local_id(0)];
if ((x < dstCols) && (y < dstRows))
dst[x + y * dstStep] = convert_short_sat_rte(4.0f * sum);
dst[x + y * dstStep] = convert_ushort_sat_rte(4.0f * sum);
}
///////////////////////////////////////////////////////////////////////
////////////////////////// CV_16SC1 /////////////////////////////////
///////////////////////////////////////////////////////////////////////
__kernel void pyrUp_C1_D3(__global short* src, __global short* dst,
int srcRows, int dstRows, int srcCols, int dstCols,
int srcOffset, int dstOffset, int srcStep, int dstStep)
{
const int x = get_global_id(0);
const int y = get_global_id(1);
__local float s_srcPatch[10][10];
__local float s_dstPatch[20][16];
srcStep = srcStep >> 1;
dstStep = dstStep >> 1;
srcOffset = srcOffset >> 1;
dstOffset = dstOffset >> 1;
if( get_local_id(0) < 10 && get_local_id(1) < 10 )
{
int srcx = (int)(get_group_id(0) * get_local_size(0) / 2 + get_local_id(0)) - 1;
int srcy = (int)(get_group_id(1) * get_local_size(1) / 2 + get_local_id(1)) - 1;
srcx = abs(srcx);
srcx = min(srcCols - 1,srcx);
srcy = abs(srcy);
srcy = min(srcRows -1 ,srcy);
s_srcPatch[get_local_id(1)][get_local_id(0)] = (float)(src[srcx + srcy * srcStep]);
}
barrier(CLK_LOCAL_MEM_FENCE);
float sum = 0;
const int evenFlag = (int)((get_local_id(0) & 1) == 0);
const int oddFlag = (int)((get_local_id(0) & 1) != 0);
const bool eveny = ((get_local_id(1) & 1) == 0);
const int tidx = get_local_id(0);
if (eveny)
{
sum = sum + (evenFlag * 0.0625f) * s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx - 2) >> 1)];
sum = sum + ( oddFlag * 0.25f ) * s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx - 1) >> 1)];
sum = sum + (evenFlag * 0.375f ) * s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx ) >> 1)];
sum = sum + ( oddFlag * 0.25f ) * s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx + 1) >> 1)];
sum = sum + (evenFlag * 0.0625f) * s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx + 2) >> 1)];
}
s_dstPatch[2 + get_local_id(1)][get_local_id(0)] = sum;
if (get_local_id(1) < 2)
{
sum = 0;
if (eveny)
{
sum = sum + (evenFlag * 0.0625f) * s_srcPatch[0][1 + ((tidx - 2) >> 1)];
sum = sum + ( oddFlag * 0.25f ) * s_srcPatch[0][1 + ((tidx - 1) >> 1)];
sum = sum + (evenFlag * 0.375f ) * s_srcPatch[0][1 + ((tidx ) >> 1)];
sum = sum + ( oddFlag * 0.25f ) * s_srcPatch[0][1 + ((tidx + 1) >> 1)];
sum = sum + (evenFlag * 0.0625f) * s_srcPatch[0][1 + ((tidx + 2) >> 1)];
}
s_dstPatch[get_local_id(1)][get_local_id(0)] = sum;
}
if (get_local_id(1) > 13)
{
sum = 0;
if (eveny)
{
sum = sum + (evenFlag * 0.0625f) * s_srcPatch[9][1 + ((tidx - 2) >> 1)];
sum = sum + ( oddFlag * 0.25f ) * s_srcPatch[9][1 + ((tidx - 1) >> 1)];
sum = sum + (evenFlag * 0.375f ) * s_srcPatch[9][1 + ((tidx ) >> 1)];
sum = sum + ( oddFlag * 0.25f ) * s_srcPatch[9][1 + ((tidx + 1) >> 1)];
sum = sum + (evenFlag * 0.0625f) * s_srcPatch[9][1 + ((tidx + 2) >> 1)];
}
s_dstPatch[4 + get_local_id(1)][get_local_id(0)] = sum;
}
barrier(CLK_LOCAL_MEM_FENCE);
sum = 0;
const int tidy = get_local_id(1);
sum = sum + 0.0625f * s_dstPatch[2 + tidy - 2][get_local_id(0)];
sum = sum + 0.25f * s_dstPatch[2 + tidy - 1][get_local_id(0)];
sum = sum + 0.375f * s_dstPatch[2 + tidy ][get_local_id(0)];
sum = sum + 0.25f * s_dstPatch[2 + tidy + 1][get_local_id(0)];
sum = sum + 0.0625f * s_dstPatch[2 + tidy + 2][get_local_id(0)];
if ((x < dstCols) && (y < dstRows))
dst[x + y * dstStep] = convert_short_sat_rte(4.0f * sum);
}
///////////////////////////////////////////////////////////////////////
////////////////////////// CV_32FC1 /////////////////////////////////
///////////////////////////////////////////////////////////////////////
__kernel void pyrUp_C1_D5(__global float* src,__global float* dst,
int srcRows,int dstRows,int srcCols,int dstCols,
int srcOffset,int dstOffset,int srcStep,int dstStep)
__kernel void pyrUp_C1_D5(__global float* src, __global float* dst,
int srcRows, int dstRows, int srcCols, int dstCols,
int srcOffset, int dstOffset, int srcStep, int dstStep)
{
const int x = get_global_id(0);
const int y = get_global_id(1);
@ -346,15 +446,15 @@ __kernel void pyrUp_C1_D5(__global float* src,__global float* dst,
if ((x < dstCols) && (y < dstRows))
dst[x + y * dstStep] = (float)(4.0f * sum);
}
///////////////////////////////////////////////////////////////////////
////////////////////////// CV_8UC4 //////////////////////////////////
///////////////////////////////////////////////////////////////////////
__kernel void pyrUp_C4_D0(__global uchar4* src,__global uchar4* dst,
int srcRows,int dstRows,int srcCols,int dstCols,
int srcOffset,int dstOffset,int srcStep,int dstStep)
__kernel void pyrUp_C4_D0(__global uchar4* src, __global uchar4* dst,
int srcRows, int dstRows, int srcCols, int dstCols,
int srcOffset, int dstOffset, int srcStep, int dstStep)
{
const int x = get_global_id(0);
const int y = get_global_id(1);
@ -451,17 +551,16 @@ __kernel void pyrUp_C4_D0(__global uchar4* src,__global uchar4* dst,
sum = sum + co3 * s_dstPatch[2 + tidy + 2][tidx];
if ((x < dstCols) && (y < dstRows))
{
dst[x + y * dstStep] = convert_uchar4_sat_rte(4.0f * sum);
}
}
///////////////////////////////////////////////////////////////////////
////////////////////////// CV_16UC4 //////////////////////////////////
///////////////////////////////////////////////////////////////////////
__kernel void pyrUp_C4_D2(__global ushort4* src,__global ushort4* dst,
int srcRows,int dstRows,int srcCols,int dstCols,
int srcOffset,int dstOffset,int srcStep,int dstStep)
__kernel void pyrUp_C4_D2(__global ushort4* src, __global ushort4* dst,
int srcRows, int dstRows, int srcCols, int dstCols,
int srcOffset, int dstOffset, int srcStep, int dstStep)
{
const int x = get_global_id(0);
const int y = get_global_id(1);
@ -560,17 +659,123 @@ __kernel void pyrUp_C4_D2(__global ushort4* src,__global ushort4* dst,
sum = sum + co3 * s_dstPatch[2 + tidy + 2][get_local_id(0)];
if ((x < dstCols) && (y < dstRows))
{
dst[x + y * dstStep] = convert_ushort4_sat_rte(4.0f * sum);
}
///////////////////////////////////////////////////////////////////////
////////////////////////// CV_16SC4 //////////////////////////////////
///////////////////////////////////////////////////////////////////////
__kernel void pyrUp_C4_D3(__global short4* src, __global short4* dst,
int srcRows, int dstRows, int srcCols, int dstCols,
int srcOffset, int dstOffset, int srcStep, int dstStep)
{
const int x = get_global_id(0);
const int y = get_global_id(1);
__local float4 s_srcPatch[10][10];
__local float4 s_dstPatch[20][16];
srcOffset >>= 3;
dstOffset >>= 3;
srcStep >>= 3;
dstStep >>= 3;
if( get_local_id(0) < 10 && get_local_id(1) < 10 )
{
int srcx = (int)(get_group_id(0) * get_local_size(0) / 2 + get_local_id(0)) - 1;
int srcy = (int)(get_group_id(1) * get_local_size(1) / 2 + get_local_id(1)) - 1;
srcx = abs(srcx);
srcx = min(srcCols - 1,srcx);
srcy = abs(srcy);
srcy = min(srcRows -1 ,srcy);
s_srcPatch[get_local_id(1)][get_local_id(0)] = convert_float4(src[srcx + srcy * srcStep]);
}
barrier(CLK_LOCAL_MEM_FENCE);
float4 sum = (float4)(0,0,0,0);
const float4 evenFlag = (float4)((get_local_id(0) & 1) == 0);
const float4 oddFlag = (float4)((get_local_id(0) & 1) != 0);
const bool eveny = ((get_local_id(1) & 1) == 0);
const int tidx = get_local_id(0);
float4 co1 = (float4)(0.375f, 0.375f, 0.375f, 0.375f);
float4 co2 = (float4)(0.25f, 0.25f, 0.25f, 0.25f);
float4 co3 = (float4)(0.0625f, 0.0625f, 0.0625f, 0.0625f);
if(eveny)
{
sum = sum + ( evenFlag* co3 ) * s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx - 2) >> 1)];
sum = sum + ( oddFlag * co2 ) * s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx - 1) >> 1)];
sum = sum + ( evenFlag* co1 ) * s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx ) >> 1)];
sum = sum + ( oddFlag * co2 ) * s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx + 1) >> 1)];
sum = sum + ( evenFlag* co3 ) * s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx + 2) >> 1)];
}
s_dstPatch[2 + get_local_id(1)][get_local_id(0)] = sum;
if (get_local_id(1) < 2)
{
sum = 0;
if (eveny)
{
sum = sum + (evenFlag * co3 ) * s_srcPatch[0][1 + ((tidx - 2) >> 1)];
sum = sum + (oddFlag * co2 ) * s_srcPatch[0][1 + ((tidx - 1) >> 1)];
sum = sum + (evenFlag * co1 ) * s_srcPatch[0][1 + ((tidx ) >> 1)];
sum = sum + (oddFlag * co2 ) * s_srcPatch[0][1 + ((tidx + 1) >> 1)];
sum = sum + (evenFlag * co3 ) * s_srcPatch[0][1 + ((tidx + 2) >> 1)];
}
s_dstPatch[get_local_id(1)][get_local_id(0)] = sum;
}
if (get_local_id(1) > 13)
{
sum = 0;
if (eveny)
{
sum = sum + (evenFlag * co3) * s_srcPatch[9][1 + ((tidx - 2) >> 1)];
sum = sum + ( oddFlag * co2) * s_srcPatch[9][1 + ((tidx - 1) >> 1)];
sum = sum + (evenFlag * co1) * s_srcPatch[9][1 + ((tidx ) >> 1)];
sum = sum + ( oddFlag * co2) * s_srcPatch[9][1 + ((tidx + 1) >> 1)];
sum = sum + (evenFlag * co3) * s_srcPatch[9][1 + ((tidx + 2) >> 1)];
}
s_dstPatch[4 + get_local_id(1)][get_local_id(0)] = sum;
}
barrier(CLK_LOCAL_MEM_FENCE);
sum = 0;
const int tidy = get_local_id(1);
sum = sum + co3 * s_dstPatch[2 + tidy - 2][get_local_id(0)];
sum = sum + co2 * s_dstPatch[2 + tidy - 1][get_local_id(0)];
sum = sum + co1 * s_dstPatch[2 + tidy ][get_local_id(0)];
sum = sum + co2 * s_dstPatch[2 + tidy + 1][get_local_id(0)];
sum = sum + co3 * s_dstPatch[2 + tidy + 2][get_local_id(0)];
if ((x < dstCols) && (y < dstRows))
dst[x + y * dstStep] = convert_short4_sat_rte(4.0f * sum);
}
///////////////////////////////////////////////////////////////////////
////////////////////////// CV_32FC4 //////////////////////////////////
///////////////////////////////////////////////////////////////////////
__kernel void pyrUp_C4_D5(__global float4* src,__global float4* dst,
int srcRows,int dstRows,int srcCols,int dstCols,
int srcOffset,int dstOffset,int srcStep,int dstStep)
__kernel void pyrUp_C4_D5(__global float4* src, __global float4* dst,
int srcRows, int dstRows, int srcCols, int dstCols,
int srcOffset, int dstOffset, int srcStep, int dstStep)
{
const int x = get_global_id(0);
const int y = get_global_id(1);
@ -667,7 +872,5 @@ __kernel void pyrUp_C4_D5(__global float4* src,__global float4* dst,
sum = sum + co3 * s_dstPatch[2 + tidy + 2][tidx];
if ((x < dstCols) && (y < dstRows))
{
dst[x + y * dstStep] = 4.0f * sum;
}
}

View File

@ -73,24 +73,11 @@ static void pyrdown_run(const oclMat &src, const oclMat &dst)
CV_Assert(src.depth() != CV_8S);
Context *clCxt = src.clCxt;
//int channels = dst.channels();
//int depth = dst.depth();
string kernelName = "pyrDown";
//int vector_lengths[4][7] = {{4, 0, 4, 4, 1, 1, 1},
// {4, 0, 4, 4, 1, 1, 1},
// {4, 0, 4, 4, 1, 1, 1},
// {4, 0, 4, 4, 1, 1, 1}
//};
//size_t vector_length = vector_lengths[channels-1][depth];
//int offset_cols = (dst.offset / dst.elemSize1()) & (vector_length - 1);
size_t localThreads[3] = { 256, 1, 1 };
size_t globalThreads[3] = { src.cols, dst.rows, 1};
//int dst_step1 = dst.cols * dst.elemSize();
vector<pair<size_t , const void *> > args;
args.push_back( make_pair( sizeof(cl_mem), (void *)&src.data ));
args.push_back( make_pair( sizeof(cl_int), (void *)&src.step ));
@ -107,7 +94,9 @@ static void pyrdown_run(const oclMat &src, const oclMat &dst)
void cv::ocl::pyrDown(const oclMat &src, oclMat &dst)
{
CV_Assert(src.depth() <= CV_32F && src.channels() <= 4);
int depth = src.depth(), channels = src.channels();
CV_Assert(depth == CV_8U || depth == CV_16U || depth == CV_16S || depth == CV_32F);
CV_Assert(channels == 1 || channels == 3 || channels == 4);
dst.create((src.rows + 1) / 2, (src.cols + 1) / 2, src.type());

View File

@ -61,6 +61,11 @@ namespace cv
extern const char *pyr_up;
void pyrUp(const cv::ocl::oclMat &src, cv::ocl::oclMat &dst)
{
int depth = src.depth(), channels = src.channels();
CV_Assert(depth == CV_8U || depth == CV_16U || depth == CV_16S || depth == CV_32F);
CV_Assert(channels == 1 || channels == 3 || channels == 4);
dst.create(src.rows * 2, src.cols * 2, src.type());
Context *clCxt = src.clCxt;

View File

@ -57,60 +57,63 @@ using namespace std;
PARAM_TEST_CASE(PyrBase, MatType, int)
{
int type;
int depth;
int channels;
Mat dst_cpu;
oclMat gdst;
virtual void SetUp()
{
type = GET_PARAM(0);
depth = GET_PARAM(0);
channels = GET_PARAM(1);
}
};
/////////////////////// PyrDown //////////////////////////
struct PyrDown : PyrBase {};
typedef PyrBase PyrDown;
TEST_P(PyrDown, Mat)
{
for(int j = 0; j < LOOP_TIMES; j++)
for (int j = 0; j < LOOP_TIMES; j++)
{
Size size(MWIDTH, MHEIGHT);
Mat src = randomMat(size, CV_MAKETYPE(type, channels));
Mat src = randomMat(size, CV_MAKETYPE(depth, channels));
oclMat gsrc(src);
pyrDown(src, dst_cpu);
pyrDown(gsrc, gdst);
EXPECT_MAT_NEAR(dst_cpu, Mat(gdst), type == CV_32F ? 1e-4f : 1.0f);
EXPECT_MAT_NEAR(dst_cpu, Mat(gdst), depth == CV_32F ? 1e-4f : 1.0f);
}
}
INSTANTIATE_TEST_CASE_P(OCL_ImgProc, PyrDown, Combine(
Values(CV_8U, CV_32F), Values(1, 3, 4)));
Values(CV_8U, CV_16U, CV_16S, CV_32F),
Values(1, 3, 4)));
/////////////////////// PyrUp //////////////////////////
struct PyrUp : PyrBase {};
typedef PyrBase PyrUp;
TEST_P(PyrUp, Accuracy)
{
for(int j = 0; j < LOOP_TIMES; j++)
for (int j = 0; j < LOOP_TIMES; j++)
{
Size size(MWIDTH, MHEIGHT);
Mat src = randomMat(size, CV_MAKETYPE(type, channels));
Mat src = randomMat(size, CV_MAKETYPE(depth, channels));
oclMat gsrc(src);
pyrUp(src, dst_cpu);
pyrUp(gsrc, gdst);
EXPECT_MAT_NEAR(dst_cpu, Mat(gdst), (type == CV_32F ? 1e-4f : 1.0));
EXPECT_MAT_NEAR(dst_cpu, Mat(gdst), (depth == CV_32F ? 1e-4f : 1.0));
}
}
INSTANTIATE_TEST_CASE_P(OCL_ImgProc, PyrUp, testing::Combine(
Values(CV_8U, CV_32F), Values(1, 3, 4)));
INSTANTIATE_TEST_CASE_P(OCL_ImgProc, PyrUp, Combine(
Values(CV_8U, CV_16U, CV_16S, CV_32F),
Values(1, 3, 4)));
#endif // HAVE_OPENCL