fixed ocl::pyrUp for 2-byte types

This commit is contained in:
Ilya Lavrenov 2013-09-15 19:56:05 +04:00
parent f9c6123439
commit a5c9d83617
2 changed files with 34 additions and 752 deletions

View File

@ -46,330 +46,25 @@
// //
//M*/ //M*/
uchar get_valid_uchar(float data)
{
return (uchar)(data <= 255 ? data : data > 0 ? 255 : 0);
}
/////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////
////////////////////////// CV_8UC1 ////////////////////////////////// //////////////////////// Generic PyrUp //////////////////////////////
/////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////
__kernel void pyrUp_C1_D0(__global uchar* src, __global uchar* dst, __kernel void pyrUp(__global Type* src, __global Type* dst,
int srcRows, int dstRows, int srcCols, int dstCols, int srcRows, int dstRows, int srcCols, int dstCols,
int srcOffset, int dstOffset, int srcStep, int dstStep) int srcOffset, int dstOffset, int srcStep, int dstStep)
{ {
const int x = get_global_id(0); const int x = get_global_id(0);
const int y = get_global_id(1); const int y = get_global_id(1);
__local float s_srcPatch[10][10];
__local float s_dstPatch[20][16];
const int tidx = get_local_id(0);
const int tidy = get_local_id(1);
const int lsizex = get_local_size(0); const int lsizex = get_local_size(0);
const int lsizey = get_local_size(1); const int lsizey = get_local_size(1);
if( tidx < 10 && tidy < 10 )
{
int srcx = mad24((int)get_group_id(0), (lsizex>>1), tidx) - 1;
int srcy = mad24((int)get_group_id(1), (lsizey>>1), tidy) - 1;
srcx = abs(srcx);
srcx = min(srcCols - 1,srcx);
srcy = abs(srcy);
srcy = min(srcRows -1 ,srcy);
s_srcPatch[tidy][tidx] = (float)(src[srcx + srcy * srcStep]);
}
barrier(CLK_LOCAL_MEM_FENCE);
float sum = 0;
const int evenFlag = (int)((tidx & 1) == 0);
const int oddFlag = (int)((tidx & 1) != 0);
const bool eveny = ((tidy & 1) == 0);
if(eveny)
{
sum = (evenFlag * 0.0625f) * s_srcPatch[1 + (tidy >> 1)][1 + ((tidx - 2) >> 1)];
sum = sum + ( oddFlag * 0.25f ) * s_srcPatch[1 + (tidy >> 1)][1 + ((tidx - 1) >> 1)];
sum = sum + (evenFlag * 0.375f ) * s_srcPatch[1 + (tidy >> 1)][1 + ((tidx ) >> 1)];
sum = sum + ( oddFlag * 0.25f ) * s_srcPatch[1 + (tidy >> 1)][1 + ((tidx + 1) >> 1)];
sum = sum + (evenFlag * 0.0625f) * s_srcPatch[1 + (tidy >> 1)][1 + ((tidx + 2) >> 1)];
}
s_dstPatch[2 + get_local_id(1)][get_local_id(0)] = sum;
if (get_local_id(1) < 2)
{
sum = 0;
if (eveny)
{
sum = (evenFlag * 0.0625f) * s_srcPatch[lsizey - 16][1 + ((tidx - 2) >> 1)];
sum = sum + ( oddFlag * 0.25f ) * s_srcPatch[lsizey - 16][1 + ((tidx - 1) >> 1)];
sum = sum + (evenFlag * 0.375f ) * s_srcPatch[lsizey - 16][1 + ((tidx ) >> 1)];
sum = sum + ( oddFlag * 0.25f ) * s_srcPatch[lsizey - 16][1 + ((tidx + 1) >> 1)];
sum = sum + (evenFlag * 0.0625f) * s_srcPatch[lsizey - 16][1 + ((tidx + 2) >> 1)];
}
s_dstPatch[get_local_id(1)][get_local_id(0)] = sum;
}
if (get_local_id(1) > 13)
{
sum = 0;
if (eveny)
{
sum = (evenFlag * 0.0625f) * s_srcPatch[lsizey - 7][1 + ((tidx - 2) >> 1)];
sum = sum + ( oddFlag * 0.25f ) * s_srcPatch[lsizey - 7][1 + ((tidx - 1) >> 1)];
sum = sum + (evenFlag * 0.375f ) * s_srcPatch[lsizey - 7][1 + ((tidx ) >> 1)];
sum = sum + ( oddFlag * 0.25f ) * s_srcPatch[lsizey - 7][1 + ((tidx + 1) >> 1)];
sum = sum + (evenFlag * 0.0625f) * s_srcPatch[9][1 + ((tidx + 2) >> 1)];
}
s_dstPatch[4 + tidy][tidx] = sum;
}
barrier(CLK_LOCAL_MEM_FENCE);
sum = 0;
sum = 0.0625f * s_dstPatch[2 + tidy - 2][tidx];
sum = sum + 0.25f * s_dstPatch[2 + tidy - 1][tidx];
sum = sum + 0.375f * s_dstPatch[2 + tidy ][tidx];
sum = sum + 0.25f * s_dstPatch[2 + tidy + 1][tidx];
sum = sum + 0.0625f * s_dstPatch[2 + tidy + 2][tidx];
if ((x < dstCols) && (y < dstRows))
dst[x + y * dstStep] = convert_uchar_sat_rte(4.0f * sum);
}
///////////////////////////////////////////////////////////////////////
////////////////////////// CV_16UC1 /////////////////////////////////
///////////////////////////////////////////////////////////////////////
__kernel void pyrUp_C1_D2(__global ushort* src, __global ushort* dst,
int srcRows, int dstRows, int srcCols, int dstCols,
int srcOffset, int dstOffset, int srcStep, int dstStep)
{
const int x = get_global_id(0);
const int y = get_global_id(1);
__local float s_srcPatch[10][10];
__local float s_dstPatch[20][16];
srcStep = srcStep >> 1;
dstStep = dstStep >> 1;
srcOffset = srcOffset >> 1;
dstOffset = dstOffset >> 1;
if( get_local_id(0) < 10 && get_local_id(1) < 10 )
{
int srcx = (int)(get_group_id(0) * get_local_size(0) / 2 + get_local_id(0)) - 1;
int srcy = (int)(get_group_id(1) * get_local_size(1) / 2 + get_local_id(1)) - 1;
srcx = abs(srcx);
srcx = min(srcCols - 1,srcx);
srcy = abs(srcy);
srcy = min(srcRows -1 ,srcy);
s_srcPatch[get_local_id(1)][get_local_id(0)] = (float)(src[srcx + srcy * srcStep]);
}
barrier(CLK_LOCAL_MEM_FENCE);
float sum = 0;
const int evenFlag = (int)((get_local_id(0) & 1) == 0);
const int oddFlag = (int)((get_local_id(0) & 1) != 0);
const bool eveny = ((get_local_id(1) & 1) == 0);
const int tidx = get_local_id(0);
if(eveny)
{
sum = sum + (evenFlag * 0.0625f) * s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx - 2) >> 1)];
sum = sum + ( oddFlag * 0.25f ) * s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx - 1) >> 1)];
sum = sum + (evenFlag * 0.375f ) * s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx ) >> 1)];
sum = sum + ( oddFlag * 0.25f ) * s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx + 1) >> 1)];
sum = sum + (evenFlag * 0.0625f) * s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx + 2) >> 1)];
}
s_dstPatch[2 + get_local_id(1)][get_local_id(0)] = sum;
if (get_local_id(1) < 2)
{
sum = 0;
if (eveny)
{
sum = sum + (evenFlag * 0.0625f) * s_srcPatch[0][1 + ((tidx - 2) >> 1)];
sum = sum + ( oddFlag * 0.25f ) * s_srcPatch[0][1 + ((tidx - 1) >> 1)];
sum = sum + (evenFlag * 0.375f ) * s_srcPatch[0][1 + ((tidx ) >> 1)];
sum = sum + ( oddFlag * 0.25f ) * s_srcPatch[0][1 + ((tidx + 1) >> 1)];
sum = sum + (evenFlag * 0.0625f) * s_srcPatch[0][1 + ((tidx + 2) >> 1)];
}
s_dstPatch[get_local_id(1)][get_local_id(0)] = sum;
}
if (get_local_id(1) > 13)
{
sum = 0;
if (eveny)
{
sum = sum + (evenFlag * 0.0625f) * s_srcPatch[9][1 + ((tidx - 2) >> 1)];
sum = sum + ( oddFlag * 0.25f ) * s_srcPatch[9][1 + ((tidx - 1) >> 1)];
sum = sum + (evenFlag * 0.375f ) * s_srcPatch[9][1 + ((tidx ) >> 1)];
sum = sum + ( oddFlag * 0.25f ) * s_srcPatch[9][1 + ((tidx + 1) >> 1)];
sum = sum + (evenFlag * 0.0625f) * s_srcPatch[9][1 + ((tidx + 2) >> 1)];
}
s_dstPatch[4 + get_local_id(1)][get_local_id(0)] = sum;
}
barrier(CLK_LOCAL_MEM_FENCE);
sum = 0;
const int tidy = get_local_id(1);
sum = sum + 0.0625f * s_dstPatch[2 + tidy - 2][get_local_id(0)];
sum = sum + 0.25f * s_dstPatch[2 + tidy - 1][get_local_id(0)];
sum = sum + 0.375f * s_dstPatch[2 + tidy ][get_local_id(0)];
sum = sum + 0.25f * s_dstPatch[2 + tidy + 1][get_local_id(0)];
sum = sum + 0.0625f * s_dstPatch[2 + tidy + 2][get_local_id(0)];
if ((x < dstCols) && (y < dstRows))
dst[x + y * dstStep] = convert_ushort_sat_rte(4.0f * sum);
}
///////////////////////////////////////////////////////////////////////
////////////////////////// CV_16SC1 /////////////////////////////////
///////////////////////////////////////////////////////////////////////
__kernel void pyrUp_C1_D3(__global short* src, __global short* dst,
int srcRows, int dstRows, int srcCols, int dstCols,
int srcOffset, int dstOffset, int srcStep, int dstStep)
{
const int x = get_global_id(0);
const int y = get_global_id(1);
__local float s_srcPatch[10][10];
__local float s_dstPatch[20][16];
srcStep = srcStep >> 1;
dstStep = dstStep >> 1;
srcOffset = srcOffset >> 1;
dstOffset = dstOffset >> 1;
if( get_local_id(0) < 10 && get_local_id(1) < 10 )
{
int srcx = (int)(get_group_id(0) * get_local_size(0) / 2 + get_local_id(0)) - 1;
int srcy = (int)(get_group_id(1) * get_local_size(1) / 2 + get_local_id(1)) - 1;
srcx = abs(srcx);
srcx = min(srcCols - 1,srcx);
srcy = abs(srcy);
srcy = min(srcRows -1 ,srcy);
s_srcPatch[get_local_id(1)][get_local_id(0)] = (float)(src[srcx + srcy * srcStep]);
}
barrier(CLK_LOCAL_MEM_FENCE);
float sum = 0;
const int evenFlag = (int)((get_local_id(0) & 1) == 0);
const int oddFlag = (int)((get_local_id(0) & 1) != 0);
const bool eveny = ((get_local_id(1) & 1) == 0);
const int tidx = get_local_id(0);
if (eveny)
{
sum = sum + (evenFlag * 0.0625f) * s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx - 2) >> 1)];
sum = sum + ( oddFlag * 0.25f ) * s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx - 1) >> 1)];
sum = sum + (evenFlag * 0.375f ) * s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx ) >> 1)];
sum = sum + ( oddFlag * 0.25f ) * s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx + 1) >> 1)];
sum = sum + (evenFlag * 0.0625f) * s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx + 2) >> 1)];
}
s_dstPatch[2 + get_local_id(1)][get_local_id(0)] = sum;
if (get_local_id(1) < 2)
{
sum = 0;
if (eveny)
{
sum = sum + (evenFlag * 0.0625f) * s_srcPatch[0][1 + ((tidx - 2) >> 1)];
sum = sum + ( oddFlag * 0.25f ) * s_srcPatch[0][1 + ((tidx - 1) >> 1)];
sum = sum + (evenFlag * 0.375f ) * s_srcPatch[0][1 + ((tidx ) >> 1)];
sum = sum + ( oddFlag * 0.25f ) * s_srcPatch[0][1 + ((tidx + 1) >> 1)];
sum = sum + (evenFlag * 0.0625f) * s_srcPatch[0][1 + ((tidx + 2) >> 1)];
}
s_dstPatch[get_local_id(1)][get_local_id(0)] = sum;
}
if (get_local_id(1) > 13)
{
sum = 0;
if (eveny)
{
sum = sum + (evenFlag * 0.0625f) * s_srcPatch[9][1 + ((tidx - 2) >> 1)];
sum = sum + ( oddFlag * 0.25f ) * s_srcPatch[9][1 + ((tidx - 1) >> 1)];
sum = sum + (evenFlag * 0.375f ) * s_srcPatch[9][1 + ((tidx ) >> 1)];
sum = sum + ( oddFlag * 0.25f ) * s_srcPatch[9][1 + ((tidx + 1) >> 1)];
sum = sum + (evenFlag * 0.0625f) * s_srcPatch[9][1 + ((tidx + 2) >> 1)];
}
s_dstPatch[4 + get_local_id(1)][get_local_id(0)] = sum;
}
barrier(CLK_LOCAL_MEM_FENCE);
sum = 0;
const int tidy = get_local_id(1);
sum = sum + 0.0625f * s_dstPatch[2 + tidy - 2][get_local_id(0)];
sum = sum + 0.25f * s_dstPatch[2 + tidy - 1][get_local_id(0)];
sum = sum + 0.375f * s_dstPatch[2 + tidy ][get_local_id(0)];
sum = sum + 0.25f * s_dstPatch[2 + tidy + 1][get_local_id(0)];
sum = sum + 0.0625f * s_dstPatch[2 + tidy + 2][get_local_id(0)];
if ((x < dstCols) && (y < dstRows))
dst[x + y * dstStep] = convert_short_sat_rte(4.0f * sum);
}
///////////////////////////////////////////////////////////////////////
////////////////////////// CV_32FC1 /////////////////////////////////
///////////////////////////////////////////////////////////////////////
__kernel void pyrUp_C1_D5(__global float* src, __global float* dst,
int srcRows, int dstRows, int srcCols, int dstCols,
int srcOffset, int dstOffset, int srcStep, int dstStep)
{
const int x = get_global_id(0);
const int y = get_global_id(1);
const int tidx = get_local_id(0); const int tidx = get_local_id(0);
const int tidy = get_local_id(1); const int tidy = get_local_id(1);
const int lsizex = get_local_size(0);
const int lsizey = get_local_size(1);
__local float s_srcPatch[10][10];
__local float s_dstPatch[20][16];
srcOffset = srcOffset >> 2;
dstOffset = dstOffset >> 2;
srcStep = srcStep >> 2;
dstStep = dstStep >> 2;
__local floatType s_srcPatch[10][10];
__local floatType s_dstPatch[20][16];
if( tidx < 10 && tidy < 10 ) if( tidx < 10 && tidy < 10 )
{ {
@ -382,130 +77,27 @@ __kernel void pyrUp_C1_D5(__global float* src, __global float* dst,
srcy = abs(srcy); srcy = abs(srcy);
srcy = min(srcRows -1 ,srcy); srcy = min(srcRows -1 ,srcy);
s_srcPatch[tidy][tidx] = (float)(src[srcx + srcy * srcStep]); s_srcPatch[tidy][tidx] = convertToFloat(src[srcx + srcy * srcStep]);
} }
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
float sum = 0; floatType sum = (floatType)0;
const int evenFlag = (int)((tidx & 1) == 0); const floatType evenFlag = (floatType)((tidx & 1) == 0);
const int oddFlag = (int)((tidx & 1) != 0); const floatType oddFlag = (floatType)((tidx & 1) != 0);
const bool eveny = ((tidy & 1) == 0); const bool eveny = ((tidy & 1) == 0);
const floatType co1 = (floatType)0.375f;
const floatType co2 = (floatType)0.25f;
const floatType co3 = (floatType)0.0625f;
if(eveny) if(eveny)
{ {
sum = sum + (evenFlag * 0.0625f) * s_srcPatch[1 + (tidy >> 1)][1 + ((tidx - 2) >> 1)]; sum = ( evenFlag* co3 ) * s_srcPatch[1 + (tidy >> 1)][1 + ((tidx - 2) >> 1)];
sum = sum + ( oddFlag * 0.25f ) * s_srcPatch[1 + (tidy >> 1)][1 + ((tidx - 1) >> 1)];
sum = sum + (evenFlag * 0.375f ) * s_srcPatch[1 + (tidy >> 1)][1 + ((tidx ) >> 1)];
sum = sum + ( oddFlag * 0.25f ) * s_srcPatch[1 + (tidy >> 1)][1 + ((tidx + 1) >> 1)];
sum = sum + (evenFlag * 0.0625f) * s_srcPatch[1 + (tidy >> 1)][1 + ((tidx + 2) >> 1)];
}
s_dstPatch[2 + tidy][tidx] = sum;
if (tidy < 2)
{
sum = 0;
if (eveny)
{
sum = sum + (evenFlag * 0.0625f) * s_srcPatch[lsizey - 16][1 + ((tidx - 2) >> 1)];
sum = sum + ( oddFlag * 0.25f ) * s_srcPatch[lsizey - 16][1 + ((tidx - 1) >> 1)];
sum = sum + (evenFlag * 0.375f ) * s_srcPatch[lsizey - 16][1 + ((tidx ) >> 1)];
sum = sum + ( oddFlag * 0.25f ) * s_srcPatch[lsizey - 16][1 + ((tidx + 1) >> 1)];
sum = sum + (evenFlag * 0.0625f) * s_srcPatch[lsizey - 16][1 + ((tidx + 2) >> 1)];
}
s_dstPatch[tidy][tidx] = sum;
}
if (tidy > 13)
{
sum = 0;
if (eveny)
{
sum = sum + (evenFlag * 0.0625f) * s_srcPatch[lsizey - 7][1 + ((tidx - 2) >> 1)];
sum = sum + ( oddFlag * 0.25f ) * s_srcPatch[lsizey - 7][1 + ((tidx - 1) >> 1)];
sum = sum + (evenFlag * 0.375f ) * s_srcPatch[lsizey - 7][1 + ((tidx ) >> 1)];
sum = sum + ( oddFlag * 0.25f ) * s_srcPatch[lsizey - 7][1 + ((tidx + 1) >> 1)];
sum = sum + (evenFlag * 0.0625f) * s_srcPatch[lsizey - 7][1 + ((tidx + 2) >> 1)];
}
s_dstPatch[4 + tidy][tidx] = sum;
}
barrier(CLK_LOCAL_MEM_FENCE);
sum = 0.0625f * s_dstPatch[2 + tidy - 2][tidx];
sum = sum + 0.25f * s_dstPatch[2 + tidy - 1][tidx];
sum = sum + 0.375f * s_dstPatch[2 + tidy ][tidx];
sum = sum + 0.25f * s_dstPatch[2 + tidy + 1][tidx];
sum = sum + 0.0625f * s_dstPatch[2 + tidy + 2][tidx];
if ((x < dstCols) && (y < dstRows))
dst[x + y * dstStep] = (float)(4.0f * sum);
}
///////////////////////////////////////////////////////////////////////
////////////////////////// CV_8UC4 //////////////////////////////////
///////////////////////////////////////////////////////////////////////
__kernel void pyrUp_C4_D0(__global uchar4* src, __global uchar4* dst,
int srcRows, int dstRows, int srcCols, int dstCols,
int srcOffset, int dstOffset, int srcStep, int dstStep)
{
const int x = get_global_id(0);
const int y = get_global_id(1);
const int tidx = get_local_id(0);
const int tidy = get_local_id(1);
const int lsizex = get_local_size(0);
const int lsizey = get_local_size(1);
__local float4 s_srcPatch[10][10];
__local float4 s_dstPatch[20][16];
srcOffset >>= 2;
dstOffset >>= 2;
srcStep >>= 2;
dstStep >>= 2;
if( tidx < 10 && tidy < 10 )
{
int srcx = mad24((int)get_group_id(0), lsizex>>1, tidx) - 1;
int srcy = mad24((int)get_group_id(1), lsizey>>1, tidy) - 1;
srcx = abs(srcx);
srcx = min(srcCols - 1,srcx);
srcy = abs(srcy);
srcy = min(srcRows -1 ,srcy);
s_srcPatch[tidy][tidx] = convert_float4(src[srcx + srcy * srcStep]);
}
barrier(CLK_LOCAL_MEM_FENCE);
float4 sum = (float4)(0,0,0,0);
const float4 evenFlag = (float4)((tidx & 1) == 0);
const float4 oddFlag = (float4)((tidx & 1) != 0);
const bool eveny = ((tidy & 1) == 0);
float4 co1 = (float4)(0.375f, 0.375f, 0.375f, 0.375f);
float4 co2 = (float4)(0.25f, 0.25f, 0.25f, 0.25f);
float4 co3 = (float4)(0.0625f, 0.0625f, 0.0625f, 0.0625f);
if(eveny)
{
sum = sum + ( evenFlag * co3) * s_srcPatch[1 + (tidy >> 1)][1 + ((tidx - 2) >> 1)];
sum = sum + ( oddFlag * co2 ) * s_srcPatch[1 + (tidy >> 1)][1 + ((tidx - 1) >> 1)]; sum = sum + ( oddFlag * co2 ) * s_srcPatch[1 + (tidy >> 1)][1 + ((tidx - 1) >> 1)];
sum = sum + ( evenFlag* co1 ) * s_srcPatch[1 + (tidy >> 1)][1 + ((tidx ) >> 1)]; sum = sum + ( evenFlag* co1 ) * s_srcPatch[1 + (tidy >> 1)][1 + ((tidx ) >> 1)];
sum = sum + ( oddFlag * co2 ) * s_srcPatch[1 + (tidy >> 1)][1 + ((tidx + 1) >> 1)]; sum = sum + ( oddFlag * co2 ) * s_srcPatch[1 + (tidy >> 1)][1 + ((tidx + 1) >> 1)];
sum = sum + ( evenFlag* co3 ) * s_srcPatch[1 + (tidy >> 1)][1 + ((tidx + 2) >> 1)]; sum = sum + ( evenFlag* co3 ) * s_srcPatch[1 + (tidy >> 1)][1 + ((tidx + 2) >> 1)];
} }
s_dstPatch[2 + tidy][tidx] = sum; s_dstPatch[2 + tidy][tidx] = sum;
@ -516,7 +108,7 @@ __kernel void pyrUp_C4_D0(__global uchar4* src, __global uchar4* dst,
if (eveny) if (eveny)
{ {
sum = sum + (evenFlag * co3) * s_srcPatch[lsizey-16][1 + ((tidx - 2) >> 1)]; sum = (evenFlag * co3 ) * s_srcPatch[lsizey-16][1 + ((tidx - 2) >> 1)];
sum = sum + ( oddFlag * co2 ) * s_srcPatch[lsizey-16][1 + ((tidx - 1) >> 1)]; sum = sum + ( oddFlag * co2 ) * s_srcPatch[lsizey-16][1 + ((tidx - 1) >> 1)];
sum = sum + (evenFlag * co1 ) * s_srcPatch[lsizey-16][1 + ((tidx ) >> 1)]; sum = sum + (evenFlag * co1 ) * s_srcPatch[lsizey-16][1 + ((tidx ) >> 1)];
sum = sum + ( oddFlag * co2 ) * s_srcPatch[lsizey-16][1 + ((tidx + 1) >> 1)]; sum = sum + ( oddFlag * co2 ) * s_srcPatch[lsizey-16][1 + ((tidx + 1) >> 1)];
@ -532,12 +124,11 @@ __kernel void pyrUp_C4_D0(__global uchar4* src, __global uchar4* dst,
if (eveny) if (eveny)
{ {
sum = sum + (evenFlag * co3) * s_srcPatch[lsizey-7][1 + ((tidx - 2) >> 1)]; sum = (evenFlag * co3) * s_srcPatch[lsizey-7][1 + ((tidx - 2) >> 1)];
sum = sum + ( oddFlag * co2) * s_srcPatch[lsizey-7][1 + ((tidx - 1) >> 1)]; sum = sum + ( oddFlag * co2) * s_srcPatch[lsizey-7][1 + ((tidx - 1) >> 1)];
sum = sum + (evenFlag * co1) * s_srcPatch[lsizey-7][1 + ((tidx ) >> 1)]; sum = sum + (evenFlag * co1) * s_srcPatch[lsizey-7][1 + ((tidx ) >> 1)];
sum = sum + ( oddFlag * co2) * s_srcPatch[lsizey-7][1 + ((tidx + 1) >> 1)]; sum = sum + ( oddFlag * co2) * s_srcPatch[lsizey-7][1 + ((tidx + 1) >> 1)];
sum = sum + (evenFlag * co3) * s_srcPatch[lsizey-7][1 + ((tidx + 2) >> 1)]; sum = sum + (evenFlag * co3) * s_srcPatch[lsizey-7][1 + ((tidx + 2) >> 1)];
} }
s_dstPatch[4 + tidy][tidx] = sum; s_dstPatch[4 + tidy][tidx] = sum;
} }
@ -551,326 +142,5 @@ __kernel void pyrUp_C4_D0(__global uchar4* src, __global uchar4* dst,
sum = sum + co3 * s_dstPatch[2 + tidy + 2][tidx]; sum = sum + co3 * s_dstPatch[2 + tidy + 2][tidx];
if ((x < dstCols) && (y < dstRows)) if ((x < dstCols) && (y < dstRows))
dst[x + y * dstStep] = convert_uchar4_sat_rte(4.0f * sum); dst[x + y * dstStep] = convertToType(4.0f * sum);
}
///////////////////////////////////////////////////////////////////////
////////////////////////// CV_16UC4 //////////////////////////////////
///////////////////////////////////////////////////////////////////////
__kernel void pyrUp_C4_D2(__global ushort4* src, __global ushort4* dst,
int srcRows, int dstRows, int srcCols, int dstCols,
int srcOffset, int dstOffset, int srcStep, int dstStep)
{
const int x = get_global_id(0);
const int y = get_global_id(1);
__local float4 s_srcPatch[10][10];
__local float4 s_dstPatch[20][16];
srcOffset >>= 3;
dstOffset >>= 3;
srcStep >>= 3;
dstStep >>= 3;
if( get_local_id(0) < 10 && get_local_id(1) < 10 )
{
int srcx = (int)(get_group_id(0) * get_local_size(0) / 2 + get_local_id(0)) - 1;
int srcy = (int)(get_group_id(1) * get_local_size(1) / 2 + get_local_id(1)) - 1;
srcx = abs(srcx);
srcx = min(srcCols - 1,srcx);
srcy = abs(srcy);
srcy = min(srcRows -1 ,srcy);
s_srcPatch[get_local_id(1)][get_local_id(0)] = convert_float4(src[srcx + srcy * srcStep]);
}
barrier(CLK_LOCAL_MEM_FENCE);
float4 sum = (float4)(0,0,0,0);
const float4 evenFlag = (float4)((get_local_id(0) & 1) == 0);
const float4 oddFlag = (float4)((get_local_id(0) & 1) != 0);
const bool eveny = ((get_local_id(1) & 1) == 0);
const int tidx = get_local_id(0);
float4 co1 = (float4)(0.375f, 0.375f, 0.375f, 0.375f);
float4 co2 = (float4)(0.25f, 0.25f, 0.25f, 0.25f);
float4 co3 = (float4)(0.0625f, 0.0625f, 0.0625f, 0.0625f);
if(eveny)
{
sum = sum + ( evenFlag* co3 ) * s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx - 2) >> 1)];
sum = sum + ( oddFlag * co2 ) * s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx - 1) >> 1)];
sum = sum + ( evenFlag* co1 ) * s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx ) >> 1)];
sum = sum + ( oddFlag * co2 ) * s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx + 1) >> 1)];
sum = sum + ( evenFlag* co3 ) * s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx + 2) >> 1)];
}
s_dstPatch[2 + get_local_id(1)][get_local_id(0)] = sum;
if (get_local_id(1) < 2)
{
sum = 0;
if (eveny)
{
sum = sum + (evenFlag * co3 ) * s_srcPatch[0][1 + ((tidx - 2) >> 1)];
sum = sum + (oddFlag * co2 ) * s_srcPatch[0][1 + ((tidx - 1) >> 1)];
sum = sum + (evenFlag * co1 ) * s_srcPatch[0][1 + ((tidx ) >> 1)];
sum = sum + (oddFlag * co2 ) * s_srcPatch[0][1 + ((tidx + 1) >> 1)];
sum = sum + (evenFlag * co3 ) * s_srcPatch[0][1 + ((tidx + 2) >> 1)];
}
s_dstPatch[get_local_id(1)][get_local_id(0)] = sum;
}
if (get_local_id(1) > 13)
{
sum = 0;
if (eveny)
{
sum = sum + (evenFlag * co3) * s_srcPatch[9][1 + ((tidx - 2) >> 1)];
sum = sum + ( oddFlag * co2) * s_srcPatch[9][1 + ((tidx - 1) >> 1)];
sum = sum + (evenFlag * co1) * s_srcPatch[9][1 + ((tidx ) >> 1)];
sum = sum + ( oddFlag * co2) * s_srcPatch[9][1 + ((tidx + 1) >> 1)];
sum = sum + (evenFlag * co3) * s_srcPatch[9][1 + ((tidx + 2) >> 1)];
}
s_dstPatch[4 + get_local_id(1)][get_local_id(0)] = sum;
}
barrier(CLK_LOCAL_MEM_FENCE);
sum = 0;
const int tidy = get_local_id(1);
sum = sum + co3 * s_dstPatch[2 + tidy - 2][get_local_id(0)];
sum = sum + co2 * s_dstPatch[2 + tidy - 1][get_local_id(0)];
sum = sum + co1 * s_dstPatch[2 + tidy ][get_local_id(0)];
sum = sum + co2 * s_dstPatch[2 + tidy + 1][get_local_id(0)];
sum = sum + co3 * s_dstPatch[2 + tidy + 2][get_local_id(0)];
if ((x < dstCols) && (y < dstRows))
dst[x + y * dstStep] = convert_ushort4_sat_rte(4.0f * sum);
}
///////////////////////////////////////////////////////////////////////
////////////////////////// CV_16SC4 //////////////////////////////////
///////////////////////////////////////////////////////////////////////
__kernel void pyrUp_C4_D3(__global short4* src, __global short4* dst,
int srcRows, int dstRows, int srcCols, int dstCols,
int srcOffset, int dstOffset, int srcStep, int dstStep)
{
const int x = get_global_id(0);
const int y = get_global_id(1);
__local float4 s_srcPatch[10][10];
__local float4 s_dstPatch[20][16];
srcOffset >>= 3;
dstOffset >>= 3;
srcStep >>= 3;
dstStep >>= 3;
if( get_local_id(0) < 10 && get_local_id(1) < 10 )
{
int srcx = (int)(get_group_id(0) * get_local_size(0) / 2 + get_local_id(0)) - 1;
int srcy = (int)(get_group_id(1) * get_local_size(1) / 2 + get_local_id(1)) - 1;
srcx = abs(srcx);
srcx = min(srcCols - 1,srcx);
srcy = abs(srcy);
srcy = min(srcRows -1 ,srcy);
s_srcPatch[get_local_id(1)][get_local_id(0)] = convert_float4(src[srcx + srcy * srcStep]);
}
barrier(CLK_LOCAL_MEM_FENCE);
float4 sum = (float4)(0,0,0,0);
const float4 evenFlag = (float4)((get_local_id(0) & 1) == 0);
const float4 oddFlag = (float4)((get_local_id(0) & 1) != 0);
const bool eveny = ((get_local_id(1) & 1) == 0);
const int tidx = get_local_id(0);
float4 co1 = (float4)(0.375f, 0.375f, 0.375f, 0.375f);
float4 co2 = (float4)(0.25f, 0.25f, 0.25f, 0.25f);
float4 co3 = (float4)(0.0625f, 0.0625f, 0.0625f, 0.0625f);
if(eveny)
{
sum = sum + ( evenFlag* co3 ) * s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx - 2) >> 1)];
sum = sum + ( oddFlag * co2 ) * s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx - 1) >> 1)];
sum = sum + ( evenFlag* co1 ) * s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx ) >> 1)];
sum = sum + ( oddFlag * co2 ) * s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx + 1) >> 1)];
sum = sum + ( evenFlag* co3 ) * s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx + 2) >> 1)];
}
s_dstPatch[2 + get_local_id(1)][get_local_id(0)] = sum;
if (get_local_id(1) < 2)
{
sum = 0;
if (eveny)
{
sum = sum + (evenFlag * co3 ) * s_srcPatch[0][1 + ((tidx - 2) >> 1)];
sum = sum + (oddFlag * co2 ) * s_srcPatch[0][1 + ((tidx - 1) >> 1)];
sum = sum + (evenFlag * co1 ) * s_srcPatch[0][1 + ((tidx ) >> 1)];
sum = sum + (oddFlag * co2 ) * s_srcPatch[0][1 + ((tidx + 1) >> 1)];
sum = sum + (evenFlag * co3 ) * s_srcPatch[0][1 + ((tidx + 2) >> 1)];
}
s_dstPatch[get_local_id(1)][get_local_id(0)] = sum;
}
if (get_local_id(1) > 13)
{
sum = 0;
if (eveny)
{
sum = sum + (evenFlag * co3) * s_srcPatch[9][1 + ((tidx - 2) >> 1)];
sum = sum + ( oddFlag * co2) * s_srcPatch[9][1 + ((tidx - 1) >> 1)];
sum = sum + (evenFlag * co1) * s_srcPatch[9][1 + ((tidx ) >> 1)];
sum = sum + ( oddFlag * co2) * s_srcPatch[9][1 + ((tidx + 1) >> 1)];
sum = sum + (evenFlag * co3) * s_srcPatch[9][1 + ((tidx + 2) >> 1)];
}
s_dstPatch[4 + get_local_id(1)][get_local_id(0)] = sum;
}
barrier(CLK_LOCAL_MEM_FENCE);
sum = 0;
const int tidy = get_local_id(1);
sum = sum + co3 * s_dstPatch[2 + tidy - 2][get_local_id(0)];
sum = sum + co2 * s_dstPatch[2 + tidy - 1][get_local_id(0)];
sum = sum + co1 * s_dstPatch[2 + tidy ][get_local_id(0)];
sum = sum + co2 * s_dstPatch[2 + tidy + 1][get_local_id(0)];
sum = sum + co3 * s_dstPatch[2 + tidy + 2][get_local_id(0)];
if ((x < dstCols) && (y < dstRows))
dst[x + y * dstStep] = convert_short4_sat_rte(4.0f * sum);
}
///////////////////////////////////////////////////////////////////////
////////////////////////// CV_32FC4 //////////////////////////////////
///////////////////////////////////////////////////////////////////////
__kernel void pyrUp_C4_D5(__global float4* src, __global float4* dst,
int srcRows, int dstRows, int srcCols, int dstCols,
int srcOffset, int dstOffset, int srcStep, int dstStep)
{
const int x = get_global_id(0);
const int y = get_global_id(1);
const int tidx = get_local_id(0);
const int tidy = get_local_id(1);
const int lsizex = get_local_size(0);
const int lsizey = get_local_size(1);
__local float4 s_srcPatch[10][10];
__local float4 s_dstPatch[20][16];
srcOffset >>= 4;
dstOffset >>= 4;
srcStep >>= 4;
dstStep >>= 4;
if( tidx < 10 && tidy < 10 )
{
int srcx = (int)(get_group_id(0) * get_local_size(0) / 2 + tidx) - 1;
int srcy = (int)(get_group_id(1) * get_local_size(1) / 2 + tidy) - 1;
srcx = abs(srcx);
srcx = min(srcCols - 1,srcx);
srcy = abs(srcy);
srcy = min(srcRows -1 ,srcy);
s_srcPatch[tidy][tidx] = (float4)(src[srcx + srcy * srcStep]);
}
barrier(CLK_LOCAL_MEM_FENCE);
float4 sum = (float4)(0,0,0,0);
const float4 evenFlag = (float4)((tidx & 1) == 0);
const float4 oddFlag = (float4)((tidx & 1) != 0);
const bool eveny = ((tidy & 1) == 0);
float4 co1 = (float4)(0.375f, 0.375f, 0.375f, 0.375f);
float4 co2 = (float4)(0.25f, 0.25f, 0.25f, 0.25f);
float4 co3 = (float4)(0.0625f, 0.0625f, 0.0625f, 0.0625f);
if(eveny)
{
sum = sum + ( evenFlag* co3 ) * s_srcPatch[1 + (tidy >> 1)][1 + ((tidx - 2) >> 1)];
sum = sum + ( oddFlag * co2 ) * s_srcPatch[1 + (tidy >> 1)][1 + ((tidx - 1) >> 1)];
sum = sum + ( evenFlag* co1 ) * s_srcPatch[1 + (tidy >> 1)][1 + ((tidx ) >> 1)];
sum = sum + ( oddFlag * co2 ) * s_srcPatch[1 + (tidy >> 1)][1 + ((tidx + 1) >> 1)];
sum = sum + ( evenFlag* co3 ) * s_srcPatch[1 + (tidy >> 1)][1 + ((tidx + 2) >> 1)];
}
s_dstPatch[2 + tidy][tidx] = sum;
if (tidy < 2)
{
sum = 0;
if (eveny)
{
sum = sum + (evenFlag * co3 ) * s_srcPatch[lsizey-16][1 + ((tidx - 2) >> 1)];
sum = sum + (oddFlag * co2 ) * s_srcPatch[lsizey-16][1 + ((tidx - 1) >> 1)];
sum = sum + (evenFlag * co1 ) * s_srcPatch[lsizey-16][1 + ((tidx ) >> 1)];
sum = sum + ( oddFlag * co2 ) * s_srcPatch[lsizey-16][1 + ((tidx + 1) >> 1)];
sum = sum + (evenFlag * co3 ) * s_srcPatch[lsizey-16][1 + ((tidx + 2) >> 1)];
}
s_dstPatch[tidy][tidx] = sum;
}
if (tidy > 13)
{
sum = 0;
if (eveny)
{
sum = sum + (evenFlag * co3) * s_srcPatch[lsizey-7][1 + ((tidx - 2) >> 1)];
sum = sum + ( oddFlag * co2) * s_srcPatch[lsizey-7][1 + ((tidx - 1) >> 1)];
sum = sum + (evenFlag * co1) * s_srcPatch[lsizey-7][1 + ((tidx ) >> 1)];
sum = sum + ( oddFlag * co2) * s_srcPatch[lsizey-7][1 + ((tidx + 1) >> 1)];
sum = sum + (evenFlag * co3) * s_srcPatch[lsizey-7][1 + ((tidx + 2) >> 1)];
}
s_dstPatch[4 + tidy][tidx] = sum;
}
barrier(CLK_LOCAL_MEM_FENCE);
sum = co3 * s_dstPatch[2 + tidy - 2][tidx];
sum = sum + co2 * s_dstPatch[2 + tidy - 1][tidx];
sum = sum + co1 * s_dstPatch[2 + tidy ][tidx];
sum = sum + co2 * s_dstPatch[2 + tidy + 1][tidx];
sum = sum + co3 * s_dstPatch[2 + tidy + 2][tidx];
if ((x < dstCols) && (y < dstRows))
dst[x + y * dstStep] = 4.0f * sum;
} }

View File

@ -59,9 +59,10 @@ namespace cv
namespace ocl namespace ocl
{ {
extern const char *pyr_up; extern const char *pyr_up;
void pyrUp(const cv::ocl::oclMat &src, cv::ocl::oclMat &dst) void pyrUp(const cv::ocl::oclMat &src, cv::ocl::oclMat &dst)
{ {
int depth = src.depth(), channels = src.channels(); int depth = src.depth(), channels = src.channels(), oclChannels = src.oclchannels();
CV_Assert(depth == CV_8U || depth == CV_16U || depth == CV_16S || depth == CV_32F); CV_Assert(depth == CV_8U || depth == CV_16U || depth == CV_16S || depth == CV_32F);
CV_Assert(channels == 1 || channels == 3 || channels == 4); CV_Assert(channels == 1 || channels == 3 || channels == 4);
@ -70,7 +71,17 @@ namespace cv
Context *clCxt = src.clCxt; Context *clCxt = src.clCxt;
const char * const typeMap[] = { "uchar", "char", "ushort", "short", "int", "float" };
char buildOptions[250], convertString[50];
const char * const channelsString = oclChannels == 1 ? "" : "4";
sprintf(convertString, "convert_%s%s_sat_rte", typeMap[depth], channelsString);
sprintf(buildOptions, "-D Type=%s%s -D floatType=float%s -D convertToType=%s -D convertToFloat=%s",
typeMap[depth], channelsString, channelsString,
depth == CV_32F ? "" : convertString,
oclChannels == 4 ? "convert_float4" : "(float)");
const std::string kernelName = "pyrUp"; const std::string kernelName = "pyrUp";
int dststep = dst.step / dst.elemSize(), srcstep = src.step / src.elemSize();
std::vector< pair<size_t, const void *> > args; std::vector< pair<size_t, const void *> > args;
args.push_back( make_pair( sizeof(cl_mem), (void *)&src.data)); args.push_back( make_pair( sizeof(cl_mem), (void *)&src.data));
@ -81,14 +92,15 @@ namespace cv
args.push_back( make_pair( sizeof(cl_int), (void *)&dst.cols)); args.push_back( make_pair( sizeof(cl_int), (void *)&dst.cols));
args.push_back( make_pair( sizeof(cl_int), (void *)&src.offset)); args.push_back( make_pair( sizeof(cl_int), (void *)&src.offset));
args.push_back( make_pair( sizeof(cl_int), (void *)&dst.offset)); args.push_back( make_pair( sizeof(cl_int), (void *)&dst.offset));
args.push_back( make_pair( sizeof(cl_int), (void *)&src.step)); args.push_back( make_pair( sizeof(cl_int), (void *)&srcstep));
args.push_back( make_pair( sizeof(cl_int), (void *)&dst.step)); args.push_back( make_pair( sizeof(cl_int), (void *)&dststep));
size_t globalThreads[3] = {dst.cols, dst.rows, 1}; size_t globalThreads[3] = {dst.cols, dst.rows, 1};
size_t localThreads[3] = {16, 16, 1}; size_t localThreads[3] = {16, 16, 1};
openCLExecuteKernel(clCxt, &pyr_up, kernelName, globalThreads, localThreads, args, src.oclchannels(), src.depth()); openCLExecuteKernel(clCxt, &pyr_up, kernelName, globalThreads, localThreads, args, -1, -1,
buildOptions);
} }
} }
} }