Merge branch '2.4'

This commit is contained in:
Andrey Kamaev
2013-02-28 11:10:40 +04:00
39 changed files with 6898 additions and 386 deletions

View File

@@ -53,6 +53,10 @@ void cv::ocl::dft(const oclMat&, oclMat&, Size, int)
{
CV_Error(CV_StsNotImplemented, "OpenCL DFT is not implemented");
}
namespace cv { namespace ocl {
void fft_teardown();
}}
void cv::ocl::fft_teardown(){}
#else
#include "clAmdFft.h"
namespace cv

View File

@@ -55,13 +55,16 @@ using namespace cv::ocl;
//#define PRINT_KERNEL_RUN_TIME
#define RUN_TIMES 100
#ifndef CL_MEM_USE_PERSISTENT_MEM_AMD
#define CL_MEM_USE_PERSISTENT_MEM_AMD 0
#endif
//#define AMD_DOUBLE_DIFFER
namespace cv
{
namespace ocl
{
extern void fft_teardown();
/*
* The binary caching system to eliminate redundant program source compilation.
* Strictly, this is not a cache because we do not implement evictions right now.
@@ -69,6 +72,15 @@ namespace cv
*/
std::auto_ptr<ProgramCache> ProgramCache::programCache;
ProgramCache *programCache = NULL;
DevMemType gDeviceMemType = DEVICE_MEM_DEFAULT;
DevMemRW gDeviceMemRW = DEVICE_MEM_R_W;
int gDevMemTypeValueMap[5] = {0,
CL_MEM_ALLOC_HOST_PTR,
CL_MEM_USE_HOST_PTR,
CL_MEM_COPY_HOST_PTR,
CL_MEM_USE_PERSISTENT_MEM_AMD};
int gDevMemRWValueMap[3] = {CL_MEM_READ_WRITE, CL_MEM_READ_ONLY, CL_MEM_WRITE_ONLY};
ProgramCache::ProgramCache()
{
codeCache.clear();
@@ -110,30 +122,25 @@ namespace cv
}
////////////////////////Common OpenCL specific calls///////////////
//Info::Info()
//{
// oclplatform = 0;
// oclcontext = 0;
// devnum = 0;
//}
//Info::~Info()
//{
// release();
//}
//void Info::release()
//{
// if(oclplatform)
// {
// oclplatform = 0;
// }
// if(oclcontext)
// {
// openCLSafeCall(clReleaseContext(oclcontext));
// }
// devices.empty();
// devName.empty();
//}
struct Info::Impl
int getDevMemType(DevMemRW& rw_type, DevMemType& mem_type)
{
rw_type = gDeviceMemRW;
mem_type = gDeviceMemType;
return Context::getContext()->impl->unified_memory;
}
int setDevMemType(DevMemRW rw_type, DevMemType mem_type)
{
if( (mem_type == DEVICE_MEM_PM && Context::getContext()->impl->unified_memory == 0) ||
mem_type == DEVICE_MEM_UHP ||
mem_type == DEVICE_MEM_CHP )
return -1;
gDeviceMemRW = rw_type;
gDeviceMemType = mem_type;
return 0;
}
struct Info::Impl
{
cl_platform_id oclplatform;
std::vector<cl_device_id> devices;
@@ -287,11 +294,8 @@ namespace cv
}
void *getoclContext()
{
return &(Context::getContext()->impl->clContext);
}
void *getoclCommandQueue()
@@ -316,10 +320,16 @@ namespace cv
void openCLMallocPitch(Context *clCxt, void **dev_ptr, size_t *pitch,
size_t widthInBytes, size_t height)
{
openCLMallocPitchEx(clCxt, dev_ptr, pitch, widthInBytes, height, gDeviceMemRW, gDeviceMemType);
}
void openCLMallocPitchEx(Context *clCxt, void **dev_ptr, size_t *pitch,
size_t widthInBytes, size_t height, DevMemRW rw_type, DevMemType mem_type)
{
cl_int status;
*dev_ptr = clCreateBuffer(clCxt->impl->clContext, CL_MEM_READ_WRITE,
*dev_ptr = clCreateBuffer(clCxt->impl->clContext, gDevMemRWValueMap[rw_type]|gDevMemTypeValueMap[mem_type],
widthInBytes * height, 0, &status);
openCLVerifyCall(status);
*pitch = widthInBytes;
@@ -834,6 +844,11 @@ namespace cv
clcxt->impl->double_support = oclinfo.impl->double_support;
//extra options to recognize compiler options
memcpy(clcxt->impl->extra_options, oclinfo.impl->extra_options, 512);
cl_bool unfymem = false;
openCLSafeCall(clGetDeviceInfo(clcxt->impl->devices, CL_DEVICE_HOST_UNIFIED_MEMORY,
sizeof(cl_bool), (void *)&unfymem, NULL));
if(unfymem)
clcxt->impl->unified_memory = 1;
}
Context::Context()
{
@@ -850,6 +865,7 @@ namespace cv
impl->double_support = 0;
//extra options to recognize vendor specific fp64 extensions
memset(impl->extra_options, 0, 512);
impl->unified_memory = 0;
programCache = ProgramCache::getProgramCache();
}
@@ -874,6 +890,7 @@ namespace cv
}
void Info::release()
{
fft_teardown();
if(impl->oclplatform)
{
impl->oclplatform = 0;

View File

@@ -45,6 +45,7 @@
#include <iomanip>
#include "precomp.hpp"
#include "mcwutil.hpp"
using namespace cv;
using namespace cv::ocl;
@@ -230,73 +231,10 @@ void interpolate::blendFrames(const oclMat &frame0, const oclMat &/*frame1*/, co
void interpolate::bindImgTex(const oclMat &img, cl_mem &texture)
{
cl_image_format format;
int err;
int depth = img.depth();
int channels = img.channels();
switch(depth)
{
case CV_8U:
format.image_channel_data_type = CL_UNSIGNED_INT8;
break;
case CV_32S:
format.image_channel_data_type = CL_UNSIGNED_INT32;
break;
case CV_32F:
format.image_channel_data_type = CL_FLOAT;
break;
default:
throw std::exception();
break;
}
switch(channels)
{
case 1:
format.image_channel_order = CL_R;
break;
case 3:
format.image_channel_order = CL_RGB;
break;
case 4:
format.image_channel_order = CL_RGBA;
break;
default:
throw std::exception();
break;
}
if(texture)
{
openCLFree(texture);
}
#ifdef CL_VERSION_1_2
cl_image_desc desc;
desc.image_type = CL_MEM_OBJECT_IMAGE2D;
desc.image_width = img.step / img.elemSize();
desc.image_height = img.rows;
desc.image_depth = 0;
desc.image_array_size = 1;
desc.image_row_pitch = 0;
desc.image_slice_pitch = 0;
desc.buffer = NULL;
desc.num_mip_levels = 0;
desc.num_samples = 0;
texture = clCreateImage(Context::getContext()->impl->clContext, CL_MEM_READ_WRITE, &format, &desc, NULL, &err);
#else
texture = clCreateImage2D(
Context::getContext()->impl->clContext,
CL_MEM_READ_WRITE,
&format,
img.step / img.elemSize(),
img.rows,
0,
NULL,
&err);
#endif
size_t origin[] = { 0, 0, 0 };
size_t region[] = { img.step / img.elemSize(), img.rows, 1 };
clEnqueueCopyBufferToImage(img.clCxt->impl->clCmdQueue, (cl_mem)img.data, texture, 0, origin, region, 0, NULL, 0);
openCLSafeCall(err);
texture = bindTexture(img);
}

View File

@@ -203,8 +203,8 @@ __kernel void YUV2RGB(int cols,int rows,int src_step,int dst_step,int channels,
__constant int ITUR_BT_601_CY = 1220542;
__constant int ITUR_BT_601_CUB = 2116026;
__constant int ITUR_BT_601_CUG = -409993;
__constant int ITUR_BT_601_CVG = -852492;
__constant int ITUR_BT_601_CUG = 409993;
__constant int ITUR_BT_601_CVG = 852492;
__constant int ITUR_BT_601_CVR = 1673527;
__constant int ITUR_BT_601_SHIFT = 20;
@@ -229,7 +229,7 @@ __kernel void YUV2RGBA_NV12(int cols,int rows,int src_step,int dst_step,
int V = usrc[1] - 128;
int ruv = (1 << (ITUR_BT_601_SHIFT - 1)) + ITUR_BT_601_CVR * V;
int guv = (1 << (ITUR_BT_601_SHIFT - 1)) + ITUR_BT_601_CVG * V + ITUR_BT_601_CUG * U;
int guv = (1 << (ITUR_BT_601_SHIFT - 1)) - ITUR_BT_601_CVG * V - ITUR_BT_601_CUG * U;
int buv = (1 << (ITUR_BT_601_SHIFT - 1)) + ITUR_BT_601_CUB * U;
Y1 = max(0, Y1 - 16) * ITUR_BT_601_CY;

View File

@@ -0,0 +1,938 @@
#if defined (DOUBLE_SUPPORT)
#pragma OPENCL EXTENSION cl_khr_fp64:enable
#else
typedef float double;
typedef float4 double4;
#define convert_double4 convert_float4
#endif
//#pragma OPENCL EXTENSION cl_amd_printf:enable
//#if defined (DOUBLE_SUPPORT)
__kernel void icvContourMoments(int contour_total,
__global float* reader_oclmat_data,
__global double* dst_a00,
__global double* dst_a10,
__global double* dst_a01,
__global double* dst_a20,
__global double* dst_a11,
__global double* dst_a02,
__global double* dst_a30,
__global double* dst_a21,
__global double* dst_a12,
__global double* dst_a03)
{
double xi_1, yi_1, xi_12, yi_12, xi, yi, xi2, yi2, dxy, xii_1, yii_1;
int idx = get_global_id(0);
xi_1 = *(reader_oclmat_data + (get_global_id(0) << 1));
yi_1 = *(reader_oclmat_data + (get_global_id(0) << 1) + 1);
xi_12 = xi_1 * xi_1;
yi_12 = yi_1 * yi_1;
if(idx == contour_total - 1)
{
xi = *(reader_oclmat_data);
yi = *(reader_oclmat_data + 1);
}
else
{
xi = *(reader_oclmat_data + (idx + 1) * 2);
yi = *(reader_oclmat_data + (idx + 1) * 2 + 1);
}
xi2 = xi * xi;
yi2 = yi * yi;
dxy = xi_1 * yi - xi * yi_1;
xii_1 = xi_1 + xi;
yii_1 = yi_1 + yi;
dst_a00[idx] = dxy;
dst_a10[idx] = dxy * xii_1;
dst_a01[idx] = dxy * yii_1;
dst_a20[idx] = dxy * (xi_1 * xii_1 + xi2);
dst_a11[idx] = dxy * (xi_1 * (yii_1 + yi_1) + xi * (yii_1 + yi));
dst_a02[idx] = dxy * (yi_1 * yii_1 + yi2);
dst_a30[idx] = dxy * xii_1 * (xi_12 + xi2);
dst_a03[idx] = dxy * yii_1 * (yi_12 + yi2);
dst_a21[idx] =
dxy * (xi_12 * (3 * yi_1 + yi) + 2 * xi * xi_1 * yii_1 +
xi2 * (yi_1 + 3 * yi));
dst_a12[idx] =
dxy * (yi_12 * (3 * xi_1 + xi) + 2 * yi * yi_1 * xii_1 +
yi2 * (xi_1 + 3 * xi));
}
//#endif
//#if defined (DOUBLE_SUPPORT)
__kernel void CvMoments_D0(__global uchar16* src_data, int src_rows, int src_cols, int src_step, int tileSize_width, int tileSize_height,
__global double* dst_m00,
__global double* dst_m10,
__global double* dst_m01,
__global double* dst_m20,
__global double* dst_m11,
__global double* dst_m02,
__global double* dst_m30,
__global double* dst_m21,
__global double* dst_m12,
__global double* dst_m03,
int dst_cols, int dst_step, int type, int depth, int cn, int coi, int binary, int TILE_SIZE)
{
uchar tmp_coi[16]; // get the coi data
uchar16 tmp[16];
int VLEN_C = 16; // vector length of uchar
int gidy = get_global_id(0);
int gidx = get_global_id(1);
int wgidy = get_group_id(0);
int wgidx = get_group_id(1);
int lidy = get_local_id(0);
int lidx = get_local_id(1);
int y = wgidy*TILE_SIZE; // vector length of uchar
int x = wgidx*TILE_SIZE; // vector length of uchar
int kcn = (cn==2)?2:4;
int rstep = min(src_step, TILE_SIZE);
tileSize_height = min(TILE_SIZE, src_rows - y);
tileSize_width = min(TILE_SIZE, src_cols - x);
if( tileSize_width < TILE_SIZE )
for(int i = tileSize_width; i < rstep; i++ )
*((__global uchar*)src_data+(y+lidy)*src_step+x+i) = 0;
if( coi > 0 ) //channel of interest
for(int i = 0; i < tileSize_width; i += VLEN_C)
{
for(int j=0; j<VLEN_C; j++)
tmp_coi[j] = *((__global uchar*)src_data+(y+lidy)*src_step+(x+i+j)*kcn+coi-1);
tmp[i/VLEN_C] = (uchar16)(tmp_coi[0],tmp_coi[1],tmp_coi[2],tmp_coi[3],tmp_coi[4],tmp_coi[5],tmp_coi[6],tmp_coi[7],
tmp_coi[8],tmp_coi[9],tmp_coi[10],tmp_coi[11],tmp_coi[12],tmp_coi[13],tmp_coi[14],tmp_coi[15]);
}
else
for(int i=0; i < tileSize_width; i+=VLEN_C)
tmp[i/VLEN_C] = *(src_data+(y+lidy)*src_step/VLEN_C+(x+i)/VLEN_C);
uchar16 zero = (uchar16)(0);
uchar16 full = (uchar16)(255);
if( binary )
for(int i=0; i < tileSize_width; i+=VLEN_C)
tmp[i/VLEN_C] = (tmp[i/VLEN_C]!=zero)?full:zero;
double mom[10];
__local int m[10][128];
if(lidy == 0)
for(int i=0; i<10; i++)
for(int j=0; j<128; j++)
m[i][j]=0;
barrier(CLK_LOCAL_MEM_FENCE);
int lm[10] = {0};
int16 x0 = (int16)(0);
int16 x1 = (int16)(0);
int16 x2 = (int16)(0);
int16 x3 = (int16)(0);
for( int xt = 0 ; xt < tileSize_width; xt+=(VLEN_C) )
{
int16 v_xt = (int16)(xt, xt+1, xt+2, xt+3, xt+4, xt+5, xt+6, xt+7, xt+8, xt+9, xt+10, xt+11, xt+12, xt+13, xt+14, xt+15);
int16 p = convert_int16(tmp[xt/VLEN_C]);
int16 xp = v_xt * p, xxp = xp *v_xt;
x0 += p;
x1 += xp;
x2 += xxp;
x3 += xxp * v_xt;
}
x0.s0 += x0.s1 + x0.s2 + x0.s3 + x0.s4 + x0.s5 + x0.s6 + x0.s7 + x0.s8 + x0.s9 + x0.sa + x0.sb + x0.sc + x0.sd + x0.se + x0.sf;
x1.s0 += x1.s1 + x1.s2 + x1.s3 + x1.s4 + x1.s5 + x1.s6 + x1.s7 + x1.s8 + x1.s9 + x1.sa + x1.sb + x1.sc + x1.sd + x1.se + x1.sf;
x2.s0 += x2.s1 + x2.s2 + x2.s3 + x2.s4 + x2.s5 + x2.s6 + x2.s7 + x2.s8 + x2.s9 + x2.sa + x2.sb + x2.sc + x2.sd + x2.se + x2.sf;
x3.s0 += x3.s1 + x3.s2 + x3.s3 + x3.s4 + x3.s5 + x3.s6 + x3.s7 + x3.s8 + x3.s9 + x3.sa + x3.sb + x3.sc + x3.sd + x3.se + x3.sf;
int py = lidy * ((int)x0.s0);
int sy = lidy*lidy;
int bheight = min(tileSize_height, TILE_SIZE/2);
if(bheight >= TILE_SIZE/2&&lidy > bheight-1&&lidy < tileSize_height)
{
m[9][lidy-bheight] = ((int)py) * sy; // m03
m[8][lidy-bheight] = ((int)x1.s0) * sy; // m12
m[7][lidy-bheight] = ((int)x2.s0) * lidy; // m21
m[6][lidy-bheight] = x3.s0; // m30
m[5][lidy-bheight] = x0.s0 * sy; // m02
m[4][lidy-bheight] = x1.s0 * lidy; // m11
m[3][lidy-bheight] = x2.s0; // m20
m[2][lidy-bheight] = py; // m01
m[1][lidy-bheight] = x1.s0; // m10
m[0][lidy-bheight] = x0.s0; // m00
}
else if(lidy < bheight)
{
lm[9] = ((int)py) * sy; // m03
lm[8] = ((int)x1.s0) * sy; // m12
lm[7] = ((int)x2.s0) * lidy; // m21
lm[6] = x3.s0; // m30
lm[5] = x0.s0 * sy; // m02
lm[4] = x1.s0 * lidy; // m11
lm[3] = x2.s0; // m20
lm[2] = py; // m01
lm[1] = x1.s0; // m10
lm[0] = x0.s0; // m00
}
barrier(CLK_LOCAL_MEM_FENCE);
for( int j = bheight; j >= 1; j = j/2 )
{
if(lidy < j)
for( int i = 0; i < 10; i++ )
lm[i] = lm[i] + m[i][lidy];
barrier(CLK_LOCAL_MEM_FENCE);
if(lidy >= j/2&&lidy < j)
for( int i = 0; i < 10; i++ )
m[i][lidy-j/2] = lm[i];
barrier(CLK_LOCAL_MEM_FENCE);
}
if(lidy == 0&&lidx == 0)
{
for( int mt = 0; mt < 10; mt++ )
mom[mt] = (double)lm[mt];
if(binary)
{
double s = 1./255;
for( int mt = 0; mt < 10; mt++ )
mom[mt] *= s;
}
double xm = x * mom[0], ym = y * mom[0];
// accumulate moments computed in each tile
// + m00 ( = m00' )
dst_m00[wgidy*dst_cols+wgidx] = mom[0];
// + m10 ( = m10' + x*m00' )
dst_m10[wgidy*dst_cols+wgidx] = mom[1] + xm;
// + m01 ( = m01' + y*m00' )
dst_m01[wgidy*dst_cols+wgidx] = mom[2] + ym;
// + m20 ( = m20' + 2*x*m10' + x*x*m00' )
dst_m20[wgidy*dst_cols+wgidx] = mom[3] + x * (mom[1] * 2 + xm);
// + m11 ( = m11' + x*m01' + y*m10' + x*y*m00' )
dst_m11[wgidy*dst_cols+wgidx] = mom[4] + x * (mom[2] + ym) + y * mom[1];
// + m02 ( = m02' + 2*y*m01' + y*y*m00' )
dst_m02[wgidy*dst_cols+wgidx] = mom[5] + y * (mom[2] * 2 + ym);
// + m30 ( = m30' + 3*x*m20' + 3*x*x*m10' + x*x*x*m00' )
dst_m30[wgidy*dst_cols+wgidx] = mom[6] + x * (3. * mom[3] + x * (3. * mom[1] + xm));
// + m21 ( = m21' + x*(2*m11' + 2*y*m10' + x*m01' + x*y*m00') + y*m20')
dst_m21[wgidy*dst_cols+wgidx] = mom[7] + x * (2 * (mom[4] + y * mom[1]) + x * (mom[2] + ym)) + y * mom[3];
// + m12 ( = m12' + y*(2*m11' + 2*x*m01' + y*m10' + x*y*m00') + x*m02')
dst_m12[wgidy*dst_cols+wgidx] = mom[8] + y * (2 * (mom[4] + x * mom[2]) + y * (mom[1] + xm)) + x * mom[5];
// + m03 ( = m03' + 3*y*m02' + 3*y*y*m01' + y*y*y*m00' )
dst_m03[wgidy*dst_cols+wgidx] = mom[9] + y * (3. * mom[5] + y * (3. * mom[2] + ym));
}
}
//#endif
//#if defined (DOUBLE_SUPPORT)
__kernel void dst_sum(int src_rows, int src_cols, int tile_height, int tile_width, int TILE_SIZE, __global double* sum, __global double* dst_m00,
__global double* dst_m10,
__global double* dst_m01,
__global double* dst_m20,
__global double* dst_m11,
__global double* dst_m02,
__global double* dst_m30,
__global double* dst_m21,
__global double* dst_m12,
__global double* dst_m03)
{
int gidy = get_global_id(0);
int gidx = get_global_id(1);
int block_y = src_rows/tile_height;
int block_x = src_cols/tile_width;
int block_num;
if(src_rows > TILE_SIZE && src_rows % TILE_SIZE != 0)
block_y ++;
if(src_cols > TILE_SIZE && src_cols % TILE_SIZE != 0)
block_x ++;
block_num = block_y * block_x;
__local double dst_sum[10][128];
if(gidy<128-block_num)
for(int i=0; i<10; i++)
dst_sum[i][gidy+block_num]=0;
barrier(CLK_LOCAL_MEM_FENCE);
if(gidy<block_num)
{
dst_sum[0][gidy] = dst_m00[gidy];
dst_sum[1][gidy] = dst_m10[gidy];
dst_sum[2][gidy] = dst_m01[gidy];
dst_sum[3][gidy] = dst_m20[gidy];
dst_sum[4][gidy] = dst_m11[gidy];
dst_sum[5][gidy] = dst_m02[gidy];
dst_sum[6][gidy] = dst_m30[gidy];
dst_sum[7][gidy] = dst_m21[gidy];
dst_sum[8][gidy] = dst_m12[gidy];
dst_sum[9][gidy] = dst_m03[gidy];
}
barrier(CLK_LOCAL_MEM_FENCE);
for(int lsize=64; lsize>0; lsize>>=1)
{
if(gidy<lsize)
{
int lsize2 = gidy + lsize;
for(int i=0; i<10; i++)
dst_sum[i][gidy] += dst_sum[i][lsize2];
}
barrier(CLK_LOCAL_MEM_FENCE);
}
if(gidy==0)
for(int i=0; i<10; i++)
sum[i] = dst_sum[i][0];
}
//#endif
//#if defined (DOUBLE_SUPPORT)
__kernel void CvMoments_D2(__global ushort8* src_data, int src_rows, int src_cols, int src_step, int tileSize_width, int tileSize_height,
__global double* dst_m00,
__global double* dst_m10,
__global double* dst_m01,
__global double* dst_m20,
__global double* dst_m11,
__global double* dst_m02,
__global double* dst_m30,
__global double* dst_m21,
__global double* dst_m12,
__global double* dst_m03,
int dst_cols, int dst_step,
int type, int depth, int cn, int coi, int binary, const int TILE_SIZE)
{
ushort tmp_coi[8]; // get the coi data
ushort8 tmp[32];
int VLEN_US = 8; // vector length of ushort
int gidy = get_global_id(0);
int gidx = get_global_id(1);
int wgidy = get_group_id(0);
int wgidx = get_group_id(1);
int lidy = get_local_id(0);
int lidx = get_local_id(1);
int y = wgidy*TILE_SIZE; // real Y index of pixel
int x = wgidx*TILE_SIZE; // real X index of pixel
int kcn = (cn==2)?2:4;
int rstep = min(src_step/2, TILE_SIZE);
tileSize_height = min(TILE_SIZE, src_rows - y);
tileSize_width = min(TILE_SIZE, src_cols -x);
if(src_cols > TILE_SIZE && tileSize_width < TILE_SIZE)
for(int i=tileSize_width; i < rstep; i++ )
*((__global ushort*)src_data+(y+lidy)*src_step/2+x+i) = 0;
if( coi > 0 )
for(int i=0; i < tileSize_width; i+=VLEN_US)
{
for(int j=0; j<VLEN_US; j++)
tmp_coi[j] = *((__global ushort*)src_data+(y+lidy)*(int)src_step/2+(x+i+j)*kcn+coi-1);
tmp[i/VLEN_US] = (ushort8)(tmp_coi[0],tmp_coi[1],tmp_coi[2],tmp_coi[3],tmp_coi[4],tmp_coi[5],tmp_coi[6],tmp_coi[7]);
}
else
for(int i=0; i < tileSize_width; i+=VLEN_US)
tmp[i/VLEN_US] = *(src_data+(y+lidy)*src_step/(2*VLEN_US)+(x+i)/VLEN_US);
ushort8 zero = (ushort8)(0);
ushort8 full = (ushort8)(255);
if( binary )
for(int i=0; i < tileSize_width; i+=VLEN_US)
tmp[i/VLEN_US] = (tmp[i/VLEN_US]!=zero)?full:zero;
double mom[10];
__local long m[10][128];
if(lidy == 0)
for(int i=0; i<10; i++)
for(int j=0; j<128; j++)
m[i][j]=0;
barrier(CLK_LOCAL_MEM_FENCE);
long lm[10] = {0};
int8 x0 = (int8)(0);
int8 x1 = (int8)(0);
int8 x2 = (int8)(0);
long8 x3 = (long8)(0);
for( int xt = 0 ; xt < tileSize_width; xt+=(VLEN_US) )
{
int8 v_xt = (int8)(xt, xt+1, xt+2, xt+3, xt+4, xt+5, xt+6, xt+7);
int8 p = convert_int8(tmp[xt/VLEN_US]);
int8 xp = v_xt * p, xxp = xp * v_xt;
x0 += p;
x1 += xp;
x2 += xxp;
x3 += convert_long8(xxp) *convert_long8(v_xt);
}
x0.s0 += x0.s1 + x0.s2 + x0.s3 + x0.s4 + x0.s5 + x0.s6 + x0.s7;
x1.s0 += x1.s1 + x1.s2 + x1.s3 + x1.s4 + x1.s5 + x1.s6 + x1.s7;
x2.s0 += x2.s1 + x2.s2 + x2.s3 + x2.s4 + x2.s5 + x2.s6 + x2.s7;
x3.s0 += x3.s1 + x3.s2 + x3.s3 + x3.s4 + x3.s5 + x3.s6 + x3.s7;
int py = lidy * x0.s0, sy = lidy*lidy;
int bheight = min(tileSize_height, TILE_SIZE/2);
if(bheight >= TILE_SIZE/2&&lidy > bheight-1&&lidy < tileSize_height)
{
m[9][lidy-bheight] = ((long)py) * sy; // m03
m[8][lidy-bheight] = ((long)x1.s0) * sy; // m12
m[7][lidy-bheight] = ((long)x2.s0) * lidy; // m21
m[6][lidy-bheight] = x3.s0; // m30
m[5][lidy-bheight] = x0.s0 * sy; // m02
m[4][lidy-bheight] = x1.s0 * lidy; // m11
m[3][lidy-bheight] = x2.s0; // m20
m[2][lidy-bheight] = py; // m01
m[1][lidy-bheight] = x1.s0; // m10
m[0][lidy-bheight] = x0.s0; // m00
}
else if(lidy < bheight)
{
lm[9] = ((long)py) * sy; // m03
lm[8] = ((long)x1.s0) * sy; // m12
lm[7] = ((long)x2.s0) * lidy; // m21
lm[6] = x3.s0; // m30
lm[5] = x0.s0 * sy; // m02
lm[4] = x1.s0 * lidy; // m11
lm[3] = x2.s0; // m20
lm[2] = py; // m01
lm[1] = x1.s0; // m10
lm[0] = x0.s0; // m00
}
barrier(CLK_LOCAL_MEM_FENCE);
for( int j = TILE_SIZE/2; j >= 1; j = j/2 )
{
if(lidy < j)
for( int i = 0; i < 10; i++ )
lm[i] = lm[i] + m[i][lidy];
barrier(CLK_LOCAL_MEM_FENCE);
if(lidy >= j/2&&lidy < j)
for( int i = 0; i < 10; i++ )
m[i][lidy-j/2] = lm[i];
barrier(CLK_LOCAL_MEM_FENCE);
}
if(lidy == 0&&lidx == 0)
{
for(int mt = 0; mt < 10; mt++ )
mom[mt] = (double)lm[mt];
if(binary)
{
double s = 1./255;
for( int mt = 0; mt < 10; mt++ )
mom[mt] *= s;
}
double xm = x *mom[0], ym = y * mom[0];
// accumulate moments computed in each tile
// + m00 ( = m00' )
dst_m00[wgidy*dst_cols+wgidx] = mom[0];
// + m10 ( = m10' + x*m00' )
dst_m10[wgidy*dst_cols+wgidx] = mom[1] + xm;
// + m01 ( = m01' + y*m00' )
dst_m01[wgidy*dst_cols+wgidx] = mom[2] + ym;
// + m20 ( = m20' + 2*x*m10' + x*x*m00' )
dst_m20[wgidy*dst_cols+wgidx] = mom[3] + x * (mom[1] * 2 + xm);
// + m11 ( = m11' + x*m01' + y*m10' + x*y*m00' )
dst_m11[wgidy*dst_cols+wgidx] = mom[4] + x * (mom[2] + ym) + y * mom[1];
// + m02 ( = m02' + 2*y*m01' + y*y*m00' )
dst_m02[wgidy*dst_cols+wgidx] = mom[5] + y * (mom[2] * 2 + ym);
// + m30 ( = m30' + 3*x*m20' + 3*x*x*m10' + x*x*x*m00' )
dst_m30[wgidy*dst_cols+wgidx] = mom[6] + x * (3. * mom[3] + x * (3. * mom[1] + xm));
// + m21 ( = m21' + x*(2*m11' + 2*y*m10' + x*m01' + x*y*m00') + y*m20')
dst_m21[wgidy*dst_cols+wgidx] = mom[7] + x * (2 * (mom[4] + y * mom[1]) + x * (mom[2] + ym)) + y * mom[3];
// + m12 ( = m12' + y*(2*m11' + 2*x*m01' + y*m10' + x*y*m00') + x*m02')
dst_m12[wgidy*dst_cols+wgidx] = mom[8] + y * (2 * (mom[4] + x * mom[2]) + y * (mom[1] + xm)) + x * mom[5];
// + m03 ( = m03' + 3*y*m02' + 3*y*y*m01' + y*y*y*m00' )
dst_m03[wgidy*dst_cols+wgidx] = mom[9] + y * (3. * mom[5] + y * (3. * mom[2] + ym));
}
}
//#endif
//#if defined (DOUBLE_SUPPORT)
__kernel void CvMoments_D3(__global short8* src_data, int src_rows, int src_cols, int src_step, int tileSize_width, int tileSize_height,
__global double* dst_m00,
__global double* dst_m10,
__global double* dst_m01,
__global double* dst_m20,
__global double* dst_m11,
__global double* dst_m02,
__global double* dst_m30,
__global double* dst_m21,
__global double* dst_m12,
__global double* dst_m03,
int dst_cols, int dst_step,
int type, int depth, int cn, int coi, int binary, const int TILE_SIZE)
{
short tmp_coi[8]; // get the coi data
short8 tmp[32];
int VLEN_S =8; // vector length of short
int gidy = get_global_id(0);
int gidx = get_global_id(1);
int wgidy = get_group_id(0);
int wgidx = get_group_id(1);
int lidy = get_local_id(0);
int lidx = get_local_id(1);
int y = wgidy*TILE_SIZE; // real Y index of pixel
int x = wgidx*TILE_SIZE; // real X index of pixel
int kcn = (cn==2)?2:4;
int rstep = min(src_step/2, TILE_SIZE);
tileSize_height = min(TILE_SIZE, src_rows - y);
tileSize_width = min(TILE_SIZE, src_cols -x);
if(tileSize_width < TILE_SIZE)
for(int i = tileSize_width; i < rstep; i++ )
*((__global short*)src_data+(y+lidy)*src_step/2+x+i) = 0;
if( coi > 0 )
for(int i=0; i < tileSize_width; i+=VLEN_S)
{
for(int j=0; j<VLEN_S; j++)
tmp_coi[j] = *((__global short*)src_data+(y+lidy)*src_step/2+(x+i+j)*kcn+coi-1);
tmp[i/VLEN_S] = (short8)(tmp_coi[0],tmp_coi[1],tmp_coi[2],tmp_coi[3],tmp_coi[4],tmp_coi[5],tmp_coi[6],tmp_coi[7]);
}
else
for(int i=0; i < tileSize_width; i+=VLEN_S)
tmp[i/VLEN_S] = *(src_data+(y+lidy)*src_step/(2*VLEN_S)+(x+i)/VLEN_S);
short8 zero = (short8)(0);
short8 full = (short8)(255);
if( binary )
for(int i=0; i < tileSize_width; i+=(VLEN_S))
tmp[i/VLEN_S] = (tmp[i/VLEN_S]!=zero)?full:zero;
double mom[10];
__local long m[10][128];
if(lidy == 0)
for(int i=0; i<10; i++)
for(int j=0; j<128; j++)
m[i][j]=0;
barrier(CLK_LOCAL_MEM_FENCE);
long lm[10] = {0};
int8 x0 = (int8)(0);
int8 x1 = (int8)(0);
int8 x2 = (int8)(0);
long8 x3 = (long8)(0);
for( int xt = 0 ; xt < tileSize_width; xt+= (VLEN_S))
{
int8 v_xt = (int8)(xt, xt+1, xt+2, xt+3, xt+4, xt+5, xt+6, xt+7);
int8 p = convert_int8(tmp[xt/VLEN_S]);
int8 xp = v_xt * p, xxp = xp * v_xt;
x0 += p;
x1 += xp;
x2 += xxp;
x3 += convert_long8(xxp) * convert_long8(v_xt);
}
x0.s0 += x0.s1 + x0.s2 + x0.s3 + x0.s4 + x0.s5 + x0.s6 + x0.s7;
x1.s0 += x1.s1 + x1.s2 + x1.s3 + x1.s4 + x1.s5 + x1.s6 + x1.s7;
x2.s0 += x2.s1 + x2.s2 + x2.s3 + x2.s4 + x2.s5 + x2.s6 + x2.s7;
x3.s0 += x3.s1 + x3.s2 + x3.s3 + x3.s4 + x3.s5 + x3.s6 + x3.s7;
int py = lidy * x0.s0, sy = lidy*lidy;
int bheight = min(tileSize_height, TILE_SIZE/2);
if(bheight >= TILE_SIZE/2&&lidy > bheight-1&&lidy < tileSize_height)
{
m[9][lidy-bheight] = ((long)py) * sy; // m03
m[8][lidy-bheight] = ((long)x1.s0) * sy; // m12
m[7][lidy-bheight] = ((long)x2.s0) * lidy; // m21
m[6][lidy-bheight] = x3.s0; // m30
m[5][lidy-bheight] = x0.s0 * sy; // m02
m[4][lidy-bheight] = x1.s0 * lidy; // m11
m[3][lidy-bheight] = x2.s0; // m20
m[2][lidy-bheight] = py; // m01
m[1][lidy-bheight] = x1.s0; // m10
m[0][lidy-bheight] = x0.s0; // m00
}
else if(lidy < bheight)
{
lm[9] = ((long)py) * sy; // m03
lm[8] = ((long)(x1.s0)) * sy; // m12
lm[7] = ((long)(x2.s0)) * lidy; // m21
lm[6] = x3.s0; // m30
lm[5] = x0.s0 * sy; // m02
lm[4] = x1.s0 * lidy; // m11
lm[3] = x2.s0; // m20
lm[2] = py; // m01
lm[1] = x1.s0; // m10
lm[0] = x0.s0; // m00
}
barrier(CLK_LOCAL_MEM_FENCE);
for( int j = TILE_SIZE/2; j >=1; j = j/2 )
{
if(lidy < j)
for( int i = 0; i < 10; i++ )
lm[i] = lm[i] + m[i][lidy];
barrier(CLK_LOCAL_MEM_FENCE);
if(lidy >= j/2&&lidy < j)
for( int i = 0; i < 10; i++ )
m[i][lidy-j/2] = lm[i];
barrier(CLK_LOCAL_MEM_FENCE);
}
if(lidy ==0 &&lidx ==0)
{
for(int mt = 0; mt < 10; mt++ )
mom[mt] = (double)lm[mt];
if(binary)
{
double s = 1./255;
for( int mt = 0; mt < 10; mt++ )
mom[mt] *= s;
}
double xm = x * mom[0], ym = y*mom[0];
// accumulate moments computed in each tile
// + m00 ( = m00' )
dst_m00[wgidy*dst_cols+wgidx] = mom[0];
// + m10 ( = m10' + x*m00' )
dst_m10[wgidy*dst_cols+wgidx] = mom[1] + xm;
// + m01 ( = m01' + y*m00' )
dst_m01[wgidy*dst_cols+wgidx] = mom[2] + ym;
// + m20 ( = m20' + 2*x*m10' + x*x*m00' )
dst_m20[wgidy*dst_cols+wgidx] = mom[3] + x * (mom[1] * 2 + xm);
// + m11 ( = m11' + x*m01' + y*m10' + x*y*m00' )
dst_m11[wgidy*dst_cols+wgidx] = mom[4] + x * (mom[2] + ym) + y * mom[1];
// + m02 ( = m02' + 2*y*m01' + y*y*m00' )
dst_m02[wgidy*dst_cols+wgidx] = mom[5] + y * (mom[2] * 2 + ym);
// + m30 ( = m30' + 3*x*m20' + 3*x*x*m10' + x*x*x*m00' )
dst_m30[wgidy*dst_cols+wgidx] = mom[6] + x * (3. * mom[3] + x * (3. * mom[1] + xm));
// + m21 ( = m21' + x*(2*m11' + 2*y*m10' + x*m01' + x*y*m00') + y*m20')
dst_m21[wgidy*dst_cols+wgidx] = mom[7] + x * (2 * (mom[4] + y * mom[1]) + x * (mom[2] + ym)) + y * mom[3];
// + m12 ( = m12' + y*(2*m11' + 2*x*m01' + y*m10' + x*y*m00') + x*m02')
dst_m12[wgidy*dst_cols+wgidx] = mom[8] + y * (2 * (mom[4] + x * mom[2]) + y * (mom[1] + xm)) + x * mom[5];
// + m03 ( = m03' + 3*y*m02' + 3*y*y*m01' + y*y*y*m00' )
dst_m03[wgidy*dst_cols+wgidx] = mom[9] + y * (3. * mom[5] + y * (3. * mom[2] + ym));
}
}
//#endif
//#if defined (DOUBLE_SUPPORT)
__kernel void CvMoments_D5( __global float* src_data, int src_rows, int src_cols, int src_step, int tileSize_width, int tileSize_height,
__global double* dst_m00,
__global double* dst_m10,
__global double* dst_m01,
__global double* dst_m20,
__global double* dst_m11,
__global double* dst_m02,
__global double* dst_m30,
__global double* dst_m21,
__global double* dst_m12,
__global double* dst_m03,
int dst_cols, int dst_step,
int type, int depth, int cn, int coi, int binary, const int TILE_SIZE)
{
float tmp_coi[4]; // get the coi data
float4 tmp[64] ;
int VLEN_F = 4; // vector length of float
int gidy = get_global_id(0);
int gidx = get_global_id(1);
int wgidy = get_group_id(0);
int wgidx = get_group_id(1);
int lidy = get_local_id(0);
int lidx = get_local_id(1);
int y = wgidy*TILE_SIZE; // real Y index of pixel
int x = wgidx*TILE_SIZE; // real X index of pixel
int kcn = (cn==2)?2:4;
int rstep = min(src_step/4, TILE_SIZE);
tileSize_height = min(TILE_SIZE, src_rows - y);
tileSize_width = min(TILE_SIZE, src_cols -x);
if(tileSize_width < TILE_SIZE)
for(int i = tileSize_width; i < rstep; i++ )
*((__global float*)src_data+(y+lidy)*src_step/4+x+i) = 0;
if( coi > 0 )
for(int i=0; i < tileSize_width; i+=VLEN_F)
{
for(int j=0; j<4; j++)
tmp_coi[j] = *(src_data+(y+lidy)*src_step/4+(x+i+j)*kcn+coi-1);
tmp[i/VLEN_F] = (float4)(tmp_coi[0],tmp_coi[1],tmp_coi[2],tmp_coi[3]);
}
else
for(int i=0; i < tileSize_width; i+=VLEN_F)
tmp[i/VLEN_F] = (float4)(*(src_data+(y+lidy)*src_step/4+x+i),*(src_data+(y+lidy)*src_step/4+x+i+1),*(src_data+(y+lidy)*src_step/4+x+i+2),*(src_data+(y+lidy)*src_step/4+x+i+3));
float4 zero = (float4)(0);
float4 full = (float4)(255);
if( binary )
for(int i=0; i < tileSize_width; i+=4)
tmp[i/VLEN_F] = (tmp[i/VLEN_F]!=zero)?full:zero;
double mom[10];
__local double m[10][128];
if(lidy == 0)
for(int i = 0; i < 10; i ++)
for(int j = 0; j < 128; j ++)
m[i][j] = 0;
barrier(CLK_LOCAL_MEM_FENCE);
double lm[10] = {0};
double4 x0 = (double4)(0);
double4 x1 = (double4)(0);
double4 x2 = (double4)(0);
double4 x3 = (double4)(0);
for( int xt = 0 ; xt < tileSize_width; xt+=VLEN_F )
{
double4 v_xt = (double4)(xt, xt+1, xt+2, xt+3);
double4 p = convert_double4(tmp[xt/VLEN_F]);
double4 xp = v_xt * p, xxp = xp * v_xt;
x0 += p;
x1 += xp;
x2 += xxp;
x3 += xxp * v_xt;
}
x0.s0 += x0.s1 + x0.s2 + x0.s3;
x1.s0 += x1.s1 + x1.s2 + x1.s3;
x2.s0 += x2.s1 + x2.s2 + x2.s3;
x3.s0 += x3.s1 + x3.s2 + x3.s3;
/*
double py = lidy * x0.s0, sy = lidy*lidy;
int bheight = min(tileSize_height, TILE_SIZE/2);
if(bheight >= TILE_SIZE/2&&lidy > bheight-1&&lidy < tileSize_height)
{
m[9][lidy-bheight] = ((double)py) * sy; // m03
m[8][lidy-bheight] = ((double)x1.s0) * sy; // m12
m[7][lidy-bheight] = ((double)x2.s0) * lidy; // m21
m[6][lidy-bheight] = x3.s0; // m30
m[5][lidy-bheight] = x0.s0 * sy; // m02
m[4][lidy-bheight] = x1.s0 * lidy; // m11
m[3][lidy-bheight] = x2.s0; // m20
m[2][lidy-bheight] = py; // m01
m[1][lidy-bheight] = x1.s0; // m10
m[0][lidy-bheight] = x0.s0; // m00
}
else if(lidy < bheight)
{
lm[9] = ((double)py) * sy; // m03
lm[8] = ((double)x1.s0) * sy; // m12
lm[7] = ((double)x2.s0) * lidy; // m21
lm[6] = x3.s0; // m30
lm[5] = x0.s0 * sy; // m02
lm[4] = x1.s0 * lidy; // m11
lm[3] = x2.s0; // m20
lm[2] = py; // m01
lm[1] = x1.s0; // m10
lm[0] = x0.s0; // m00
}
barrier(CLK_LOCAL_MEM_FENCE);
for( int j = TILE_SIZE/2; j >= 1; j = j/2 )
{
if(lidy < j)
for( int i = 0; i < 10; i++ )
lm[i] = lm[i] + m[i][lidy];
barrier(CLK_LOCAL_MEM_FENCE);
if(lidy >= j/2&&lidy < j)
for( int i = 0; i < 10; i++ )
m[i][lidy-j/2] = lm[i];
barrier(CLK_LOCAL_MEM_FENCE);
}
if(lidy == 0&&lidx == 0)
{
for(int mt = 0; mt < 10; mt++ )
mom[mt] = (double)lm[mt];
if(binary)
{
double s = 1./255;
for( int mt = 0; mt < 10; mt++ )
mom[mt] *= s;
}
double xm = x * mom[0], ym = y * mom[0];
// accumulate moments computed in each tile
// + m00 ( = m00' )
dst_m00[wgidy*dst_cols+wgidx]= mom[0];
// + m10 ( = m10' + x*m00' )
dst_m10[wgidy*dst_cols+wgidx] = mom[1] + xm;
// + m01 ( = m01' + y*m00' )
dst_m01[wgidy*dst_cols+wgidx] = mom[2] + ym;
// + m20 ( = m20' + 2*x*m10' + x*x*m00' )
dst_m20[wgidy*dst_cols+wgidx] = mom[3] + x * (mom[1] * 2 + xm);
// + m11 ( = m11' + x*m01' + y*m10' + x*y*m00' )
dst_m11[wgidy*dst_cols+wgidx] = mom[4] + x * (mom[2] + ym) + y * mom[1];
// + m02 ( = m02' + 2*y*m01' + y*y*m00' )
dst_m02[wgidy*dst_cols+wgidx]= mom[5] + y * (mom[2] * 2 + ym);
// + m30 ( = m30' + 3*x*m20' + 3*x*x*m10' + x*x*x*m00' )
dst_m30[wgidy*dst_cols+wgidx]= mom[6] + x * (3. * mom[3] + x * (3. * mom[1] + xm));
// + m21 ( = m21' + x*(2*m11' + 2*y*m10' + x*m01' + x*y*m00') + y*m20')
dst_m21[wgidy*dst_cols+wgidx] = mom[7] + x * (2 * (mom[4] + y * mom[1]) + x * (mom[2] + ym)) + y * mom[3];
// + m12 ( = m12' + y*(2*m11' + 2*x*m01' + y*m10' + x*y*m00') + x*m02')
dst_m12[wgidy*dst_cols+wgidx] = mom[8] + y * (2 * (mom[4] + x * mom[2]) + y * (mom[1] + xm)) + x * mom[5];
// + m03 ( = m03' + 3*y*m02' + 3*y*y*m01' + y*y*y*m00' )
dst_m03[wgidy*dst_cols+wgidx]= mom[9] + y * (3. * mom[5] + y * (3. * mom[2] + ym));
}*/
}
//#endif
//#if defined (DOUBLE_SUPPORT)
__kernel void CvMoments_D6(__global double* src_data, int src_rows, int src_cols, int src_step, int tileSize_width, int tileSize_height,
__global double* dst_m00,
__global double* dst_m10,
__global double* dst_m01,
__global double* dst_m20,
__global double* dst_m11,
__global double* dst_m02,
__global double* dst_m30,
__global double* dst_m21,
__global double* dst_m12,
__global double* dst_m03,
int dst_cols, int dst_step,
int type, int depth, int cn, int coi, int binary, const int TILE_SIZE)
{
double tmp_coi[4]; // get the coi data
double4 tmp[64];
int VLEN_D = 4; // length of vetor
int gidy = get_global_id(0);
int gidx = get_global_id(1);
int wgidy = get_group_id(0);
int wgidx = get_group_id(1);
int lidy = get_local_id(0);
int lidx = get_local_id(1);
int y = wgidy*TILE_SIZE; // real Y index of pixel
int x = wgidx*TILE_SIZE; // real X index of pixel
int kcn = (cn==2)?2:4;
int rstep = min(src_step/8, TILE_SIZE);
tileSize_height = min(TILE_SIZE, src_rows - y);
tileSize_width = min(TILE_SIZE, src_cols - x);
if(tileSize_width < TILE_SIZE)
for(int i = tileSize_width; i < rstep; i++ )
*((__global double*)src_data+(y+lidy)*src_step/8+x+i) = 0;
if( coi > 0 )
for(int i=0; i < tileSize_width; i+=VLEN_D)
{
for(int j=0; j<4; j++)
tmp_coi[j] = *(src_data+(y+lidy)*src_step/8+(x+i+j)*kcn+coi-1);
tmp[i/VLEN_D] = (double4)(tmp_coi[0],tmp_coi[1],tmp_coi[2],tmp_coi[3]);
}
else
for(int i=0; i < tileSize_width; i+=VLEN_D)
tmp[i/VLEN_D] = (double4)(*(src_data+(y+lidy)*src_step/8+x+i),*(src_data+(y+lidy)*src_step/8+x+i+1),*(src_data+(y+lidy)*src_step/8+x+i+2),*(src_data+(y+lidy)*src_step/8+x+i+3));
double4 zero = (double4)(0);
double4 full = (double4)(255);
if( binary )
for(int i=0; i < tileSize_width; i+=VLEN_D)
tmp[i/VLEN_D] = (tmp[i/VLEN_D]!=zero)?full:zero;
double mom[10];
__local double m[10][128];
if(lidy == 0)
for(int i=0; i<10; i++)
for(int j=0; j<128; j++)
m[i][j]=0;
barrier(CLK_LOCAL_MEM_FENCE);
double lm[10] = {0};
double4 x0 = (double4)(0);
double4 x1 = (double4)(0);
double4 x2 = (double4)(0);
double4 x3 = (double4)(0);
for( int xt = 0 ; xt < tileSize_width; xt+=VLEN_D )
{
double4 v_xt = (double4)(xt, xt+1, xt+2, xt+3);
double4 p = tmp[xt/VLEN_D];
double4 xp = v_xt * p, xxp = xp * v_xt;
x0 += p;
x1 += xp;
x2 += xxp;
x3 += xxp *v_xt;
}
x0.s0 += x0.s1 + x0.s2 + x0.s3;
x1.s0 += x1.s1 + x1.s2 + x1.s3;
x2.s0 += x2.s1 + x2.s2 + x2.s3;
x3.s0 += x3.s1 + x3.s2 + x3.s3;
double py = lidy * x0.s0, sy = lidy*lidy;
int bheight = min(tileSize_height, TILE_SIZE/2);
if(bheight >= TILE_SIZE/2&&lidy > bheight-1&&lidy < tileSize_height)
{
m[9][lidy-bheight] = ((double)py) * sy; // m03
m[8][lidy-bheight] = ((double)x1.s0) * sy; // m12
m[7][lidy-bheight] = ((double)x2.s0) * lidy; // m21
m[6][lidy-bheight] = x3.s0; // m30
m[5][lidy-bheight] = x0.s0 * sy; // m02
m[4][lidy-bheight] = x1.s0 * lidy; // m11
m[3][lidy-bheight] = x2.s0; // m20
m[2][lidy-bheight] = py; // m01
m[1][lidy-bheight] = x1.s0; // m10
m[0][lidy-bheight] = x0.s0; // m00
}
else if(lidy < bheight)
{
lm[9] = ((double)py) * sy; // m03
lm[8] = ((double)x1.s0) * sy; // m12
lm[7] = ((double)x2.s0) * lidy; // m21
lm[6] = x3.s0; // m30
lm[5] = x0.s0 * sy; // m02
lm[4] = x1.s0 * lidy; // m11
lm[3] = x2.s0; // m20
lm[2] = py; // m01
lm[1] = x1.s0; // m10
lm[0] = x0.s0; // m00
}
barrier(CLK_LOCAL_MEM_FENCE);
for( int j = TILE_SIZE/2; j >= 1; j = j/2 )
{
if(lidy < j)
for( int i = 0; i < 10; i++ )
lm[i] = lm[i] + m[i][lidy];
barrier(CLK_LOCAL_MEM_FENCE);
if(lidy >= j/2&&lidy < j)
for( int i = 0; i < 10; i++ )
m[i][lidy-j/2] = lm[i];
barrier(CLK_LOCAL_MEM_FENCE);
}
if(lidy == 0&&lidx == 0)
{
for( int mt = 0; mt < 10; mt++ )
mom[mt] = (double)lm[mt];
if(binary)
{
double s = 1./255;
for( int mt = 0; mt < 10; mt++ )
mom[mt] *= s;
}
double xm = x * mom[0], ym = y * mom[0];
// accumulate moments computed in each tile
// + m00 ( = m00' )
dst_m00[wgidy*dst_cols+wgidx] = mom[0];
// + m10 ( = m10' + x*m00' )
dst_m10[wgidy*dst_cols+wgidx] = mom[1] + xm;
// + m01 ( = m01' + y*m00' )
dst_m01[wgidy*dst_cols+wgidx] = mom[2] + ym;
// + m20 ( = m20' + 2*x*m10' + x*x*m00' )
dst_m20[wgidy*dst_cols+wgidx] = mom[3] + x * (mom[1] * 2 + xm);
// + m11 ( = m11' + x*m01' + y*m10' + x*y*m00' )
dst_m11[wgidy*dst_cols+wgidx] = mom[4] + x * (mom[2] + ym) + y * mom[1];
// + m02 ( = m02' + 2*y*m01' + y*y*m00' )
dst_m02[wgidy*dst_cols+wgidx] = mom[5] + y * (mom[2] * 2 + ym);
// + m30 ( = m30' + 3*x*m20' + 3*x*x*m10' + x*x*x*m00' )
dst_m30[wgidy*dst_cols+wgidx] = mom[6] + x * (3. * mom[3] + x * (3. * mom[1] + xm));
// + m21 ( = m21' + x*(2*m11' + 2*y*m10' + x*m01' + x*y*m00') + y*m20')
dst_m21[wgidy*dst_cols+wgidx] = mom[7] + x * (2 * (mom[4] + y * mom[1]) + x * (mom[2] + ym)) + y * mom[3];
// + m12 ( = m12' + y*(2*m11' + 2*x*m01' + y*m10' + x*y*m00') + x*m02')
dst_m12[wgidy*dst_cols+wgidx] = mom[8] + y * (2 * (mom[4] + x * mom[2]) + y * (mom[1] + xm)) + x * mom[5];
// + m03 ( = m03' + 3*y*m02' + 3*y*y*m01' + y*y*y*m00' )
dst_m03[wgidy*dst_cols+wgidx] = mom[9] + y * (3. * mom[5] + y * (3. * mom[2] + ym));
}
}
//#endif

View File

@@ -43,10 +43,39 @@
//
//M*/
#pragma OPENCL EXTENSION cl_amd_printf : enable
#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable
#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable
// specialized for non-image2d_t supported platform, intel HD4000, for example
#ifdef DISABLE_IMAGE2D
#define IMAGE_INT32 __global uint *
#define IMAGE_INT8 __global uchar *
#else
#define IMAGE_INT32 image2d_t
#define IMAGE_INT8 image2d_t
#endif
uint read_sumTex(IMAGE_INT32 img, sampler_t sam, int2 coord, int rows, int cols, int elemPerRow)
{
#ifdef DISABLE_IMAGE2D
int x = clamp(coord.x, 0, cols);
int y = clamp(coord.y, 0, rows);
return img[elemPerRow * y + x];
#else
return read_imageui(img, sam, coord).x;
#endif
}
uchar read_imgTex(IMAGE_INT8 img, sampler_t sam, float2 coord, int rows, int cols, int elemPerRow)
{
#ifdef DISABLE_IMAGE2D
int x = clamp(convert_int_rte(coord.x), 0, cols - 1);
int y = clamp(convert_int_rte(coord.y), 0, rows - 1);
return img[elemPerRow * y + x];
#else
return (uchar)read_imageui(img, sam, coord).x;
#endif
}
// dynamically change the precision used for floating type
#if defined (__ATI__) || defined (__NVIDIA__)
@@ -58,14 +87,24 @@
// Image read mode
__constant sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;
#ifndef FLT_EPSILON
#define FLT_EPSILON (1e-15)
#define CV_PI_F 3.14159265f
#endif
#ifndef CV_PI_F
#define CV_PI_F 3.14159265f
#endif
// Use integral image to calculate haar wavelets.
// N = 2
// for simple haar paatern
float icvCalcHaarPatternSum_2(image2d_t sumTex, __constant float src[2][5], int oldSize, int newSize, int y, int x)
float icvCalcHaarPatternSum_2(
IMAGE_INT32 sumTex,
__constant float src[2][5],
int oldSize,
int newSize,
int y, int x,
int rows, int cols, int elemPerRow)
{
float ratio = (float)newSize / oldSize;
@@ -81,11 +120,10 @@ float icvCalcHaarPatternSum_2(image2d_t sumTex, __constant float src[2][5], int
int dy2 = convert_int_rte(ratio * src[k][3]);
F t = 0;
t += read_imageui(sumTex, sampler, (int2)(x + dx1, y + dy1)).x;
t -= read_imageui(sumTex, sampler, (int2)(x + dx1, y + dy2)).x;
t -= read_imageui(sumTex, sampler, (int2)(x + dx2, y + dy1)).x;
t += read_imageui(sumTex, sampler, (int2)(x + dx2, y + dy2)).x;
t += read_sumTex( sumTex, sampler, (int2)(x + dx1, y + dy1), rows, cols, elemPerRow );
t -= read_sumTex( sumTex, sampler, (int2)(x + dx1, y + dy2), rows, cols, elemPerRow );
t -= read_sumTex( sumTex, sampler, (int2)(x + dx2, y + dy1), rows, cols, elemPerRow );
t += read_sumTex( sumTex, sampler, (int2)(x + dx2, y + dy2), rows, cols, elemPerRow );
d += t * src[k][4] / ((dx2 - dx1) * (dy2 - dy1));
}
@@ -93,7 +131,13 @@ float icvCalcHaarPatternSum_2(image2d_t sumTex, __constant float src[2][5], int
}
// N = 3
float icvCalcHaarPatternSum_3(image2d_t sumTex, __constant float src[3][5], int oldSize, int newSize, int y, int x)
float icvCalcHaarPatternSum_3(
IMAGE_INT32 sumTex,
__constant float src[2][5],
int oldSize,
int newSize,
int y, int x,
int rows, int cols, int elemPerRow)
{
float ratio = (float)newSize / oldSize;
@@ -109,11 +153,10 @@ float icvCalcHaarPatternSum_3(image2d_t sumTex, __constant float src[3][5], int
int dy2 = convert_int_rte(ratio * src[k][3]);
F t = 0;
t += read_imageui(sumTex, sampler, (int2)(x + dx1, y + dy1)).x;
t -= read_imageui(sumTex, sampler, (int2)(x + dx1, y + dy2)).x;
t -= read_imageui(sumTex, sampler, (int2)(x + dx2, y + dy1)).x;
t += read_imageui(sumTex, sampler, (int2)(x + dx2, y + dy2)).x;
t += read_sumTex( sumTex, sampler, (int2)(x + dx1, y + dy1), rows, cols, elemPerRow );
t -= read_sumTex( sumTex, sampler, (int2)(x + dx1, y + dy2), rows, cols, elemPerRow );
t -= read_sumTex( sumTex, sampler, (int2)(x + dx2, y + dy1), rows, cols, elemPerRow );
t += read_sumTex( sumTex, sampler, (int2)(x + dx2, y + dy2), rows, cols, elemPerRow );
d += t * src[k][4] / ((dx2 - dx1) * (dy2 - dy1));
}
@@ -121,7 +164,13 @@ float icvCalcHaarPatternSum_3(image2d_t sumTex, __constant float src[3][5], int
}
// N = 4
float icvCalcHaarPatternSum_4(image2d_t sumTex, __constant float src[4][5], int oldSize, int newSize, int y, int x)
float icvCalcHaarPatternSum_4(
IMAGE_INT32 sumTex,
__constant float src[2][5],
int oldSize,
int newSize,
int y, int x,
int rows, int cols, int elemPerRow)
{
float ratio = (float)newSize / oldSize;
@@ -137,11 +186,10 @@ float icvCalcHaarPatternSum_4(image2d_t sumTex, __constant float src[4][5], int
int dy2 = convert_int_rte(ratio * src[k][3]);
F t = 0;
t += read_imageui(sumTex, sampler, (int2)(x + dx1, y + dy1)).x;
t -= read_imageui(sumTex, sampler, (int2)(x + dx1, y + dy2)).x;
t -= read_imageui(sumTex, sampler, (int2)(x + dx2, y + dy1)).x;
t += read_imageui(sumTex, sampler, (int2)(x + dx2, y + dy2)).x;
t += read_sumTex( sumTex, sampler, (int2)(x + dx1, y + dy1), rows, cols, elemPerRow );
t -= read_sumTex( sumTex, sampler, (int2)(x + dx1, y + dy2), rows, cols, elemPerRow );
t -= read_sumTex( sumTex, sampler, (int2)(x + dx2, y + dy1), rows, cols, elemPerRow );
t += read_sumTex( sumTex, sampler, (int2)(x + dx2, y + dy2), rows, cols, elemPerRow );
d += t * src[k][4] / ((dx2 - dx1) * (dy2 - dy1));
}
@@ -172,7 +220,7 @@ __inline int calcSize(int octave, int layer)
//calculate targeted layer per-pixel determinant and trace with an integral image
__kernel void icvCalcLayerDetAndTrace(
image2d_t sumTex, // input integral image
IMAGE_INT32 sumTex, // input integral image
__global float * det, // output Determinant
__global float * trace, // output trace
int det_step, // the step of det in bytes
@@ -181,11 +229,13 @@ __kernel void icvCalcLayerDetAndTrace(
int c_img_cols,
int c_nOctaveLayers,
int c_octave,
int c_layer_rows
int c_layer_rows,
int sumTex_step
)
{
det_step /= sizeof(*det);
trace_step /= sizeof(*trace);
sumTex_step/= sizeof(uint);
// Determine the indices
const int gridDim_y = get_num_groups(1) / (c_nOctaveLayers + 2);
const int blockIdx_y = get_group_id(1) % gridDim_y;
@@ -205,9 +255,9 @@ __kernel void icvCalcLayerDetAndTrace(
if (size <= c_img_rows && size <= c_img_cols && i < samples_i && j < samples_j)
{
const float dx = icvCalcHaarPatternSum_3(sumTex, c_DX , 9, size, i << c_octave, j << c_octave);
const float dy = icvCalcHaarPatternSum_3(sumTex, c_DY , 9, size, i << c_octave, j << c_octave);
const float dxy = icvCalcHaarPatternSum_4(sumTex, c_DXY, 9, size, i << c_octave, j << c_octave);
const float dx = icvCalcHaarPatternSum_3(sumTex, c_DX , 9, size, i << c_octave, j << c_octave, c_img_rows, c_img_cols, sumTex_step);
const float dy = icvCalcHaarPatternSum_3(sumTex, c_DY , 9, size, i << c_octave, j << c_octave, c_img_rows, c_img_cols, sumTex_step);
const float dxy = icvCalcHaarPatternSum_4(sumTex, c_DXY, 9, size, i << c_octave, j << c_octave, c_img_rows, c_img_cols, sumTex_step);
det [j + margin + det_step * (layer * c_layer_rows + i + margin)] = dx * dy - 0.81f * dxy * dxy;
trace[j + margin + trace_step * (layer * c_layer_rows + i + margin)] = dx + dy;
@@ -220,7 +270,7 @@ __kernel void icvCalcLayerDetAndTrace(
__constant float c_DM[5] = {0, 0, 9, 9, 1};
bool within_check(image2d_t maskSumTex, int sum_i, int sum_j, int size)
bool within_check(IMAGE_INT32 maskSumTex, int sum_i, int sum_j, int size, int rows, int cols, int step)
{
float ratio = (float)size / 9.0f;
@@ -233,10 +283,10 @@ bool within_check(image2d_t maskSumTex, int sum_i, int sum_j, int size)
float t = 0;
t += read_imageui(maskSumTex, sampler, (int2)(sum_j + dx1, sum_i + dy1)).x;
t -= read_imageui(maskSumTex, sampler, (int2)(sum_j + dx1, sum_i + dy2)).x;
t -= read_imageui(maskSumTex, sampler, (int2)(sum_j + dx2, sum_i + dy1)).x;
t += read_imageui(maskSumTex, sampler, (int2)(sum_j + dx2, sum_i + dy2)).x;
t += read_sumTex(maskSumTex, sampler, (int2)(sum_j + dx1, sum_i + dy1), rows, cols, step);
t -= read_sumTex(maskSumTex, sampler, (int2)(sum_j + dx1, sum_i + dy2), rows, cols, step);
t -= read_sumTex(maskSumTex, sampler, (int2)(sum_j + dx2, sum_i + dy1), rows, cols, step);
t += read_sumTex(maskSumTex, sampler, (int2)(sum_j + dx2, sum_i + dy2), rows, cols, step);
d += t * c_DM[4] / ((dx2 - dx1) * (dy2 - dy1));
@@ -261,7 +311,8 @@ __kernel
int c_layer_cols,
int c_max_candidates,
float c_hessianThreshold,
image2d_t maskSumTex
IMAGE_INT32 maskSumTex,
int mask_step
)
{
volatile __local float N9[768]; // threads.x * threads.y * 3
@@ -269,6 +320,7 @@ __kernel
det_step /= sizeof(*det);
trace_step /= sizeof(*trace);
maxCounter += counter_offset;
mask_step /= sizeof(uint);
// Determine the indices
const int gridDim_y = get_num_groups(1) / c_nOctaveLayers;
@@ -321,7 +373,7 @@ __kernel
const int sum_i = (i - ((size >> 1) >> c_octave)) << c_octave;
const int sum_j = (j - ((size >> 1) >> c_octave)) << c_octave;
if (within_check(maskSumTex, sum_i, sum_j, size))
if (within_check(maskSumTex, sum_i, sum_j, size, c_img_rows, c_img_cols, mask_step))
{
// Check to see if we have a max (in its 26 neighbours)
const bool condmax = val0 > N9[localLin - 1 - get_local_size(0) - zoff]
@@ -704,14 +756,16 @@ void reduce_32_sum(volatile __local float * data, float partial_reduction, int
__kernel
void icvCalcOrientation(
image2d_t sumTex,
IMAGE_INT32 sumTex,
__global float * keypoints,
int keypoints_step,
int c_img_rows,
int c_img_cols
int c_img_cols,
int sum_step
)
{
keypoints_step /= sizeof(*keypoints);
sum_step /= sizeof(uint);
__global float* featureX = keypoints + X_ROW * keypoints_step;
__global float* featureY = keypoints + Y_ROW * keypoints_step;
__global float* featureSize = keypoints + SIZE_ROW * keypoints_step;
@@ -754,8 +808,8 @@ __kernel
if (y >= 0 && y < (c_img_rows + 1) - grad_wav_size &&
x >= 0 && x < (c_img_cols + 1) - grad_wav_size)
{
X = c_aptW[tid] * icvCalcHaarPatternSum_2(sumTex, c_NX, 4, grad_wav_size, y, x);
Y = c_aptW[tid] * icvCalcHaarPatternSum_2(sumTex, c_NY, 4, grad_wav_size, y, x);
X = c_aptW[tid] * icvCalcHaarPatternSum_2(sumTex, c_NX, 4, grad_wav_size, y, x, c_img_rows, c_img_cols, sum_step);
Y = c_aptW[tid] * icvCalcHaarPatternSum_2(sumTex, c_NY, 4, grad_wav_size, y, x, c_img_rows, c_img_cols, sum_step);
angle = atan2(Y, X);
@@ -881,20 +935,20 @@ __constant float c_DW[PATCH_SZ * PATCH_SZ] =
// utility for linear filter
inline uchar readerGet(
image2d_t src,
IMAGE_INT8 src,
const float centerX, const float centerY, const float win_offset, const float cos_dir, const float sin_dir,
int i, int j
int i, int j, int rows, int cols, int elemPerRow
)
{
float pixel_x = centerX + (win_offset + j) * cos_dir + (win_offset + i) * sin_dir;
float pixel_y = centerY - (win_offset + j) * sin_dir + (win_offset + i) * cos_dir;
return (uchar)read_imageui(src, sampler, (float2)(pixel_x, pixel_y)).x;
return read_imgTex(src, sampler, (float2)(pixel_x, pixel_y), rows, cols, elemPerRow);
}
inline float linearFilter(
image2d_t src,
IMAGE_INT8 src,
const float centerX, const float centerY, const float win_offset, const float cos_dir, const float sin_dir,
float y, float x
float y, float x, int rows, int cols, int elemPerRow
)
{
x -= 0.5f;
@@ -907,30 +961,33 @@ inline float linearFilter(
const int x2 = x1 + 1;
const int y2 = y1 + 1;
uchar src_reg = readerGet(src, centerX, centerY, win_offset, cos_dir, sin_dir, y1, x1);
uchar src_reg = readerGet(src, centerX, centerY, win_offset, cos_dir, sin_dir, y1, x1, rows, cols, elemPerRow);
out = out + src_reg * ((x2 - x) * (y2 - y));
src_reg = readerGet(src, centerX, centerY, win_offset, cos_dir, sin_dir, y1, x2);
src_reg = readerGet(src, centerX, centerY, win_offset, cos_dir, sin_dir, y1, x2, rows, cols, elemPerRow);
out = out + src_reg * ((x - x1) * (y2 - y));
src_reg = readerGet(src, centerX, centerY, win_offset, cos_dir, sin_dir, y2, x1);
src_reg = readerGet(src, centerX, centerY, win_offset, cos_dir, sin_dir, y2, x1, rows, cols, elemPerRow);
out = out + src_reg * ((x2 - x) * (y - y1));
src_reg = readerGet(src, centerX, centerY, win_offset, cos_dir, sin_dir, y2, x2);
src_reg = readerGet(src, centerX, centerY, win_offset, cos_dir, sin_dir, y2, x2, rows, cols, elemPerRow);
out = out + src_reg * ((x - x1) * (y - y1));
return out;
}
void calc_dx_dy(
image2d_t imgTex,
IMAGE_INT8 imgTex,
volatile __local float s_dx_bin[25],
volatile __local float s_dy_bin[25],
volatile __local float s_PATCH[6][6],
__global const float* featureX,
__global const float* featureY,
__global const float* featureSize,
__global const float* featureDir
__global const float* featureDir,
int rows,
int cols,
int elemPerRow
)
{
const float centerX = featureX[get_group_id(0)];
@@ -965,7 +1022,7 @@ void calc_dx_dy(
const float icoo = ((float)yIndex / (PATCH_SZ + 1)) * win_size;
const float jcoo = ((float)xIndex / (PATCH_SZ + 1)) * win_size;
s_PATCH[get_local_id(1)][get_local_id(0)] = linearFilter(imgTex, centerX, centerY, win_offset, cos_dir, sin_dir, icoo, jcoo);
s_PATCH[get_local_id(1)][get_local_id(0)] = linearFilter(imgTex, centerX, centerY, win_offset, cos_dir, sin_dir, icoo, jcoo, rows, cols, elemPerRow);
barrier(CLK_LOCAL_MEM_FENCE);
@@ -1035,16 +1092,18 @@ void reduce_sum25(
__kernel
void compute_descriptors64(
image2d_t imgTex,
IMAGE_INT8 imgTex,
volatile __global float * descriptors,
__global const float * keypoints,
int descriptors_step,
int keypoints_step
int keypoints_step,
int rows,
int cols,
int img_step
)
{
descriptors_step /= sizeof(float);
keypoints_step /= sizeof(float);
__global const float * featureX = keypoints + X_ROW * keypoints_step;
__global const float * featureY = keypoints + Y_ROW * keypoints_step;
__global const float * featureSize = keypoints + SIZE_ROW * keypoints_step;
@@ -1057,7 +1116,7 @@ __kernel
volatile __local float sdyabs[25];
volatile __local float s_PATCH[6][6];
calc_dx_dy(imgTex, sdx, sdy, s_PATCH, featureX, featureY, featureSize, featureDir);
calc_dx_dy(imgTex, sdx, sdy, s_PATCH, featureX, featureY, featureSize, featureDir, rows, cols, img_step);
barrier(CLK_LOCAL_MEM_FENCE);
const int tid = get_local_id(1) * get_local_size(0) + get_local_id(0);
@@ -1066,10 +1125,10 @@ __kernel
{
sdxabs[tid] = fabs(sdx[tid]); // |dx| array
sdyabs[tid] = fabs(sdy[tid]); // |dy| array
barrier(CLK_LOCAL_MEM_FENCE);
//barrier(CLK_LOCAL_MEM_FENCE);
reduce_sum25(sdx, sdy, sdxabs, sdyabs, tid);
barrier(CLK_LOCAL_MEM_FENCE);
//barrier(CLK_LOCAL_MEM_FENCE);
volatile __global float* descriptors_block = descriptors + descriptors_step * get_group_id(0) + (get_group_id(1) << 2);
@@ -1085,11 +1144,14 @@ __kernel
}
__kernel
void compute_descriptors128(
image2d_t imgTex,
IMAGE_INT8 imgTex,
__global volatile float * descriptors,
__global float * keypoints,
int descriptors_step,
int keypoints_step
int keypoints_step,
int rows,
int cols,
int img_step
)
{
descriptors_step /= sizeof(*descriptors);
@@ -1111,7 +1173,7 @@ __kernel
volatile __local float sdabs2[25];
volatile __local float s_PATCH[6][6];
calc_dx_dy(imgTex, sdx, sdy, s_PATCH, featureX, featureY, featureSize, featureDir);
calc_dx_dy(imgTex, sdx, sdy, s_PATCH, featureX, featureY, featureSize, featureDir, rows, cols, img_step);
barrier(CLK_LOCAL_MEM_FENCE);
const int tid = get_local_id(1) * get_local_size(0) + get_local_id(0);
@@ -1132,10 +1194,10 @@ __kernel
sd2[tid] = sdx[tid];
sdabs2[tid] = fabs(sdx[tid]);
}
barrier(CLK_LOCAL_MEM_FENCE);
//barrier(CLK_LOCAL_MEM_FENCE);
reduce_sum25(sd1, sd2, sdabs1, sdabs2, tid);
barrier(CLK_LOCAL_MEM_FENCE);
//barrier(CLK_LOCAL_MEM_FENCE);
volatile __global float* descriptors_block = descriptors + descriptors_step * get_group_id(0) + (get_group_id(1) << 3);
@@ -1162,10 +1224,10 @@ __kernel
sd2[tid] = sdy[tid];
sdabs2[tid] = fabs(sdy[tid]);
}
barrier(CLK_LOCAL_MEM_FENCE);
//barrier(CLK_LOCAL_MEM_FENCE);
reduce_sum25(sd1, sd2, sdabs1, sdabs2, tid);
barrier(CLK_LOCAL_MEM_FENCE);
//barrier(CLK_LOCAL_MEM_FENCE);
// write dy (dx >= 0), |dy| (dx >= 0), dy (dx < 0), |dy| (dx < 0)
if (tid == 0)

View File

@@ -68,6 +68,8 @@ namespace cv
extern const char *operator_setTo;
extern const char *operator_setToM;
extern const char *convertC3C4;
extern DevMemType gDeviceMemType;
extern DevMemRW gDeviceMemRW;
}
}
@@ -911,7 +913,17 @@ oclMat cv::ocl::oclMat::reshape(int new_cn, int new_rows) const
}
void cv::ocl::oclMat::createEx(Size size, int type, DevMemRW rw_type, DevMemType mem_type)
{
createEx(size.height, size.width, type, rw_type, mem_type);
}
void cv::ocl::oclMat::create(int _rows, int _cols, int _type)
{
createEx(_rows, _cols, _type, gDeviceMemRW, gDeviceMemType);
}
void cv::ocl::oclMat::createEx(int _rows, int _cols, int _type, DevMemRW rw_type, DevMemType mem_type)
{
clCxt = Context::getContext();
/* core logic */
@@ -936,7 +948,7 @@ void cv::ocl::oclMat::create(int _rows, int _cols, int _type)
size_t esz = elemSize();
void *dev_ptr;
openCLMallocPitch(clCxt, &dev_ptr, &step, GPU_MATRIX_MALLOC_STEP(esz * cols), rows);
openCLMallocPitchEx(clCxt, &dev_ptr, &step, GPU_MATRIX_MALLOC_STEP(esz * cols), rows, rw_type, mem_type);
//openCLMallocPitch(clCxt,&dev_ptr, &step, esz * cols, rows);
if (esz * cols == step)

View File

@@ -217,6 +217,36 @@ namespace cv
{
openCLFree(texture);
}
bool support_image2d(Context *clCxt)
{
static const char * _kernel_string = "__kernel void test_func(image2d_t img) {}";
static bool _isTested = false;
static bool _support = false;
if(_isTested)
{
return _support;
}
try
{
cv::ocl::openCLGetKernelFromSource(clCxt, &_kernel_string, "test_func");
_support = true;
}
catch (const cv::Exception& e)
{
if(e.code == -217)
{
_support = false;
}
else
{
// throw e once again
throw e;
}
}
_isTested = true;
return _support;
}
}//namespace ocl
}//namespace cv

View File

@@ -69,6 +69,10 @@ namespace cv
// 2. for faster clamping, there is no buffer padding for the constructed texture
cl_mem bindTexture(const oclMat &mat);
void releaseTexture(cl_mem& texture);
// returns whether the current context supports image2d_t format or not
bool support_image2d(Context *clCxt = Context::getContext());
}//namespace ocl
}//namespace cv

379
modules/ocl/src/moments.cpp Normal file
View File

@@ -0,0 +1,379 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// @Authors
// Sen Liu, sen@multicorewareinc.com
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other oclMaterials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "precomp.hpp"
#include <iostream>
namespace cv
{
namespace ocl
{
extern const char *moments;
// The function calculates center of gravity and the central second order moments
static void icvCompleteMomentState( CvMoments* moments )
{
double cx = 0, cy = 0;
double mu20, mu11, mu02;
assert( moments != 0 );
moments->inv_sqrt_m00 = 0;
if( fabs(moments->m00) > DBL_EPSILON )
{
double inv_m00 = 1. / moments->m00;
cx = moments->m10 * inv_m00;
cy = moments->m01 * inv_m00;
moments->inv_sqrt_m00 = std::sqrt( fabs(inv_m00) );
}
// mu20 = m20 - m10*cx
mu20 = moments->m20 - moments->m10 * cx;
// mu11 = m11 - m10*cy
mu11 = moments->m11 - moments->m10 * cy;
// mu02 = m02 - m01*cy
mu02 = moments->m02 - moments->m01 * cy;
moments->mu20 = mu20;
moments->mu11 = mu11;
moments->mu02 = mu02;
// mu30 = m30 - cx*(3*mu20 + cx*m10)
moments->mu30 = moments->m30 - cx * (3 * mu20 + cx * moments->m10);
mu11 += mu11;
// mu21 = m21 - cx*(2*mu11 + cx*m01) - cy*mu20
moments->mu21 = moments->m21 - cx * (mu11 + cx * moments->m01) - cy * mu20;
// mu12 = m12 - cy*(2*mu11 + cy*m10) - cx*mu02
moments->mu12 = moments->m12 - cy * (mu11 + cy * moments->m10) - cx * mu02;
// mu03 = m03 - cy*(3*mu02 + cy*m01)
moments->mu03 = moments->m03 - cy * (3 * mu02 + cy * moments->m01);
}
static void icvContourMoments( CvSeq* contour, CvMoments* mom )
{
if( contour->total )
{
CvSeqReader reader;
int lpt = contour->total;
double a00, a10, a01, a20, a11, a02, a30, a21, a12, a03;
int dst_type = cv::ocl::Context::getContext()->impl->double_support ? CV_64FC1 : CV_32FC1;
cvStartReadSeq( contour, &reader, 0 );
cv::ocl::oclMat dst_a00(1,lpt,dst_type);
cv::ocl::oclMat dst_a10(1,lpt,dst_type);
cv::ocl::oclMat dst_a01(1,lpt,dst_type);
cv::ocl::oclMat dst_a20(1,lpt,dst_type);
cv::ocl::oclMat dst_a11(1,lpt,dst_type);
cv::ocl::oclMat dst_a02(1,lpt,dst_type);
cv::ocl::oclMat dst_a30(1,lpt,dst_type);
cv::ocl::oclMat dst_a21(1,lpt,dst_type);
cv::ocl::oclMat dst_a12(1,lpt,dst_type);
cv::ocl::oclMat dst_a03(1,lpt,dst_type);
size_t reader_size = lpt << 1;
cv::Mat reader_mat(1,reader_size,CV_32FC1);
bool is_float = CV_SEQ_ELTYPE(contour) == CV_32FC2;
if( is_float )
{
for(size_t i = 0; i < reader_size; ++i)
{
reader_mat.at<float>(0, i++) = ((CvPoint2D32f*)(reader.ptr))->x;
reader_mat.at<float>(0, i) = ((CvPoint2D32f*)(reader.ptr))->y;
CV_NEXT_SEQ_ELEM( contour->elem_size, reader );
}
}
else
{
for(size_t i = 0; i < reader_size; ++i)
{
reader_mat.at<float>(0, i++) = ((CvPoint*)(reader.ptr))->x;
reader_mat.at<float>(0, i) = ((CvPoint*)(reader.ptr))->y;
CV_NEXT_SEQ_ELEM( contour->elem_size, reader );
}
}
cv::ocl::oclMat reader_oclmat(reader_mat);
int llength = std::min(lpt,128);
size_t localThreads[3] = { llength, 1, 1};
size_t globalThreads[3] = { lpt, 1, 1};
std::vector<std::pair<size_t , const void *> > args;
args.push_back( std::make_pair( sizeof(cl_int) , (void *)&contour->total ));
args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&reader_oclmat.data ));
args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&dst_a00.data ));
args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&dst_a10.data ));
args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&dst_a01.data ));
args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&dst_a20.data ));
args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&dst_a11.data ));
args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&dst_a02.data ));
args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&dst_a30.data ));
args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&dst_a21.data ));
args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&dst_a12.data ));
args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&dst_a03.data ));
openCLExecuteKernel(dst_a00.clCxt, &moments, "icvContourMoments", globalThreads, localThreads, args, -1, -1);
cv::Mat dst(dst_a00);
cv::Scalar s = cv::sum(dst);
a00 = s[0];
dst = dst_a10;
s = cv::sum(dst);
a10 = s[0];//dstsum[1];
dst = dst_a01;
s = cv::sum(dst);
a01 = s[0];//dstsum[2];
dst = dst_a20;
s = cv::sum(dst);
a20 = s[0];//dstsum[3];
dst = dst_a11;
s = cv::sum(dst);
a11 = s[0];//dstsum[4];
dst = dst_a02;
s = cv::sum(dst);
a02 = s[0];//dstsum[5];
dst = dst_a30;
s = cv::sum(dst);
a30 = s[0];//dstsum[6];
dst = dst_a21;
s = cv::sum(dst);
a21 = s[0];//dstsum[7];
dst = dst_a12;
s = cv::sum(dst);
a12 = s[0];//dstsum[8];
dst = dst_a03;
s = cv::sum(dst);
a03 = s[0];//dstsum[9];
double db1_2, db1_6, db1_12, db1_24, db1_20, db1_60;
if( fabs(a00) > FLT_EPSILON )
{
if( a00 > 0 )
{
db1_2 = 0.5;
db1_6 = 0.16666666666666666666666666666667;
db1_12 = 0.083333333333333333333333333333333;
db1_24 = 0.041666666666666666666666666666667;
db1_20 = 0.05;
db1_60 = 0.016666666666666666666666666666667;
}
else
{
db1_2 = -0.5;
db1_6 = -0.16666666666666666666666666666667;
db1_12 = -0.083333333333333333333333333333333;
db1_24 = -0.041666666666666666666666666666667;
db1_20 = -0.05;
db1_60 = -0.016666666666666666666666666666667;
}
// spatial moments
mom->m00 = a00 * db1_2;
mom->m10 = a10 * db1_6;
mom->m01 = a01 * db1_6;
mom->m20 = a20 * db1_12;
mom->m11 = a11 * db1_24;
mom->m02 = a02 * db1_12;
mom->m30 = a30 * db1_20;
mom->m21 = a21 * db1_60;
mom->m12 = a12 * db1_60;
mom->m03 = a03 * db1_20;
icvCompleteMomentState( mom );
}
}
}
static void ocl_cvMoments( const void* array, CvMoments* mom, int binary )
{
const int TILE_SIZE = 256;
int type, depth, cn, coi = 0;
CvMat stub, *mat = (CvMat*)array;
CvContour contourHeader;
CvSeq* contour = 0;
CvSeqBlock block;
if( CV_IS_SEQ( array ))
{
contour = (CvSeq*)array;
if( !CV_IS_SEQ_POINT_SET( contour ))
CV_Error( CV_StsBadArg, "The passed sequence is not a valid contour" );
}
if( !moments )
CV_Error( CV_StsNullPtr, "" );
memset( mom, 0, sizeof(*mom));
if( !contour )
{
mat = cvGetMat( mat, &stub, &coi );
type = CV_MAT_TYPE( mat->type );
if( type == CV_32SC2 || type == CV_32FC2 )
{
contour = cvPointSeqFromMat(
CV_SEQ_KIND_CURVE | CV_SEQ_FLAG_CLOSED,
mat, &contourHeader, &block );
}
}
if( contour )
{
icvContourMoments( contour, mom );
return;
}
type = CV_MAT_TYPE( mat->type );
depth = CV_MAT_DEPTH( type );
cn = CV_MAT_CN( type );
cv::Size size = cvGetMatSize( mat );
if( cn > 1 && coi == 0 )
CV_Error( CV_StsBadArg, "Invalid image type" );
if( size.width <= 0 || size.height <= 0 )
return;
cv::Mat src0(mat);
cv::ocl::oclMat src(src0);
cv::Size tileSize;
int blockx,blocky;
if(size.width%TILE_SIZE == 0)
blockx = size.width/TILE_SIZE;
else
blockx = size.width/TILE_SIZE + 1;
if(size.height%TILE_SIZE == 0)
blocky = size.height/TILE_SIZE;
else
blocky = size.height/TILE_SIZE + 1;
cv::ocl::oclMat dst_m00(blocky, blockx, CV_64FC1);
cv::ocl::oclMat dst_m10(blocky, blockx, CV_64FC1);
cv::ocl::oclMat dst_m01(blocky, blockx, CV_64FC1);
cv::ocl::oclMat dst_m20(blocky, blockx, CV_64FC1);
cv::ocl::oclMat dst_m11(blocky, blockx, CV_64FC1);
cv::ocl::oclMat dst_m02(blocky, blockx, CV_64FC1);
cv::ocl::oclMat dst_m30(blocky, blockx, CV_64FC1);
cv::ocl::oclMat dst_m21(blocky, blockx, CV_64FC1);
cv::ocl::oclMat dst_m12(blocky, blockx, CV_64FC1);
cv::ocl::oclMat dst_m03(blocky, blockx, CV_64FC1);
cl_mem sum = openCLCreateBuffer(src.clCxt,CL_MEM_READ_WRITE,10*sizeof(double));
int tile_width = std::min(size.width,TILE_SIZE);
int tile_height = std::min(size.height,TILE_SIZE);
size_t localThreads[3] = { tile_height, 1, 1};
size_t globalThreads[3] = { size.height, blockx, 1};
std::vector<std::pair<size_t , const void *> > args,args_sum;
args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&src.data ));
args.push_back( std::make_pair( sizeof(cl_int) , (void *)&src.rows ));
args.push_back( std::make_pair( sizeof(cl_int) , (void *)&src.cols ));
args.push_back( std::make_pair( sizeof(cl_int) , (void *)&src.step ));
args.push_back( std::make_pair( sizeof(cl_int) , (void *)&tileSize.width ));
args.push_back( std::make_pair( sizeof(cl_int) , (void *)&tileSize.height ));
args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&dst_m00.data ));
args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&dst_m10.data ));
args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&dst_m01.data ));
args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&dst_m20.data ));
args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&dst_m11.data ));
args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&dst_m02.data ));
args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&dst_m30.data ));
args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&dst_m21.data ));
args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&dst_m12.data ));
args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&dst_m03.data ));
args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dst_m00.cols ));
args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dst_m00.step ));
args.push_back( std::make_pair( sizeof(cl_int) , (void *)&type ));
args.push_back( std::make_pair( sizeof(cl_int) , (void *)&depth ));
args.push_back( std::make_pair( sizeof(cl_int) , (void *)&cn ));
args.push_back( std::make_pair( sizeof(cl_int) , (void *)&coi ));
args.push_back( std::make_pair( sizeof(cl_int) , (void *)&binary ));
args.push_back( std::make_pair( sizeof(cl_int) , (void *)&TILE_SIZE ));
openCLExecuteKernel(dst_m00.clCxt, &moments, "CvMoments", globalThreads, localThreads, args, -1, depth);
size_t localThreadss[3] = { 128, 1, 1};
size_t globalThreadss[3] = { 128, 1, 1};
args_sum.push_back( std::make_pair( sizeof(cl_int) , (void *)&src.rows ));
args_sum.push_back( std::make_pair( sizeof(cl_int) , (void *)&src.cols ));
args_sum.push_back( std::make_pair( sizeof(cl_int) , (void *)&tile_height ));
args_sum.push_back( std::make_pair( sizeof(cl_int) , (void *)&tile_width ));
args_sum.push_back( std::make_pair( sizeof(cl_int) , (void *)&TILE_SIZE ));
args_sum.push_back( std::make_pair( sizeof(cl_mem) , (void *)&sum ));
args_sum.push_back( std::make_pair( sizeof(cl_mem) , (void *)&dst_m00.data ));
args_sum.push_back( std::make_pair( sizeof(cl_mem) , (void *)&dst_m10.data ));
args_sum.push_back( std::make_pair( sizeof(cl_mem) , (void *)&dst_m01.data ));
args_sum.push_back( std::make_pair( sizeof(cl_mem) , (void *)&dst_m20.data ));
args_sum.push_back( std::make_pair( sizeof(cl_mem) , (void *)&dst_m11.data ));
args_sum.push_back( std::make_pair( sizeof(cl_mem) , (void *)&dst_m02.data ));
args_sum.push_back( std::make_pair( sizeof(cl_mem) , (void *)&dst_m30.data ));
args_sum.push_back( std::make_pair( sizeof(cl_mem) , (void *)&dst_m21.data ));
args_sum.push_back( std::make_pair( sizeof(cl_mem) , (void *)&dst_m12.data ));
args_sum.push_back( std::make_pair( sizeof(cl_mem) , (void *)&dst_m03.data ));
openCLExecuteKernel(dst_m00.clCxt, &moments, "dst_sum", globalThreadss, localThreadss, args_sum, -1, -1);
double* dstsum = new double[10];
memset(dstsum,0,10*sizeof(double));
openCLReadBuffer(dst_m00.clCxt,sum,(void *)dstsum,10*sizeof(double));
mom->m00 = dstsum[0];
mom->m10 = dstsum[1];
mom->m01 = dstsum[2];
mom->m20 = dstsum[3];
mom->m11 = dstsum[4];
mom->m02 = dstsum[5];
mom->m30 = dstsum[6];
mom->m21 = dstsum[7];
mom->m12 = dstsum[8];
mom->m03 = dstsum[9];
icvCompleteMomentState( mom );
}
Moments ocl_moments( InputArray _array, bool binaryImage )
{
CvMoments om;
Mat arr = _array.getMat();
CvMat c_array = arr;
ocl_cvMoments(&c_array, &om, binaryImage);
return om;
}
}
}

View File

@@ -93,6 +93,8 @@ namespace cv
///////////////////////////OpenCL call wrappers////////////////////////////
void openCLMallocPitch(Context *clCxt, void **dev_ptr, size_t *pitch,
size_t widthInBytes, size_t height);
void openCLMallocPitchEx(Context *clCxt, void **dev_ptr, size_t *pitch,
size_t widthInBytes, size_t height, DevMemRW rw_type, DevMemType mem_type);
void openCLMemcpy2D(Context *clCxt, void *dst, size_t dpitch,
const void *src, size_t spitch,
size_t width, size_t height, enum openCLMemcpyKind kind, int channels = -1);
@@ -141,6 +143,7 @@ namespace cv
//extra options to recognize vendor specific fp64 extensions
char extra_options[512];
std::string Binpath;
int unified_memory; //1 means integrated GPU, otherwise this value is 0
};
}
}

View File

@@ -573,8 +573,9 @@ static void lkSparse_run(oclMat &I, oclMat &J,
Context *clCxt = I.clCxt;
int elemCntPerRow = I.step / I.elemSize();
std::string kernelName = "lkSparse";
size_t localThreads[3] = { 8, 8, 1 };
size_t globalThreads[3] = { 8 * ptcount, 8, 1};
bool isImageSupported = support_image2d();
size_t localThreads[3] = { 8, isImageSupported ? 8 : 32, 1 };
size_t globalThreads[3] = { 8 * ptcount, isImageSupported ? 8 : 32, 1};
int cn = I.oclchannels();
char calcErr;
if (level == 0)
@@ -587,8 +588,9 @@ static void lkSparse_run(oclMat &I, oclMat &J,
}
std::vector<std::pair<size_t , const void *> > args;
cl_mem ITex = bindTexture(I);
cl_mem JTex = bindTexture(J);
cl_mem ITex = isImageSupported ? bindTexture(I) : (cl_mem)I.data;
cl_mem JTex = isImageSupported ? bindTexture(J) : (cl_mem)J.data;
args.push_back( std::make_pair( sizeof(cl_mem), (void *)&ITex ));
args.push_back( std::make_pair( sizeof(cl_mem), (void *)&JTex ));
@@ -601,6 +603,8 @@ static void lkSparse_run(oclMat &I, oclMat &J,
args.push_back( std::make_pair( sizeof(cl_int), (void *)&level ));
args.push_back( std::make_pair( sizeof(cl_int), (void *)&I.rows ));
args.push_back( std::make_pair( sizeof(cl_int), (void *)&I.cols ));
if (!isImageSupported)
args.push_back( std::make_pair( sizeof(cl_int), (void *)&elemCntPerRow ) );
args.push_back( std::make_pair( sizeof(cl_int), (void *)&patch.x ));
args.push_back( std::make_pair( sizeof(cl_int), (void *)&patch.y ));
args.push_back( std::make_pair( sizeof(cl_int), (void *)&cn ));
@@ -609,19 +613,14 @@ static void lkSparse_run(oclMat &I, oclMat &J,
args.push_back( std::make_pair( sizeof(cl_int), (void *)&iters ));
args.push_back( std::make_pair( sizeof(cl_char), (void *)&calcErr ));
try
if(isImageSupported)
{
openCLExecuteKernel2(clCxt, &pyrlk, kernelName, globalThreads, localThreads, args, I.oclchannels(), I.depth(), CLFLUSH);
}
catch(Exception&)
{
printf("Warning: The image2d_t is not supported by the device. Using alternative method!\n");
releaseTexture(ITex);
releaseTexture(JTex);
ITex = (cl_mem)I.data;
JTex = (cl_mem)J.data;
localThreads[1] = globalThreads[1] = 32;
args.insert( args.begin()+11, std::make_pair( sizeof(cl_int), (void *)&elemCntPerRow ) );
}
else
{
openCLExecuteKernel2(clCxt, &pyrlk_no_image, kernelName, globalThreads, localThreads, args, I.oclchannels(), I.depth(), CLFLUSH);
}
}
@@ -723,7 +722,7 @@ static void lkDense_run(oclMat &I, oclMat &J, oclMat &u, oclMat &v,
oclMat &prevU, oclMat &prevV, oclMat *err, Size winSize, int iters)
{
Context *clCxt = I.clCxt;
bool isImageSupported = clCxt->impl->devName.find("Intel(R) HD Graphics") == std::string::npos;
bool isImageSupported = support_image2d();
int elemCntPerRow = I.step / I.elemSize();
std::string kernelName = "lkDense";

View File

@@ -1,4 +1,4 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
/*M/////////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
@@ -44,6 +44,7 @@
//M*/
#include <iomanip>
#include "precomp.hpp"
#include "mcwutil.hpp"
//#include "opencv2/highgui/highgui.hpp"
using namespace cv;
@@ -70,7 +71,7 @@ static inline int calcSize(int octave, int layer)
/* Wavelet size increment between layers. This should be an even number,
such that the wavelet sizes in an octave are either all even or all odd.
This ensures that when looking for the neighbours of a sample, the layers
This ensures that when looking for the neighbors of a sample, the layers
above and below are aligned correctly. */
const int HAAR_SIZE_INC = 6;
@@ -78,6 +79,11 @@ static inline int calcSize(int octave, int layer)
return (HAAR_SIZE0 + HAAR_SIZE_INC * layer) << octave;
}
namespace
{
const char* noImage2dOption = "-D DISABLE_IMAGE2D";
}
class SURF_OCL_Invoker
{
public:
@@ -87,7 +93,7 @@ public:
//void loadGlobalConstants(int maxCandidates, int maxFeatures, int img_rows, int img_cols, int nOctaveLayers, float hessianThreshold);
//void loadOctaveConstants(int octave, int layer_rows, int layer_cols);
// kernel callers declearations
// kernel callers declarations
void icvCalcLayerDetAndTrace_gpu(oclMat &det, oclMat &trace, int octave, int nOctaveLayers, int layer_rows);
void icvFindMaximaInLayer_gpu(const oclMat &det, const oclMat &trace, oclMat &maxPosBuffer, oclMat &maxCounter, int counterOffset,
@@ -99,14 +105,14 @@ public:
void icvCalcOrientation_gpu(const oclMat &keypoints, int nFeatures);
void compute_descriptors_gpu(const oclMat &descriptors, const oclMat &keypoints, int nFeatures);
// end of kernel callers declearations
// end of kernel callers declarations
SURF_OCL_Invoker(SURF_OCL &surf, const oclMat &img, const oclMat &mask) :
surf_(surf),
img_cols(img.cols), img_rows(img.rows),
use_mask(!mask.empty()),
imgTex(NULL), sumTex(NULL), maskSumTex(NULL)
use_mask(!mask.empty()), counters(oclMat()),
imgTex(NULL), sumTex(NULL), maskSumTex(NULL), _img(img)
{
CV_Assert(!img.empty() && img.type() == CV_8UC1);
CV_Assert(mask.empty() || (mask.size() == img.size() && mask.type() == CV_8UC1));
@@ -130,12 +136,13 @@ public:
counters.create(1, surf_.nOctaves + 1, CV_32SC1);
counters.setTo(Scalar::all(0));
//loadGlobalConstants(maxCandidates, maxFeatures, img_rows, img_cols, surf_.nOctaveLayers, static_cast<float>(surf_.hessianThreshold));
integral(img, surf_.sum);
if(support_image2d())
{
bindImgTex(img, imgTex);
integral(img, surf_.sum); // the two argumented integral version is incorrect
bindImgTex(surf_.sum, sumTex);
}
bindImgTex(surf_.sum, sumTex);
maskSumTex = 0;
if (use_mask)
@@ -154,7 +161,7 @@ public:
void detectKeypoints(oclMat &keypoints)
{
// create image pyramid buffers
// different layers have same sized buffers, but they are sampled from gaussin kernel.
// different layers have same sized buffers, but they are sampled from Gaussian kernel.
ensureSizeIsEnough(img_rows * (surf_.nOctaveLayers + 2), img_cols, CV_32FC1, surf_.det);
ensureSizeIsEnough(img_rows * (surf_.nOctaveLayers + 2), img_cols, CV_32FC1, surf_.trace);
@@ -221,7 +228,6 @@ public:
openCLFree(sumTex);
if(maskSumTex)
openCLFree(maskSumTex);
additioalParamBuffer.release();
}
private:
@@ -241,7 +247,7 @@ private:
cl_mem sumTex;
cl_mem maskSumTex;
oclMat additioalParamBuffer;
const oclMat _img; // make a copy for non-image2d_t supported platform
SURF_OCL_Invoker &operator= (const SURF_OCL_Invoker &right)
{
@@ -361,11 +367,6 @@ void cv::ocl::SURF_OCL::operator()(const oclMat &img, const oclMat &mask, oclMat
{
if (!img.empty())
{
if (img.clCxt->impl->devName.find("Intel(R) HD Graphics") != std::string::npos)
{
std::cout << " Intel HD GPU device unsupported " << std::endl;
return;
}
SURF_OCL_Invoker surf(*this, img, mask);
surf.detectKeypoints(keypoints);
@@ -377,11 +378,6 @@ void cv::ocl::SURF_OCL::operator()(const oclMat &img, const oclMat &mask, oclMat
{
if (!img.empty())
{
if (img.clCxt->impl->devName.find("Intel(R) HD Graphics") != std::string::npos)
{
std::cout << " Intel HD GPU device unsupported " << std::endl;
return;
}
SURF_OCL_Invoker surf(*this, img, mask);
if (!useProvidedKeypoints)
@@ -442,74 +438,11 @@ void cv::ocl::SURF_OCL::releaseMemory()
// bind source buffer to image oject.
void SURF_OCL_Invoker::bindImgTex(const oclMat &img, cl_mem &texture)
{
cl_image_format format;
int err;
int depth = img.depth();
int channels = img.channels();
switch(depth)
{
case CV_8U:
format.image_channel_data_type = CL_UNSIGNED_INT8;
break;
case CV_32S:
format.image_channel_data_type = CL_UNSIGNED_INT32;
break;
case CV_32F:
format.image_channel_data_type = CL_FLOAT;
break;
default:
throw std::exception();
break;
}
switch(channels)
{
case 1:
format.image_channel_order = CL_R;
break;
case 3:
format.image_channel_order = CL_RGB;
break;
case 4:
format.image_channel_order = CL_RGBA;
break;
default:
throw std::exception();
break;
}
if(texture)
{
openCLFree(texture);
}
#ifdef CL_VERSION_1_2
cl_image_desc desc;
desc.image_type = CL_MEM_OBJECT_IMAGE2D;
desc.image_width = img.step / img.elemSize();
desc.image_height = img.rows;
desc.image_depth = 0;
desc.image_array_size = 1;
desc.image_row_pitch = 0;
desc.image_slice_pitch = 0;
desc.buffer = NULL;
desc.num_mip_levels = 0;
desc.num_samples = 0;
texture = clCreateImage(Context::getContext()->impl->clContext, CL_MEM_READ_WRITE, &format, &desc, NULL, &err);
#else
texture = clCreateImage2D(
Context::getContext()->impl->clContext,
CL_MEM_READ_WRITE,
&format,
img.step / img.elemSize(),
img.rows,
0,
NULL,
&err);
#endif
size_t origin[] = { 0, 0, 0 };
size_t region[] = { img.step / img.elemSize(), img.rows, 1 };
clEnqueueCopyBufferToImage(img.clCxt->impl->clCmdQueue, (cl_mem)img.data, texture, 0, origin, region, 0, NULL, 0);
openCLSafeCall(err);
texture = bindTexture(img);
}
////////////////////////////
@@ -524,7 +457,14 @@ void SURF_OCL_Invoker::icvCalcLayerDetAndTrace_gpu(oclMat &det, oclMat &trace, i
std::string kernelName = "icvCalcLayerDetAndTrace";
std::vector< std::pair<size_t, const void *> > args;
args.push_back( std::make_pair( sizeof(cl_mem), (void *)&sumTex));
if(sumTex)
{
args.push_back( std::make_pair( sizeof(cl_mem), (void *)&sumTex));
}
else
{
args.push_back( std::make_pair( sizeof(cl_mem), (void *)&surf_.sum.data)); // if image2d is not supported
}
args.push_back( std::make_pair( sizeof(cl_mem), (void *)&det.data));
args.push_back( std::make_pair( sizeof(cl_mem), (void *)&trace.data));
args.push_back( std::make_pair( sizeof(cl_int), (void *)&det.step));
@@ -534,6 +474,7 @@ void SURF_OCL_Invoker::icvCalcLayerDetAndTrace_gpu(oclMat &det, oclMat &trace, i
args.push_back( std::make_pair( sizeof(cl_int), (void *)&nOctaveLayers));
args.push_back( std::make_pair( sizeof(cl_int), (void *)&octave));
args.push_back( std::make_pair( sizeof(cl_int), (void *)&c_layer_rows));
args.push_back( std::make_pair( sizeof(cl_int), (void *)&surf_.sum.step));
size_t localThreads[3] = {16, 16, 1};
size_t globalThreads[3] =
@@ -542,8 +483,15 @@ void SURF_OCL_Invoker::icvCalcLayerDetAndTrace_gpu(oclMat &det, oclMat &trace, i
divUp(max_samples_i, localThreads[1]) *localThreads[1] *(nOctaveLayers + 2),
1
};
if(support_image2d())
{
openCLExecuteKernel(clCxt, &nonfree_surf, kernelName, globalThreads, localThreads, args, -1, -1);
}
else
{
openCLExecuteKernel(clCxt, &nonfree_surf, kernelName, globalThreads, localThreads, args, -1, -1, noImage2dOption);
}
}
void SURF_OCL_Invoker::icvFindMaximaInLayer_gpu(const oclMat &det, const oclMat &trace, oclMat &maxPosBuffer, oclMat &maxCounter, int counterOffset,
int octave, bool use_mask, int nLayers, int layer_rows, int layer_cols)
@@ -572,17 +520,31 @@ void SURF_OCL_Invoker::icvFindMaximaInLayer_gpu(const oclMat &det, const oclMat
if(use_mask)
{
args.push_back( std::make_pair( sizeof(cl_mem), (void *)&maskSumTex));
if(maskSumTex)
{
args.push_back( std::make_pair( sizeof(cl_mem), (void *)&maskSumTex));
}
else
{
args.push_back( std::make_pair( sizeof(cl_mem), (void *)&surf_.maskSum.data));
}
args.push_back( std::make_pair( sizeof(cl_mem), (void *)&surf_.maskSum.step));
}
size_t localThreads[3] = {16, 16, 1};
size_t globalThreads[3] = {divUp(layer_cols - 2 * min_margin, localThreads[0] - 2) *localThreads[0],
divUp(layer_rows - 2 * min_margin, localThreads[1] - 2) *nLayers *localThreads[1],
1
};
if(support_image2d())
{
openCLExecuteKernel(clCxt, &nonfree_surf, kernelName, globalThreads, localThreads, args, -1, -1);
}
else
{
openCLExecuteKernel(clCxt, &nonfree_surf, kernelName, globalThreads, localThreads, args, -1, -1, noImage2dOption);
}
}
void SURF_OCL_Invoker::icvInterpolateKeypoint_gpu(const oclMat &det, const oclMat &maxPosBuffer, unsigned int maxCounter,
oclMat &keypoints, oclMat &counters, int octave, int layer_rows, int maxFeatures)
@@ -606,8 +568,15 @@ void SURF_OCL_Invoker::icvInterpolateKeypoint_gpu(const oclMat &det, const oclMa
size_t localThreads[3] = {3, 3, 3};
size_t globalThreads[3] = {maxCounter *localThreads[0], localThreads[1], 1};
if(support_image2d())
{
openCLExecuteKernel(clCxt, &nonfree_surf, kernelName, globalThreads, localThreads, args, -1, -1);
}
else
{
openCLExecuteKernel(clCxt, &nonfree_surf, kernelName, globalThreads, localThreads, args, -1, -1, noImage2dOption);
}
}
void SURF_OCL_Invoker::icvCalcOrientation_gpu(const oclMat &keypoints, int nFeatures)
{
@@ -616,17 +585,32 @@ void SURF_OCL_Invoker::icvCalcOrientation_gpu(const oclMat &keypoints, int nFeat
std::vector< std::pair<size_t, const void *> > args;
if(sumTex)
{
args.push_back( std::make_pair( sizeof(cl_mem), (void *)&sumTex));
}
else
{
args.push_back( std::make_pair( sizeof(cl_mem), (void *)&surf_.sum.data)); // if image2d is not supported
}
args.push_back( std::make_pair( sizeof(cl_mem), (void *)&keypoints.data));
args.push_back( std::make_pair( sizeof(cl_int), (void *)&keypoints.step));
args.push_back( std::make_pair( sizeof(cl_int), (void *)&img_rows));
args.push_back( std::make_pair( sizeof(cl_int), (void *)&img_cols));
args.push_back( std::make_pair( sizeof(cl_int), (void *)&surf_.sum.step));
size_t localThreads[3] = {32, 4, 1};
size_t globalThreads[3] = {nFeatures *localThreads[0], localThreads[1], 1};
if(support_image2d())
{
openCLExecuteKernel(clCxt, &nonfree_surf, kernelName, globalThreads, localThreads, args, -1, -1);
}
else
{
openCLExecuteKernel(clCxt, &nonfree_surf, kernelName, globalThreads, localThreads, args, -1, -1, noImage2dOption);
}
}
void SURF_OCL_Invoker::compute_descriptors_gpu(const oclMat &descriptors, const oclMat &keypoints, int nFeatures)
{
@@ -648,12 +632,29 @@ void SURF_OCL_Invoker::compute_descriptors_gpu(const oclMat &descriptors, const
globalThreads[1] = 16 * localThreads[1];
args.clear();
args.push_back( std::make_pair( sizeof(cl_mem), (void *)&imgTex));
if(imgTex)
{
args.push_back( std::make_pair( sizeof(cl_mem), (void *)&imgTex));
}
else
{
args.push_back( std::make_pair( sizeof(cl_mem), (void *)&_img.data));
}
args.push_back( std::make_pair( sizeof(cl_mem), (void *)&descriptors.data));
args.push_back( std::make_pair( sizeof(cl_mem), (void *)&keypoints.data));
args.push_back( std::make_pair( sizeof(cl_int), (void *)&descriptors.step));
args.push_back( std::make_pair( sizeof(cl_int), (void *)&keypoints.step));
openCLExecuteKernel(clCxt, &nonfree_surf, kernelName, globalThreads, localThreads, args, -1, -1);
args.push_back( std::make_pair( sizeof(cl_int), (void *)&_img.rows));
args.push_back( std::make_pair( sizeof(cl_int), (void *)&_img.cols));
args.push_back( std::make_pair( sizeof(cl_int), (void *)&_img.step));
if(support_image2d())
{
openCLExecuteKernel(clCxt, &nonfree_surf, kernelName, globalThreads, localThreads, args, -1, -1);
}
else
{
openCLExecuteKernel(clCxt, &nonfree_surf, kernelName, globalThreads, localThreads, args, -1, -1, noImage2dOption);
}
kernelName = "normalize_descriptors64";
@@ -666,9 +667,16 @@ void SURF_OCL_Invoker::compute_descriptors_gpu(const oclMat &descriptors, const
args.clear();
args.push_back( std::make_pair( sizeof(cl_mem), (void *)&descriptors.data));
args.push_back( std::make_pair( sizeof(cl_int), (void *)&descriptors.step));
if(support_image2d())
{
openCLExecuteKernel(clCxt, &nonfree_surf, kernelName, globalThreads, localThreads, args, -1, -1);
}
else
{
openCLExecuteKernel(clCxt, &nonfree_surf, kernelName, globalThreads, localThreads, args, -1, -1, noImage2dOption);
}
}
else
{
kernelName = "compute_descriptors128";
@@ -679,12 +687,29 @@ void SURF_OCL_Invoker::compute_descriptors_gpu(const oclMat &descriptors, const
globalThreads[1] = 16 * localThreads[1];
args.clear();
args.push_back( std::make_pair( sizeof(cl_mem), (void *)&imgTex));
if(imgTex)
{
args.push_back( std::make_pair( sizeof(cl_mem), (void *)&imgTex));
}
else
{
args.push_back( std::make_pair( sizeof(cl_mem), (void *)&_img.data));
}
args.push_back( std::make_pair( sizeof(cl_mem), (void *)&descriptors.data));
args.push_back( std::make_pair( sizeof(cl_mem), (void *)&keypoints.data));
args.push_back( std::make_pair( sizeof(cl_int), (void *)&descriptors.step));
args.push_back( std::make_pair( sizeof(cl_int), (void *)&keypoints.step));
openCLExecuteKernel(clCxt, &nonfree_surf, kernelName, globalThreads, localThreads, args, -1, -1);
args.push_back( std::make_pair( sizeof(cl_int), (void *)&_img.rows));
args.push_back( std::make_pair( sizeof(cl_int), (void *)&_img.cols));
args.push_back( std::make_pair( sizeof(cl_int), (void *)&_img.step));
if(support_image2d())
{
openCLExecuteKernel(clCxt, &nonfree_surf, kernelName, globalThreads, localThreads, args, -1, -1);
}
else
{
openCLExecuteKernel(clCxt, &nonfree_surf, kernelName, globalThreads, localThreads, args, -1, -1, noImage2dOption);
}
kernelName = "normalize_descriptors128";
@@ -697,7 +722,14 @@ void SURF_OCL_Invoker::compute_descriptors_gpu(const oclMat &descriptors, const
args.clear();
args.push_back( std::make_pair( sizeof(cl_mem), (void *)&descriptors.data));
args.push_back( std::make_pair( sizeof(cl_int), (void *)&descriptors.step));
openCLExecuteKernel(clCxt, &nonfree_surf, kernelName, globalThreads, localThreads, args, -1, -1);
if(support_image2d())
{
openCLExecuteKernel(clCxt, &nonfree_surf, kernelName, globalThreads, localThreads, args, -1, -1);
}
else
{
openCLExecuteKernel(clCxt, &nonfree_surf, kernelName, globalThreads, localThreads, args, -1, -1, noImage2dOption);
}
}
}