Merge pull request #1675 from krodyush:opencl-optimization
This commit is contained in:
commit
d11921f1a4
@ -111,6 +111,7 @@ namespace cv
|
||||
|
||||
bool haveDoubleSupport;
|
||||
bool isUnifiedMemory; // 1 means integrated GPU, otherwise this value is 0
|
||||
bool isIntelDevice;
|
||||
|
||||
std::string compilationExtraOptions;
|
||||
|
||||
@ -157,7 +158,8 @@ namespace cv
|
||||
{
|
||||
FEATURE_CL_DOUBLE = 1,
|
||||
FEATURE_CL_UNIFIED_MEM,
|
||||
FEATURE_CL_VER_1_2
|
||||
FEATURE_CL_VER_1_2,
|
||||
FEATURE_CL_INTEL_DEVICE
|
||||
};
|
||||
|
||||
// Represents OpenCL context, interface
|
||||
|
@ -448,6 +448,17 @@ static int initializeOpenCLDevices()
|
||||
{
|
||||
deviceInfo.info.haveDoubleSupport = false;
|
||||
}
|
||||
|
||||
size_t intel_platform = platformInfo.info.platformVendor.find("Intel");
|
||||
if(intel_platform != std::string::npos)
|
||||
{
|
||||
deviceInfo.info.compilationExtraOptions += " -D INTEL_DEVICE";
|
||||
deviceInfo.info.isIntelDevice = true;
|
||||
}
|
||||
else
|
||||
{
|
||||
deviceInfo.info.isIntelDevice = false;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -471,7 +482,7 @@ DeviceInfo::DeviceInfo()
|
||||
deviceVendorId(-1),
|
||||
maxWorkGroupSize(0), maxComputeUnits(0), localMemorySize(0), maxMemAllocSize(0),
|
||||
deviceVersionMajor(0), deviceVersionMinor(0),
|
||||
haveDoubleSupport(false), isUnifiedMemory(false),
|
||||
haveDoubleSupport(false), isUnifiedMemory(false),isIntelDevice(false),
|
||||
platform(NULL)
|
||||
{
|
||||
// nothing
|
||||
@ -572,6 +583,8 @@ bool ContextImpl::supportsFeature(FEATURE_TYPE featureType) const
|
||||
{
|
||||
switch (featureType)
|
||||
{
|
||||
case FEATURE_CL_INTEL_DEVICE:
|
||||
return deviceInfo.isIntelDevice;
|
||||
case FEATURE_CL_DOUBLE:
|
||||
return deviceInfo.haveDoubleSupport;
|
||||
case FEATURE_CL_UNIFIED_MEM:
|
||||
|
@ -849,16 +849,138 @@ CvSeq *cv::ocl::OclCascadeClassifier::oclHaarDetectObjects( oclMat &gimg, CvMemS
|
||||
args.push_back ( make_pair(sizeof(cl_int4) , (void *)&pq ));
|
||||
args.push_back ( make_pair(sizeof(cl_float) , (void *)&correction ));
|
||||
|
||||
const char * build_options = gcascade->is_stump_based ? "-D STUMP_BASED=1" : "-D STUMP_BASED=0";
|
||||
if(gcascade->is_stump_based && gsum.clCxt->supportsFeature(FEATURE_CL_INTEL_DEVICE))
|
||||
{
|
||||
//setup local group size
|
||||
localThreads[0] = 8;
|
||||
localThreads[1] = 16;
|
||||
localThreads[2] = 1;
|
||||
|
||||
openCLExecuteKernel(gsum.clCxt, &haarobjectdetect, "gpuRunHaarClassifierCascade", globalThreads, localThreads, args, -1, -1, build_options);
|
||||
//init maximal number of workgroups
|
||||
int WGNumX = 1+(sizev[0].width /(localThreads[0]));
|
||||
int WGNumY = 1+(sizev[0].height/(localThreads[1]));
|
||||
int WGNumZ = loopcount;
|
||||
int WGNum = 0; //accurate number of non -empty workgroups
|
||||
oclMat oclWGInfo(1,sizeof(cl_int4) * WGNumX*WGNumY*WGNumZ,CV_8U);
|
||||
{
|
||||
cl_int4* pWGInfo = (cl_int4*)clEnqueueMapBuffer(getClCommandQueue(oclWGInfo.clCxt),(cl_mem)oclWGInfo.datastart,true,CL_MAP_WRITE, 0, oclWGInfo.step, 0,0,0,&status);
|
||||
openCLVerifyCall(status);
|
||||
for(int z=0;z<WGNumZ;++z)
|
||||
{
|
||||
int Width = (scaleinfo[z].width_height >> 16)&0xFFFF;
|
||||
int Height = (scaleinfo[z].width_height >> 0 )& 0xFFFF;
|
||||
for(int y=0;y<WGNumY;++y)
|
||||
{
|
||||
int gy = y*localThreads[1];
|
||||
if(gy>=(Height-cascade->orig_window_size.height))
|
||||
continue; // no data to process
|
||||
for(int x=0;x<WGNumX;++x)
|
||||
{
|
||||
int gx = x*localThreads[0];
|
||||
if(gx>=(Width-cascade->orig_window_size.width))
|
||||
continue; // no data to process
|
||||
|
||||
openCLReadBuffer( gsum.clCxt, candidatebuffer, candidate, 4 * sizeof(int)*outputsz );
|
||||
// save no-empty workgroup info into array
|
||||
pWGInfo[WGNum].s[0] = scaleinfo[z].width_height;
|
||||
pWGInfo[WGNum].s[1] = (gx << 16) | gy;
|
||||
pWGInfo[WGNum].s[2] = scaleinfo[z].imgoff;
|
||||
memcpy(&(pWGInfo[WGNum].s[3]),&(scaleinfo[z].factor),sizeof(float));
|
||||
WGNum++;
|
||||
}
|
||||
}
|
||||
}
|
||||
openCLSafeCall(clEnqueueUnmapMemObject(getClCommandQueue(oclWGInfo.clCxt),(cl_mem)oclWGInfo.datastart,pWGInfo,0,0,0));
|
||||
pWGInfo = NULL;
|
||||
}
|
||||
|
||||
for(int i = 0; i < outputsz; i++)
|
||||
if(candidate[4 * i + 2] != 0)
|
||||
allCandidates.push_back(Rect(candidate[4 * i], candidate[4 * i + 1],
|
||||
candidate[4 * i + 2], candidate[4 * i + 3]));
|
||||
// setup global sizes to have linear array of workgroups with WGNum size
|
||||
globalThreads[0] = localThreads[0]*WGNum;
|
||||
globalThreads[1] = localThreads[1];
|
||||
globalThreads[2] = 1;
|
||||
|
||||
#define NODE_SIZE 12
|
||||
// pack node info to have less memory loads
|
||||
oclMat oclNodesPK(1,sizeof(cl_int) * NODE_SIZE * nodenum,CV_8U);
|
||||
{
|
||||
cl_int status;
|
||||
cl_int* pNodesPK = (cl_int*)clEnqueueMapBuffer(getClCommandQueue(oclNodesPK.clCxt),(cl_mem)oclNodesPK.datastart,true,CL_MAP_WRITE, 0, oclNodesPK.step, 0,0,0,&status);
|
||||
openCLVerifyCall(status);
|
||||
//use known local data stride to precalulate indexes
|
||||
int DATA_SIZE_X = (localThreads[0]+cascade->orig_window_size.width);
|
||||
// check that maximal value is less than maximal unsigned short
|
||||
assert(DATA_SIZE_X*cascade->orig_window_size.height+cascade->orig_window_size.width < USHRT_MAX);
|
||||
for(int i = 0;i<nodenum;++i)
|
||||
{//process each node from classifier
|
||||
struct NodePK
|
||||
{
|
||||
unsigned short slm_index[3][4];
|
||||
float weight[3];
|
||||
float threshold;
|
||||
float alpha[2];
|
||||
};
|
||||
struct NodePK * pOut = (struct NodePK *)(pNodesPK + NODE_SIZE*i);
|
||||
for(int k=0;k<3;++k)
|
||||
{// calc 4 short indexes in shared local mem for each rectangle instead of 2 (x,y) pair.
|
||||
int* p = &(node[i].p[k][0]);
|
||||
pOut->slm_index[k][0] = (unsigned short)(p[1]*DATA_SIZE_X+p[0]);
|
||||
pOut->slm_index[k][1] = (unsigned short)(p[1]*DATA_SIZE_X+p[2]);
|
||||
pOut->slm_index[k][2] = (unsigned short)(p[3]*DATA_SIZE_X+p[0]);
|
||||
pOut->slm_index[k][3] = (unsigned short)(p[3]*DATA_SIZE_X+p[2]);
|
||||
}
|
||||
//store used float point values for each node
|
||||
pOut->weight[0] = node[i].weight[0];
|
||||
pOut->weight[1] = node[i].weight[1];
|
||||
pOut->weight[2] = node[i].weight[2];
|
||||
pOut->threshold = node[i].threshold;
|
||||
pOut->alpha[0] = node[i].alpha[0];
|
||||
pOut->alpha[1] = node[i].alpha[1];
|
||||
}
|
||||
openCLSafeCall(clEnqueueUnmapMemObject(getClCommandQueue(oclNodesPK.clCxt),(cl_mem)oclNodesPK.datastart,pNodesPK,0,0,0));
|
||||
pNodesPK = NULL;
|
||||
}
|
||||
// add 2 additional buffers (WGinfo and packed nodes) as 2 last args
|
||||
args.push_back ( make_pair(sizeof(cl_mem) , (void *)&oclNodesPK.datastart ));
|
||||
args.push_back ( make_pair(sizeof(cl_mem) , (void *)&oclWGInfo.datastart ));
|
||||
|
||||
//form build options for kernel
|
||||
string options = "-D PACKED_CLASSIFIER";
|
||||
options += format(" -D NODE_SIZE=%d",NODE_SIZE);
|
||||
options += format(" -D WND_SIZE_X=%d",cascade->orig_window_size.width);
|
||||
options += format(" -D WND_SIZE_Y=%d",cascade->orig_window_size.height);
|
||||
options += format(" -D STUMP_BASED=%d",gcascade->is_stump_based);
|
||||
options += format(" -D LSx=%d",localThreads[0]);
|
||||
options += format(" -D LSy=%d",localThreads[1]);
|
||||
options += format(" -D SPLITNODE=%d",splitnode);
|
||||
options += format(" -D SPLITSTAGE=%d",splitstage);
|
||||
options += format(" -D OUTPUTSZ=%d",outputsz);
|
||||
|
||||
// init candiate global count by 0
|
||||
int pattern = 0;
|
||||
openCLSafeCall(clEnqueueWriteBuffer(qu, candidatebuffer, 1, 0, 1 * sizeof(pattern),&pattern, 0, NULL, NULL));
|
||||
// execute face detector
|
||||
openCLExecuteKernel(gsum.clCxt, &haarobjectdetect, "gpuRunHaarClassifierCascadePacked", globalThreads, localThreads, args, -1, -1, options.c_str());
|
||||
//read candidate buffer back and put it into host list
|
||||
openCLReadBuffer( gsum.clCxt, candidatebuffer, candidate, 4 * sizeof(int)*outputsz );
|
||||
assert(candidate[0]<outputsz);
|
||||
//printf("candidate[0]=%d\n",candidate[0]);
|
||||
for(int i = 1; i <= candidate[0]; i++)
|
||||
{
|
||||
allCandidates.push_back(Rect(candidate[4 * i], candidate[4 * i + 1],candidate[4 * i + 2], candidate[4 * i + 3]));
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
const char * build_options = gcascade->is_stump_based ? "-D STUMP_BASED=1" : "-D STUMP_BASED=0";
|
||||
|
||||
openCLExecuteKernel(gsum.clCxt, &haarobjectdetect, "gpuRunHaarClassifierCascade", globalThreads, localThreads, args, -1, -1, build_options);
|
||||
|
||||
openCLReadBuffer( gsum.clCxt, candidatebuffer, candidate, 4 * sizeof(int)*outputsz );
|
||||
|
||||
for(int i = 0; i < outputsz; i++)
|
||||
if(candidate[4 * i + 2] != 0)
|
||||
allCandidates.push_back(Rect(candidate[4 * i], candidate[4 * i + 1],
|
||||
candidate[4 * i + 2], candidate[4 * i + 3]));
|
||||
}
|
||||
|
||||
free(scaleinfo);
|
||||
free(candidate);
|
||||
|
@ -876,8 +876,60 @@ namespace cv
|
||||
|
||||
if (ksize > 0)
|
||||
{
|
||||
Sobel(src, Dx, CV_32F, 1, 0, ksize, scale, 0, borderType);
|
||||
Sobel(src, Dy, CV_32F, 0, 1, ksize, scale, 0, borderType);
|
||||
Context* clCxt = Context::getContext();
|
||||
if(clCxt->supportsFeature(FEATURE_CL_INTEL_DEVICE) && src.type() == CV_8UC1 &&
|
||||
src.cols % 8 == 0 && src.rows % 8 == 0 &&
|
||||
ksize==3 &&
|
||||
(borderType ==cv::BORDER_REFLECT ||
|
||||
borderType == cv::BORDER_REPLICATE ||
|
||||
borderType ==cv::BORDER_REFLECT101 ||
|
||||
borderType ==cv::BORDER_WRAP))
|
||||
{
|
||||
Dx.create(src.size(), CV_32FC1);
|
||||
Dy.create(src.size(), CV_32FC1);
|
||||
|
||||
const unsigned int block_x = 8;
|
||||
const unsigned int block_y = 8;
|
||||
|
||||
unsigned int src_pitch = src.step;
|
||||
unsigned int dst_pitch = Dx.cols;
|
||||
|
||||
float _scale = scale;
|
||||
|
||||
std::vector<std::pair<size_t , const void *> > args;
|
||||
args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&src.data ));
|
||||
args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&Dx.data ));
|
||||
args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&Dy.data ));
|
||||
args.push_back( std::make_pair( sizeof(cl_int) , (void *)&src.cols ));
|
||||
args.push_back( std::make_pair( sizeof(cl_int) , (void *)&src.rows ));
|
||||
args.push_back( std::make_pair( sizeof(cl_uint) , (void *)&src_pitch ));
|
||||
args.push_back( std::make_pair( sizeof(cl_uint) , (void *)&dst_pitch ));
|
||||
args.push_back( std::make_pair( sizeof(cl_float) , (void *)&_scale ));
|
||||
size_t gt2[3] = {src.cols, src.rows, 1}, lt2[3] = {block_x, block_y, 1};
|
||||
|
||||
string option = "-D BLK_X=8 -D BLK_Y=8";
|
||||
switch(borderType)
|
||||
{
|
||||
case cv::BORDER_REPLICATE:
|
||||
option += " -D BORDER_REPLICATE";
|
||||
break;
|
||||
case cv::BORDER_REFLECT:
|
||||
option += " -D BORDER_REFLECT";
|
||||
break;
|
||||
case cv::BORDER_REFLECT101:
|
||||
option += " -D BORDER_REFLECT101";
|
||||
break;
|
||||
case cv::BORDER_WRAP:
|
||||
option += " -D BORDER_WRAP";
|
||||
break;
|
||||
}
|
||||
openCLExecuteKernel(src.clCxt, &imgproc_sobel3, "sobel3", gt2, lt2, args, -1, -1, option.c_str() );
|
||||
}
|
||||
else
|
||||
{
|
||||
Sobel(src, Dx, CV_32F, 1, 0, ksize, scale, 0, borderType);
|
||||
Sobel(src, Dy, CV_32F, 0, 1, ksize, scale, 0, borderType);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
|
@ -101,6 +101,144 @@ typedef struct __attribute__((aligned (64))) GpuHidHaarClassifierCascade
|
||||
float inv_window_area __attribute__((aligned (4)));
|
||||
} GpuHidHaarClassifierCascade;
|
||||
|
||||
|
||||
#ifdef PACKED_CLASSIFIER
|
||||
// this code is scalar, one pixel -> one workitem
|
||||
__kernel void gpuRunHaarClassifierCascadePacked(
|
||||
global const GpuHidHaarStageClassifier * stagecascadeptr,
|
||||
global const int4 * info,
|
||||
global const GpuHidHaarTreeNode * nodeptr,
|
||||
global const int * restrict sum,
|
||||
global const float * restrict sqsum,
|
||||
volatile global int4 * candidate,
|
||||
const int pixelstep,
|
||||
const int loopcount,
|
||||
const int start_stage,
|
||||
const int split_stage,
|
||||
const int end_stage,
|
||||
const int startnode,
|
||||
const int splitnode,
|
||||
const int4 p,
|
||||
const int4 pq,
|
||||
const float correction,
|
||||
global const int* pNodesPK,
|
||||
global const int4* pWGInfo
|
||||
)
|
||||
|
||||
{
|
||||
// this version used information provided for each workgroup
|
||||
// no empty WG
|
||||
int gid = (int)get_group_id(0);
|
||||
int lid_x = (int)get_local_id(0);
|
||||
int lid_y = (int)get_local_id(1);
|
||||
int lid = lid_y*LSx+lid_x;
|
||||
int4 WGInfo = pWGInfo[gid];
|
||||
int GroupX = (WGInfo.y >> 16)&0xFFFF;
|
||||
int GroupY = (WGInfo.y >> 0 )& 0xFFFF;
|
||||
int Width = (WGInfo.x >> 16)&0xFFFF;
|
||||
int Height = (WGInfo.x >> 0 )& 0xFFFF;
|
||||
int ImgOffset = WGInfo.z;
|
||||
float ScaleFactor = as_float(WGInfo.w);
|
||||
|
||||
#define DATA_SIZE_X (LSx+WND_SIZE_X)
|
||||
#define DATA_SIZE_Y (LSy+WND_SIZE_Y)
|
||||
#define DATA_SIZE (DATA_SIZE_X*DATA_SIZE_Y)
|
||||
|
||||
local int SumL[DATA_SIZE];
|
||||
|
||||
// read input data window into local mem
|
||||
for(int i = 0; i<DATA_SIZE; i+=(LSx*LSy))
|
||||
{
|
||||
int index = i+lid; // index in shared local memory
|
||||
if(index<DATA_SIZE)
|
||||
{// calc global x,y coordinat and read data from there
|
||||
int x = min(GroupX + (index % (DATA_SIZE_X)),Width-1);
|
||||
int y = min(GroupY + (index / (DATA_SIZE_X)),Height-1);
|
||||
SumL[index] = sum[ImgOffset+y*pixelstep+x];
|
||||
}
|
||||
}
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
// calc variance_norm_factor for all stages
|
||||
float variance_norm_factor;
|
||||
int nodecounter= startnode;
|
||||
int4 info1 = p;
|
||||
int4 info2 = pq;
|
||||
|
||||
{
|
||||
int xl = lid_x;
|
||||
int yl = lid_y;
|
||||
int OffsetLocal = yl * DATA_SIZE_X + xl;
|
||||
int OffsetGlobal = (GroupY+yl)* pixelstep + (GroupX+xl);
|
||||
|
||||
// add shift to get position on scaled image
|
||||
OffsetGlobal += ImgOffset;
|
||||
|
||||
float mean =
|
||||
SumL[info1.y*DATA_SIZE_X+info1.x+OffsetLocal] -
|
||||
SumL[info1.y*DATA_SIZE_X+info1.z+OffsetLocal] -
|
||||
SumL[info1.w*DATA_SIZE_X+info1.x+OffsetLocal] +
|
||||
SumL[info1.w*DATA_SIZE_X+info1.z+OffsetLocal];
|
||||
float sq =
|
||||
sqsum[info2.y*pixelstep+info2.x+OffsetGlobal] -
|
||||
sqsum[info2.y*pixelstep+info2.z+OffsetGlobal] -
|
||||
sqsum[info2.w*pixelstep+info2.x+OffsetGlobal] +
|
||||
sqsum[info2.w*pixelstep+info2.z+OffsetGlobal];
|
||||
|
||||
mean *= correction;
|
||||
sq *= correction;
|
||||
|
||||
variance_norm_factor = sq - mean * mean;
|
||||
variance_norm_factor = (variance_norm_factor >=0.f) ? sqrt(variance_norm_factor) : 1.f;
|
||||
}// end calc variance_norm_factor for all stages
|
||||
|
||||
int result = (1.0f>0.0f);
|
||||
for(int stageloop = start_stage; (stageloop < end_stage) && result; stageloop++ )
|
||||
{// iterate until candidate is exist
|
||||
float stage_sum = 0.0f;
|
||||
int2 stageinfo = *(global int2*)(stagecascadeptr+stageloop);
|
||||
float stagethreshold = as_float(stageinfo.y);
|
||||
int lcl_off = (lid_y*DATA_SIZE_X)+(lid_x);
|
||||
for(int nodeloop = 0; nodeloop < stageinfo.x; nodecounter++,nodeloop++ )
|
||||
{
|
||||
// simple macro to extract shorts from int
|
||||
#define M0(_t) ((_t)&0xFFFF)
|
||||
#define M1(_t) (((_t)>>16)&0xFFFF)
|
||||
// load packed node data from global memory (L3) into registers
|
||||
global const int4* pN = (__global int4*)(pNodesPK+nodecounter*NODE_SIZE);
|
||||
int4 n0 = pN[0];
|
||||
int4 n1 = pN[1];
|
||||
int4 n2 = pN[2];
|
||||
float nodethreshold = as_float(n2.y) * variance_norm_factor;
|
||||
// calc sum of intensity pixels according to node information
|
||||
float classsum =
|
||||
(SumL[M0(n0.x)+lcl_off] - SumL[M1(n0.x)+lcl_off] - SumL[M0(n0.y)+lcl_off] + SumL[M1(n0.y)+lcl_off]) * as_float(n1.z) +
|
||||
(SumL[M0(n0.z)+lcl_off] - SumL[M1(n0.z)+lcl_off] - SumL[M0(n0.w)+lcl_off] + SumL[M1(n0.w)+lcl_off]) * as_float(n1.w) +
|
||||
(SumL[M0(n1.x)+lcl_off] - SumL[M1(n1.x)+lcl_off] - SumL[M0(n1.y)+lcl_off] + SumL[M1(n1.y)+lcl_off]) * as_float(n2.x);
|
||||
//accumulate stage responce
|
||||
stage_sum += (classsum >= nodethreshold) ? as_float(n2.w) : as_float(n2.z);
|
||||
}
|
||||
result = (stage_sum >= stagethreshold);
|
||||
}// next stage if needed
|
||||
|
||||
if(result)
|
||||
{// all stages will be passed and there is a detected face on the tested position
|
||||
int index = 1+atomic_inc((volatile global int*)candidate); //get index to write global data with face info
|
||||
if(index<OUTPUTSZ)
|
||||
{
|
||||
int x = GroupX+lid_x;
|
||||
int y = GroupY+lid_y;
|
||||
int4 candidate_result;
|
||||
candidate_result.x = convert_int_rtn(x*ScaleFactor);
|
||||
candidate_result.y = convert_int_rtn(y*ScaleFactor);
|
||||
candidate_result.z = convert_int_rtn(ScaleFactor*WND_SIZE_X);
|
||||
candidate_result.w = convert_int_rtn(ScaleFactor*WND_SIZE_Y);
|
||||
candidate[index] = candidate_result;
|
||||
}
|
||||
}
|
||||
}//end gpuRunHaarClassifierCascade
|
||||
#else
|
||||
|
||||
__kernel void __attribute__((reqd_work_group_size(8,8,1)))gpuRunHaarClassifierCascade(
|
||||
global GpuHidHaarStageClassifier * stagecascadeptr,
|
||||
global int4 * info,
|
||||
@ -421,3 +559,4 @@ __kernel void __attribute__((reqd_work_group_size(8,8,1)))gpuRunHaarClassifierCa
|
||||
}//end for(int grploop=grpidx;grploop<totalgrp;grploop+=grpnumx)
|
||||
}//end for(int scalei = 0; scalei <loopcount; scalei++)
|
||||
}
|
||||
#endif
|
||||
|
108
modules/ocl/src/opencl/imgproc_sobel3.cl
Normal file
108
modules/ocl/src/opencl/imgproc_sobel3.cl
Normal file
@ -0,0 +1,108 @@
|
||||
///////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
/////////////////////////////////Macro for border type////////////////////////////////////////////
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
#ifdef BORDER_REPLICATE
|
||||
//BORDER_REPLICATE: aaaaaa|abcdefgh|hhhhhhh
|
||||
#define ADDR_L(i, l_edge, r_edge) ((i) < (l_edge) ? (l_edge) : (i))
|
||||
#define ADDR_R(i, r_edge, addr) ((i) >= (r_edge) ? (r_edge)-1 : (addr))
|
||||
#define ADDR_H(i, t_edge, b_edge) ((i) < (t_edge) ? (t_edge) :(i))
|
||||
#define ADDR_B(i, b_edge, addr) ((i) >= (b_edge) ? (b_edge)-1 :(addr))
|
||||
#endif
|
||||
|
||||
#ifdef BORDER_REFLECT
|
||||
//BORDER_REFLECT: fedcba|abcdefgh|hgfedcb
|
||||
#define ADDR_L(i, l_edge, r_edge) ((i) < (l_edge) ? -(i)-1 : (i))
|
||||
#define ADDR_R(i, r_edge, addr) ((i) >= (r_edge) ? -(i)-1+((r_edge)<<1) : (addr))
|
||||
#define ADDR_H(i, t_edge, b_edge) ((i) < (t_edge) ? -(i)-1 : (i))
|
||||
#define ADDR_B(i, b_edge, addr) ((i) >= (b_edge) ? -(i)-1+((b_edge)<<1) : (addr))
|
||||
#endif
|
||||
|
||||
#ifdef BORDER_REFLECT101
|
||||
//BORDER_REFLECT101: gfedcb|abcdefgh|gfedcba
|
||||
#define ADDR_L(i, l_edge, r_edge) ((i) < (l_edge) ? -(i) : (i))
|
||||
#define ADDR_R(i, r_edge, addr) ((i) >= (r_edge) ? -(i)-2+((r_edge)<<1) : (addr))
|
||||
#define ADDR_H(i, t_edge, b_edge) ((i) < (t_edge) ? -(i) : (i))
|
||||
#define ADDR_B(i, b_edge, addr) ((i) >= (b_edge) ? -(i)-2+((b_edge)<<1) : (addr))
|
||||
#endif
|
||||
|
||||
#ifdef BORDER_WRAP
|
||||
//BORDER_WRAP: cdefgh|abcdefgh|abcdefg
|
||||
#define ADDR_L(i, l_edge, r_edge) ((i) < (l_edge) ? (i)+(r_edge) : (i))
|
||||
#define ADDR_R(i, r_edge, addr) ((i) >= (r_edge) ? (i)-(r_edge) : (addr))
|
||||
#define ADDR_H(i, t_edge, b_edge) ((i) < (t_edge) ? (i)+(b_edge) : (i))
|
||||
#define ADDR_B(i, b_edge, addr) ((i) >= (b_edge) ? (i)-(b_edge) : (addr))
|
||||
#endif
|
||||
|
||||
__kernel void sobel3(
|
||||
__global uchar* Src,
|
||||
__global float* DstX,
|
||||
__global float* DstY,
|
||||
int width, int height,
|
||||
uint srcStride, uint dstStride,
|
||||
float scale
|
||||
)
|
||||
{
|
||||
__local float lsmem[BLK_Y+2][BLK_X+2];
|
||||
|
||||
int lix = get_local_id(0);
|
||||
int liy = get_local_id(1);
|
||||
|
||||
int gix = get_group_id(0);
|
||||
int giy = get_group_id(1);
|
||||
|
||||
int id_x = get_global_id(0);
|
||||
int id_y = get_global_id(1);
|
||||
|
||||
lsmem[liy+1][lix+1] = convert_float(Src[ id_y * srcStride + id_x ]);
|
||||
|
||||
int id_y_h = ADDR_H(id_y-1, 0,height);
|
||||
int id_y_b = ADDR_B(id_y+1, height,id_y+1);
|
||||
|
||||
int id_x_l = ADDR_L(id_x-1, 0,width);
|
||||
int id_x_r = ADDR_R(id_x+1, width,id_x+1);
|
||||
|
||||
if(liy==0)
|
||||
{
|
||||
lsmem[0][lix+1]=convert_float(Src[ id_y_h * srcStride + id_x ]);
|
||||
|
||||
if(lix==0)
|
||||
lsmem[0][0]=convert_float(Src[ id_y_h * srcStride + id_x_l ]);
|
||||
else if(lix==BLK_X-1)
|
||||
lsmem[0][BLK_X+1]=convert_float(Src[ id_y_h * srcStride + id_x_r ]);
|
||||
}
|
||||
else if(liy==BLK_Y-1)
|
||||
{
|
||||
lsmem[BLK_Y+1][lix+1]=convert_float(Src[ id_y_b * srcStride + id_x ]);
|
||||
|
||||
if(lix==0)
|
||||
lsmem[BLK_Y+1][0]=convert_float(Src[ id_y_b * srcStride + id_x_l ]);
|
||||
else if(lix==BLK_X-1)
|
||||
lsmem[BLK_Y+1][BLK_X+1]=convert_float(Src[ id_y_b * srcStride + id_x_r ]);
|
||||
}
|
||||
|
||||
if(lix==0)
|
||||
lsmem[liy+1][0] = convert_float(Src[ id_y * srcStride + id_x_l ]);
|
||||
else if(lix==BLK_X-1)
|
||||
lsmem[liy+1][BLK_X+1] = convert_float(Src[ id_y * srcStride + id_x_r ]);
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
float u1 = lsmem[liy][lix];
|
||||
float u2 = lsmem[liy][lix+1];
|
||||
float u3 = lsmem[liy][lix+2];
|
||||
|
||||
float m1 = lsmem[liy+1][lix];
|
||||
float m2 = lsmem[liy+1][lix+1];
|
||||
float m3 = lsmem[liy+1][lix+2];
|
||||
|
||||
float b1 = lsmem[liy+2][lix];
|
||||
float b2 = lsmem[liy+2][lix+1];
|
||||
float b3 = lsmem[liy+2][lix+2];
|
||||
|
||||
//m2 * scale;//
|
||||
float dx = mad(2.0f, m3 - m1, u3 - u1 + b3 - b1 );
|
||||
DstX[ id_y * dstStride + id_x ] = dx * scale;
|
||||
|
||||
float dy = mad(2.0f, b2 - u2, b1 - u1 + b3 - u3);
|
||||
DstY[ id_y * dstStride + id_x ] = dy * scale;
|
||||
}
|
Loading…
Reference in New Issue
Block a user