Merge pull request #1626 from ilya-lavrenov:ocl_filters
This commit is contained in:
@@ -197,10 +197,10 @@ static void GPUErode(const oclMat &src, oclMat &dst, oclMat &mat_kernel,
|
||||
(src.rows == dst.rows));
|
||||
CV_Assert((src.oclchannels() == dst.oclchannels()));
|
||||
|
||||
int srcStep = src.step1() / src.oclchannels();
|
||||
int dstStep = dst.step1() / dst.oclchannels();
|
||||
int srcOffset = src.offset / src.elemSize();
|
||||
int dstOffset = dst.offset / dst.elemSize();
|
||||
int srcStep = src.step / src.elemSize();
|
||||
int dstStep = dst.step / dst.elemSize();
|
||||
int srcOffset = src.offset / src.elemSize();
|
||||
int dstOffset = dst.offset / dst.elemSize();
|
||||
|
||||
int srcOffset_x = srcOffset % srcStep;
|
||||
int srcOffset_y = srcOffset / srcStep;
|
||||
@@ -247,6 +247,7 @@ static void GPUErode(const oclMat &src, oclMat &dst, oclMat &mat_kernel,
|
||||
sprintf(compile_option, "-D RADIUSX=%d -D RADIUSY=%d -D LSIZE0=%d -D LSIZE1=%d -D ERODE %s %s",
|
||||
anchor.x, anchor.y, (int)localThreads[0], (int)localThreads[1],
|
||||
s, rectKernel?"-D RECTKERNEL":"");
|
||||
|
||||
vector< pair<size_t, const void *> > args;
|
||||
args.push_back(make_pair(sizeof(cl_mem), (void *)&src.data));
|
||||
args.push_back(make_pair(sizeof(cl_mem), (void *)&dst.data));
|
||||
@@ -260,6 +261,7 @@ static void GPUErode(const oclMat &src, oclMat &dst, oclMat &mat_kernel,
|
||||
args.push_back(make_pair(sizeof(cl_int), (void *)&src.wholecols));
|
||||
args.push_back(make_pair(sizeof(cl_int), (void *)&src.wholerows));
|
||||
args.push_back(make_pair(sizeof(cl_int), (void *)&dstOffset));
|
||||
|
||||
openCLExecuteKernel(clCxt, &filtering_morph, kernelName, globalThreads, localThreads, args, -1, -1, compile_option);
|
||||
}
|
||||
|
||||
@@ -351,7 +353,7 @@ Ptr<BaseFilter_GPU> cv::ocl::getMorphologyFilter_GPU(int op, int type, const Mat
|
||||
};
|
||||
|
||||
CV_Assert(op == MORPH_ERODE || op == MORPH_DILATE);
|
||||
CV_Assert(type == CV_8UC1 || type == CV_8UC3 || type == CV_8UC4 || type == CV_32FC1 || type == CV_32FC1 || type == CV_32FC4);
|
||||
CV_Assert(type == CV_8UC1 || type == CV_8UC3 || type == CV_8UC4 || type == CV_32FC1 || type == CV_32FC3 || type == CV_32FC4);
|
||||
|
||||
oclMat gpu_krnl;
|
||||
normalizeKernel(kernel, gpu_krnl);
|
||||
@@ -361,9 +363,11 @@ Ptr<BaseFilter_GPU> cv::ocl::getMorphologyFilter_GPU(int op, int type, const Mat
|
||||
for(int i = 0; i < kernel.rows * kernel.cols; ++i)
|
||||
if(kernel.data[i] != 1)
|
||||
noZero = false;
|
||||
MorphFilter_GPU* mfgpu=new MorphFilter_GPU(ksize, anchor, gpu_krnl, GPUMorfFilter_callers[op][CV_MAT_CN(type)]);
|
||||
|
||||
MorphFilter_GPU* mfgpu = new MorphFilter_GPU(ksize, anchor, gpu_krnl, GPUMorfFilter_callers[op][CV_MAT_CN(type)]);
|
||||
if(noZero)
|
||||
mfgpu->rectKernel = true;
|
||||
|
||||
return Ptr<BaseFilter_GPU>(mfgpu);
|
||||
}
|
||||
|
||||
@@ -445,9 +449,7 @@ void morphOp(int op, const oclMat &src, oclMat &dst, const Mat &_kernel, Point a
|
||||
iterations = 1;
|
||||
}
|
||||
else
|
||||
{
|
||||
kernel = _kernel;
|
||||
}
|
||||
|
||||
Ptr<FilterEngine_GPU> f = createMorphologyFilter_GPU(op, src.type(), kernel, anchor, iterations);
|
||||
|
||||
@@ -462,14 +464,10 @@ void cv::ocl::erode(const oclMat &src, oclMat &dst, const Mat &kernel, Point anc
|
||||
|
||||
for (int i = 0; i < kernel.rows * kernel.cols; ++i)
|
||||
if (kernel.data[i] != 0)
|
||||
{
|
||||
allZero = false;
|
||||
}
|
||||
|
||||
if (allZero)
|
||||
{
|
||||
kernel.data[0] = 1;
|
||||
}
|
||||
|
||||
morphOp(MORPH_ERODE, src, dst, kernel, anchor, iterations, borderType, borderValue);
|
||||
}
|
||||
@@ -558,7 +556,7 @@ static void GPUFilter2D(const oclMat &src, oclMat &dst, const oclMat &mat_kernel
|
||||
Context *clCxt = src.clCxt;
|
||||
|
||||
int filterWidth = ksize.width;
|
||||
bool ksize_3x3 = filterWidth == 3 && src.type() != CV_32FC4; // CV_32FC4 is not tuned up with filter2d_3x3 kernel
|
||||
bool ksize_3x3 = filterWidth == 3 && src.type() != CV_32FC4 && src.type() != CV_32FC3; // CV_32FC4 is not tuned up with filter2d_3x3 kernel
|
||||
|
||||
string kernelName = ksize_3x3 ? "filter2D_3x3" : "filter2D";
|
||||
|
||||
@@ -649,9 +647,7 @@ Ptr<BaseFilter_GPU> cv::ocl::getLinearFilter_GPU(int srcType, int dstType, const
|
||||
Ptr<FilterEngine_GPU> cv::ocl::createLinearFilter_GPU(int srcType, int dstType, const Mat &kernel, const Point &anchor,
|
||||
int borderType)
|
||||
{
|
||||
|
||||
Size ksize = kernel.size();
|
||||
|
||||
Ptr<BaseFilter_GPU> linearFilter = getLinearFilter_GPU(srcType, dstType, kernel, ksize, anchor, borderType);
|
||||
|
||||
return createFilter2D_GPU(linearFilter);
|
||||
@@ -659,11 +655,8 @@ Ptr<FilterEngine_GPU> cv::ocl::createLinearFilter_GPU(int srcType, int dstType,
|
||||
|
||||
void cv::ocl::filter2D(const oclMat &src, oclMat &dst, int ddepth, const Mat &kernel, Point anchor, int borderType)
|
||||
{
|
||||
|
||||
if (ddepth < 0)
|
||||
{
|
||||
ddepth = src.depth();
|
||||
}
|
||||
|
||||
dst.create(src.size(), CV_MAKETYPE(ddepth, src.channels()));
|
||||
|
||||
@@ -1444,9 +1437,7 @@ Ptr<FilterEngine_GPU> cv::ocl::createGaussianFilter_GPU(int type, Size ksize, do
|
||||
int depth = CV_MAT_DEPTH(type);
|
||||
|
||||
if (sigma2 <= 0)
|
||||
{
|
||||
sigma2 = sigma1;
|
||||
}
|
||||
|
||||
// automatic detection of kernel size from sigma
|
||||
if (ksize.width <= 0 && sigma1 > 0)
|
||||
|
||||
@@ -408,20 +408,11 @@ namespace cv
|
||||
void medianFilter(const oclMat &src, oclMat &dst, int m)
|
||||
{
|
||||
CV_Assert( m % 2 == 1 && m > 1 );
|
||||
CV_Assert( m <= 5 || src.depth() == CV_8U );
|
||||
CV_Assert( src.cols <= dst.cols && src.rows <= dst.rows );
|
||||
CV_Assert( (src.depth() == CV_8U || src.depth() == CV_32F) && (src.channels() == 1 || src.channels() == 4));
|
||||
dst.create(src.size(), src.type());
|
||||
|
||||
if (src.data == dst.data)
|
||||
{
|
||||
oclMat src1;
|
||||
src.copyTo(src1);
|
||||
return medianFilter(src1, dst, m);
|
||||
}
|
||||
|
||||
int srcStep = src.step1() / src.oclchannels();
|
||||
int dstStep = dst.step1() / dst.oclchannels();
|
||||
int srcOffset = src.offset / src.oclchannels() / src.elemSize1();
|
||||
int dstOffset = dst.offset / dst.oclchannels() / dst.elemSize1();
|
||||
int srcStep = src.step / src.elemSize(), dstStep = dst.step / dst.elemSize();
|
||||
int srcOffset = src.offset / src.elemSize(), dstOffset = dst.offset / dst.elemSize();
|
||||
|
||||
Context *clCxt = src.clCxt;
|
||||
|
||||
@@ -1518,6 +1509,7 @@ namespace cv
|
||||
float *color_weight = &_color_weight[0];
|
||||
float *space_weight = &_space_weight[0];
|
||||
int *space_ofs = &_space_ofs[0];
|
||||
|
||||
int dst_step_in_pixel = dst.step / dst.elemSize();
|
||||
int dst_offset_in_pixel = dst.offset / dst.elemSize();
|
||||
int temp_step_in_pixel = temp.step / temp.elemSize();
|
||||
@@ -1548,7 +1540,7 @@ namespace cv
|
||||
if ((dst.type() == CV_8UC1) && ((dst.offset & 3) == 0) && ((dst.cols & 3) == 0))
|
||||
{
|
||||
kernelName = "bilateral2";
|
||||
globalThreads[0] = dst.cols / 4;
|
||||
globalThreads[0] = dst.cols >> 2;
|
||||
}
|
||||
|
||||
vector<pair<size_t , const void *> > args;
|
||||
@@ -1566,15 +1558,17 @@ namespace cv
|
||||
args.push_back( make_pair( sizeof(cl_mem), (void *)&oclcolor_weight.data ));
|
||||
args.push_back( make_pair( sizeof(cl_mem), (void *)&oclspace_weight.data ));
|
||||
args.push_back( make_pair( sizeof(cl_mem), (void *)&oclspace_ofs.data ));
|
||||
|
||||
openCLExecuteKernel(src.clCxt, &imgproc_bilateral, kernelName, globalThreads, localThreads, args, dst.oclchannels(), dst.depth());
|
||||
}
|
||||
|
||||
void bilateralFilter(const oclMat &src, oclMat &dst, int radius, double sigmaclr, double sigmaspc, int borderType)
|
||||
{
|
||||
dst.create( src.size(), src.type() );
|
||||
if ( src.depth() == CV_8U )
|
||||
oclbilateralFilter_8u( src, dst, radius, sigmaclr, sigmaspc, borderType );
|
||||
else
|
||||
CV_Error( CV_StsUnsupportedFormat, "Bilateral filtering is only implemented for 8uimages" );
|
||||
CV_Error( CV_StsUnsupportedFormat, "Bilateral filtering is only implemented for CV_8U images" );
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@@ -169,6 +169,7 @@ __kernel void filter2D(
|
||||
int globalRow = groupStartRow + localRow;
|
||||
const int src_offset = mad24(src_offset_y, src_step, src_offset_x);
|
||||
const int dst_offset = mad24(dst_offset_y, dst_step, dst_offset_x);
|
||||
|
||||
#ifdef BORDER_CONSTANT
|
||||
for(int i = localRow; i < LOCAL_HEIGHT; i += get_local_size(1))
|
||||
{
|
||||
@@ -208,6 +209,7 @@ __kernel void filter2D(
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
if(globalRow < rows && globalCol < cols)
|
||||
{
|
||||
@@ -231,6 +233,7 @@ __kernel void filter2D(
|
||||
//////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
/////////////////////////////Macro for define elements number per thread/////////////////////////////
|
||||
////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
#define ANX 1
|
||||
#define ANY 1
|
||||
|
||||
@@ -249,6 +252,7 @@ __kernel void filter2D(
|
||||
///////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
/////////////////////////////////////////8uC1////////////////////////////////////////////////////////
|
||||
////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
__kernel void filter2D_3x3(
|
||||
__global T_IMG *src,
|
||||
__global T_IMG *dst,
|
||||
@@ -359,6 +363,7 @@ __kernel void filter2D_3x3(
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if(dst_rows_index < dst_rows_end)
|
||||
{
|
||||
T_IMGx4 tmp_dst = CONVERT_TYPEx4(sum);
|
||||
|
||||
@@ -45,6 +45,7 @@
|
||||
//BORDER_CONSTANT: iiiiii|abcdefgh|iiiiiii
|
||||
#define ELEM(i,l_edge,r_edge,elem1,elem2) (i)<(l_edge) | (i) >= (r_edge) ? (elem1) : (elem2)
|
||||
#ifndef GENTYPE
|
||||
|
||||
__kernel void morph_C1_D0(__global const uchar * restrict src,
|
||||
__global uchar *dst,
|
||||
int src_offset_x, int src_offset_y,
|
||||
@@ -150,7 +151,9 @@ __kernel void morph_C1_D0(__global const uchar * restrict src,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
__kernel void morph(__global const GENTYPE * restrict src,
|
||||
__global GENTYPE *dst,
|
||||
int src_offset_x, int src_offset_y,
|
||||
@@ -221,4 +224,5 @@ __kernel void morph(__global const GENTYPE * restrict src,
|
||||
dst[out_addr] = res;
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
@@ -47,25 +47,27 @@ __kernel void bilateral_C1_D0(__global uchar *dst,
|
||||
__constant float *space_weight,
|
||||
__constant int *space_ofs)
|
||||
{
|
||||
int gidx = get_global_id(0);
|
||||
int gidy = get_global_id(1);
|
||||
if((gidy<dst_rows) && (gidx<dst_cols))
|
||||
int x = get_global_id(0);
|
||||
int y = get_global_id(1);
|
||||
|
||||
if (y < dst_rows && x < dst_cols)
|
||||
{
|
||||
int src_addr = mad24(gidy+radius,src_step,gidx+radius);
|
||||
int dst_addr = mad24(gidy,dst_step,gidx+dst_offset);
|
||||
int src_index = mad24(y + radius, src_step, x + radius);
|
||||
int dst_index = mad24(y, dst_step, x + dst_offset);
|
||||
float sum = 0.f, wsum = 0.f;
|
||||
|
||||
int val0 = (int)src[src_addr];
|
||||
int val0 = (int)src[src_index];
|
||||
for(int k = 0; k < maxk; k++ )
|
||||
{
|
||||
int val = (int)src[src_addr + space_ofs[k]];
|
||||
float w = space_weight[k]*color_weight[abs(val - val0)];
|
||||
sum += (float)(val)*w;
|
||||
int val = (int)src[src_index + space_ofs[k]];
|
||||
float w = space_weight[k] * color_weight[abs(val - val0)];
|
||||
sum += (float)(val) * w;
|
||||
wsum += w;
|
||||
}
|
||||
dst[dst_addr] = convert_uchar_rtz(sum/wsum+0.5f);
|
||||
dst[dst_index] = convert_uchar_rtz(sum / wsum + 0.5f);
|
||||
}
|
||||
}
|
||||
|
||||
__kernel void bilateral2_C1_D0(__global uchar *dst,
|
||||
__global const uchar *src,
|
||||
const int dst_rows,
|
||||
@@ -81,25 +83,28 @@ __kernel void bilateral2_C1_D0(__global uchar *dst,
|
||||
__constant float *space_weight,
|
||||
__constant int *space_ofs)
|
||||
{
|
||||
int gidx = get_global_id(0)<<2;
|
||||
int gidy = get_global_id(1);
|
||||
if((gidy<dst_rows) && (gidx<dst_cols))
|
||||
int x = get_global_id(0) << 2;
|
||||
int y = get_global_id(1);
|
||||
|
||||
if (y < dst_rows && x < dst_cols)
|
||||
{
|
||||
int src_addr = mad24(gidy+radius,src_step,gidx+radius);
|
||||
int dst_addr = mad24(gidy,dst_step,gidx+dst_offset);
|
||||
int src_index = mad24(y + radius, src_step, x + radius);
|
||||
int dst_index = mad24(y, dst_step, x + dst_offset);
|
||||
float4 sum = (float4)(0.f), wsum = (float4)(0.f);
|
||||
|
||||
int4 val0 = convert_int4(vload4(0,src+src_addr));
|
||||
int4 val0 = convert_int4(vload4(0,src + src_index));
|
||||
for(int k = 0; k < maxk; k++ )
|
||||
{
|
||||
int4 val = convert_int4(vload4(0,src+src_addr + space_ofs[k]));
|
||||
float4 w = (float4)(space_weight[k])*(float4)(color_weight[abs(val.x - val0.x)],color_weight[abs(val.y - val0.y)],color_weight[abs(val.z - val0.z)],color_weight[abs(val.w - val0.w)]);
|
||||
sum += convert_float4(val)*w;
|
||||
int4 val = convert_int4(vload4(0,src+src_index + space_ofs[k]));
|
||||
float4 w = (float4)(space_weight[k]) * (float4)(color_weight[abs(val.x - val0.x)], color_weight[abs(val.y - val0.y)],
|
||||
color_weight[abs(val.z - val0.z)], color_weight[abs(val.w - val0.w)]);
|
||||
sum += convert_float4(val) * w;
|
||||
wsum += w;
|
||||
}
|
||||
*(__global uchar4*)(dst+dst_addr) = convert_uchar4_rtz(sum/wsum+0.5f);
|
||||
*(__global uchar4*)(dst+dst_index) = convert_uchar4_rtz(sum/wsum+0.5f);
|
||||
}
|
||||
}
|
||||
|
||||
__kernel void bilateral_C4_D0(__global uchar4 *dst,
|
||||
__global const uchar4 *src,
|
||||
const int dst_rows,
|
||||
@@ -115,24 +120,26 @@ __kernel void bilateral_C4_D0(__global uchar4 *dst,
|
||||
__constant float *space_weight,
|
||||
__constant int *space_ofs)
|
||||
{
|
||||
int gidx = get_global_id(0);
|
||||
int gidy = get_global_id(1);
|
||||
if((gidy<dst_rows) && (gidx<dst_cols))
|
||||
int x = get_global_id(0);
|
||||
int y = get_global_id(1);
|
||||
|
||||
if (y < dst_rows && x < dst_cols)
|
||||
{
|
||||
int src_addr = mad24(gidy+radius,src_step,gidx+radius);
|
||||
int dst_addr = mad24(gidy,dst_step,gidx+dst_offset);
|
||||
int src_index = mad24(y + radius, src_step, x + radius);
|
||||
int dst_index = mad24(y, dst_step, x + dst_offset);
|
||||
float4 sum = (float4)0.f;
|
||||
float wsum = 0.f;
|
||||
|
||||
int4 val0 = convert_int4(src[src_addr]);
|
||||
int4 val0 = convert_int4(src[src_index]);
|
||||
for(int k = 0; k < maxk; k++ )
|
||||
{
|
||||
int4 val = convert_int4(src[src_addr + space_ofs[k]]);
|
||||
float w = space_weight[k]*color_weight[abs(val.x - val0.x)+abs(val.y - val0.y)+abs(val.z - val0.z)];
|
||||
sum += convert_float4(val)*(float4)w;
|
||||
int4 val = convert_int4(src[src_index + space_ofs[k]]);
|
||||
float w = space_weight[k] * color_weight[abs(val.x - val0.x) + abs(val.y - val0.y) + abs(val.z - val0.z)];
|
||||
sum += convert_float4(val) * (float4)w;
|
||||
wsum += w;
|
||||
}
|
||||
wsum=1.f/wsum;
|
||||
dst[dst_addr] = convert_uchar4_rtz(sum*(float4)wsum+(float4)0.5f);
|
||||
|
||||
wsum = 1.f / wsum;
|
||||
dst[dst_index] = convert_uchar4_rtz(sum * (float4)wsum + (float4)0.5f);
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user