Merge pull request #3413 from wangyan42164:denoising_opencl_improvement
This commit is contained in:
commit
a16a11f32b
@ -19,7 +19,8 @@ enum
|
|||||||
{
|
{
|
||||||
BLOCK_ROWS = 32,
|
BLOCK_ROWS = 32,
|
||||||
BLOCK_COLS = 32,
|
BLOCK_COLS = 32,
|
||||||
CTA_SIZE = 256
|
CTA_SIZE_INTEL = 64,
|
||||||
|
CTA_SIZE_DEFAULT = 256
|
||||||
};
|
};
|
||||||
|
|
||||||
static int divUp(int a, int b)
|
static int divUp(int a, int b)
|
||||||
@ -70,6 +71,7 @@ static bool ocl_fastNlMeansDenoising(InputArray _src, OutputArray _dst, float h,
|
|||||||
int templateWindowSize, int searchWindowSize)
|
int templateWindowSize, int searchWindowSize)
|
||||||
{
|
{
|
||||||
int type = _src.type(), cn = CV_MAT_CN(type);
|
int type = _src.type(), cn = CV_MAT_CN(type);
|
||||||
|
int ctaSize = ocl::Device::getDefault().isIntel() ? CTA_SIZE_INTEL : CTA_SIZE_DEFAULT;
|
||||||
Size size = _src.size();
|
Size size = _src.size();
|
||||||
|
|
||||||
if ( type != CV_8UC1 && type != CV_8UC2 && type != CV_8UC4 )
|
if ( type != CV_8UC1 && type != CV_8UC2 && type != CV_8UC4 )
|
||||||
@ -86,12 +88,12 @@ static bool ocl_fastNlMeansDenoising(InputArray _src, OutputArray _dst, float h,
|
|||||||
String opts = format("-D OP_CALC_FASTNLMEANS -D TEMPLATE_SIZE=%d -D SEARCH_SIZE=%d"
|
String opts = format("-D OP_CALC_FASTNLMEANS -D TEMPLATE_SIZE=%d -D SEARCH_SIZE=%d"
|
||||||
" -D uchar_t=%s -D int_t=%s -D BLOCK_COLS=%d -D BLOCK_ROWS=%d"
|
" -D uchar_t=%s -D int_t=%s -D BLOCK_COLS=%d -D BLOCK_ROWS=%d"
|
||||||
" -D CTA_SIZE=%d -D TEMPLATE_SIZE2=%d -D SEARCH_SIZE2=%d"
|
" -D CTA_SIZE=%d -D TEMPLATE_SIZE2=%d -D SEARCH_SIZE2=%d"
|
||||||
" -D convert_int_t=%s -D cn=%d -D CTA_SIZE2=%d -D convert_uchar_t=%s",
|
" -D convert_int_t=%s -D cn=%d -D convert_uchar_t=%s",
|
||||||
templateWindowSize, searchWindowSize, ocl::typeToStr(type),
|
templateWindowSize, searchWindowSize, ocl::typeToStr(type),
|
||||||
ocl::typeToStr(CV_32SC(cn)), BLOCK_COLS, BLOCK_ROWS, CTA_SIZE,
|
ocl::typeToStr(CV_32SC(cn)), BLOCK_COLS, BLOCK_ROWS, ctaSize,
|
||||||
templateWindowHalfWize, searchWindowHalfSize,
|
templateWindowHalfWize, searchWindowHalfSize,
|
||||||
ocl::convertTypeStr(CV_8U, CV_32S, cn, cvt[0]), cn,
|
ocl::convertTypeStr(CV_8U, CV_32S, cn, cvt[0]), cn,
|
||||||
CTA_SIZE >> 1, ocl::convertTypeStr(CV_32S, CV_8U, cn, cvt[1]));
|
ocl::convertTypeStr(CV_32S, CV_8U, cn, cvt[1]));
|
||||||
|
|
||||||
ocl::Kernel k("fastNlMeansDenoising", ocl::photo::nlmeans_oclsrc, opts);
|
ocl::Kernel k("fastNlMeansDenoising", ocl::photo::nlmeans_oclsrc, opts);
|
||||||
if (k.empty())
|
if (k.empty())
|
||||||
@ -120,7 +122,7 @@ static bool ocl_fastNlMeansDenoising(InputArray _src, OutputArray _dst, float h,
|
|||||||
ocl::KernelArg::PtrReadOnly(almostDist2Weight),
|
ocl::KernelArg::PtrReadOnly(almostDist2Weight),
|
||||||
ocl::KernelArg::PtrReadOnly(buffer), almostTemplateWindowSizeSqBinShift);
|
ocl::KernelArg::PtrReadOnly(buffer), almostTemplateWindowSizeSqBinShift);
|
||||||
|
|
||||||
size_t globalsize[2] = { nblocksx * CTA_SIZE, nblocksy }, localsize[2] = { CTA_SIZE, 1 };
|
size_t globalsize[2] = { nblocksx * ctaSize, nblocksy }, localsize[2] = { ctaSize, 1 };
|
||||||
return k.run(2, globalsize, localsize, false);
|
return k.run(2, globalsize, localsize, false);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -206,22 +206,11 @@ inline void convolveWindow(__global const uchar * src, int src_step, int src_off
|
|||||||
weighted_sum += (int_t)(weight) * src_value;
|
weighted_sum += (int_t)(weight) * src_value;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (id >= CTA_SIZE2)
|
weights_local[id] = weights;
|
||||||
{
|
weighted_sum_local[id] = weighted_sum;
|
||||||
int id2 = id - CTA_SIZE2;
|
|
||||||
weights_local[id2] = weights;
|
|
||||||
weighted_sum_local[id2] = weighted_sum;
|
|
||||||
}
|
|
||||||
barrier(CLK_LOCAL_MEM_FENCE);
|
barrier(CLK_LOCAL_MEM_FENCE);
|
||||||
|
|
||||||
if (id < CTA_SIZE2)
|
for (int lsize = CTA_SIZE >> 1; lsize > 2; lsize >>= 1)
|
||||||
{
|
|
||||||
weights_local[id] += weights;
|
|
||||||
weighted_sum_local[id] += weighted_sum;
|
|
||||||
}
|
|
||||||
barrier(CLK_LOCAL_MEM_FENCE);
|
|
||||||
|
|
||||||
for (int lsize = CTA_SIZE2 >> 1; lsize > 2; lsize >>= 1)
|
|
||||||
{
|
{
|
||||||
if (id < lsize)
|
if (id < lsize)
|
||||||
{
|
{
|
||||||
@ -252,8 +241,8 @@ __kernel void fastNlMeansDenoising(__global const uchar * src, int src_step, int
|
|||||||
int block_y = get_group_id(1);
|
int block_y = get_group_id(1);
|
||||||
int id = get_local_id(0), first;
|
int id = get_local_id(0), first;
|
||||||
|
|
||||||
__local int dists[SEARCH_SIZE_SQ], weights[CTA_SIZE2];
|
__local int dists[SEARCH_SIZE_SQ], weights[CTA_SIZE];
|
||||||
__local int_t weighted_sum[CTA_SIZE2];
|
__local int_t weighted_sum[CTA_SIZE];
|
||||||
|
|
||||||
int x0 = block_x * BLOCK_COLS, x1 = min(x0 + BLOCK_COLS, dst_cols);
|
int x0 = block_x * BLOCK_COLS, x1 = min(x0 + BLOCK_COLS, dst_cols);
|
||||||
int y0 = block_y * BLOCK_ROWS, y1 = min(y0 + BLOCK_ROWS, dst_rows);
|
int y0 = block_y * BLOCK_ROWS, y1 = min(y0 + BLOCK_ROWS, dst_rows);
|
||||||
@ -281,7 +270,6 @@ __kernel void fastNlMeansDenoising(__global const uchar * src, int src_step, int
|
|||||||
|
|
||||||
first = (first + 1) % TEMPLATE_SIZE;
|
first = (first + 1) % TEMPLATE_SIZE;
|
||||||
}
|
}
|
||||||
barrier(CLK_LOCAL_MEM_FENCE);
|
|
||||||
|
|
||||||
convolveWindow(src, src_step, src_offset, dists, almostDist2Weight, dst, dst_step, dst_offset,
|
convolveWindow(src, src_step, src_offset, dists, almostDist2Weight, dst, dst_step, dst_offset,
|
||||||
y, x, id, weights, weighted_sum, almostTemplateWindowSizeSqBinShift);
|
y, x, id, weights, weighted_sum, almostTemplateWindowSizeSqBinShift);
|
||||||
|
Loading…
x
Reference in New Issue
Block a user