Cleanup and addition of 4-component support for ocl_fastNlMeansDenoising
This commit is contained in:
parent 50bb14a0a8
commit 87760d13fb
@ -77,7 +77,7 @@ static bool ocl_fastNlMeansDenoising(InputArray _src, OutputArray _dst, float h,
|
|||||||
int ctaSize = ocl::Device::getDefault().isIntel() ? CTA_SIZE_INTEL : CTA_SIZE_DEFAULT;
|
int ctaSize = ocl::Device::getDefault().isIntel() ? CTA_SIZE_INTEL : CTA_SIZE_DEFAULT;
|
||||||
Size size = _src.size();
|
Size size = _src.size();
|
||||||
|
|
||||||
if (cn != 1 && cn != 2 && cn != 3 && depth != CV_8U && (!abs || depth != CV_16U))
|
if (cn != 1 && cn != 2 && cn != 3 && cn != 4 && depth != CV_8U && (!abs || depth != CV_16U))
|
||||||
return false;
|
return false;
|
||||||
|
|
||||||
int templateWindowHalfWize = templateWindowSize / 2;
|
int templateWindowHalfWize = templateWindowSize / 2;
|
||||||
@ -93,7 +93,7 @@ static bool ocl_fastNlMeansDenoising(InputArray _src, OutputArray _dst, float h,
|
|||||||
" -D weight_t=%s -D sum_t=%s -D convert_sum_t=%s"
|
" -D weight_t=%s -D sum_t=%s -D convert_sum_t=%s"
|
||||||
" -D BLOCK_COLS=%d -D BLOCK_ROWS=%d"
|
" -D BLOCK_COLS=%d -D BLOCK_ROWS=%d"
|
||||||
" -D CTA_SIZE=%d -D TEMPLATE_SIZE2=%d -D SEARCH_SIZE2=%d"
|
" -D CTA_SIZE=%d -D TEMPLATE_SIZE2=%d -D SEARCH_SIZE2=%d"
|
||||||
" -D convert_int_t=%s -D cn=%d -D convert_pixel_t=%s%s",
|
" -D convert_int_t=%s -D cn=%d -D psz=%d -D convert_pixel_t=%s%s",
|
||||||
templateWindowSize, searchWindowSize,
|
templateWindowSize, searchWindowSize,
|
||||||
ocl::typeToStr(depth), ocl::typeToStr(type), ocl::typeToStr(CV_32SC(cn)),
|
ocl::typeToStr(depth), ocl::typeToStr(type), ocl::typeToStr(CV_32SC(cn)),
|
||||||
depth == CV_8U ? ocl::typeToStr(CV_32S) : "long",
|
depth == CV_8U ? ocl::typeToStr(CV_32S) : "long",
|
||||||
@ -103,7 +103,7 @@ static bool ocl_fastNlMeansDenoising(InputArray _src, OutputArray _dst, float h,
|
|||||||
(sprintf(buf[1], "convert_long%d", cn), buf[1]),
|
(sprintf(buf[1], "convert_long%d", cn), buf[1]),
|
||||||
BLOCK_COLS, BLOCK_ROWS,
|
BLOCK_COLS, BLOCK_ROWS,
|
||||||
ctaSize, templateWindowHalfWize, searchWindowHalfSize,
|
ctaSize, templateWindowHalfWize, searchWindowHalfSize,
|
||||||
ocl::convertTypeStr(depth, CV_32S, cn, buf[2]), cn == 3 ? 4 : cn,
|
ocl::convertTypeStr(depth, CV_32S, cn, buf[2]), cn, cn == 3 ? 4 : cn,
|
||||||
ocl::convertTypeStr(CV_32S, depth, cn, buf[3]), abs ? " -D ABS" : "");
|
ocl::convertTypeStr(CV_32S, depth, cn, buf[3]), abs ? " -D ABS" : "");
|
||||||
|
|
||||||
ocl::Kernel k("fastNlMeansDenoising", ocl::photo::nlmeans_oclsrc, opts);
|
ocl::Kernel k("fastNlMeansDenoising", ocl::photo::nlmeans_oclsrc, opts);
|
||||||
|
@ -60,8 +60,10 @@ inline int calcDist(pixel_t a, pixel_t b)
|
|||||||
return retval;
|
return retval;
|
||||||
#elif cn == 2
|
#elif cn == 2
|
||||||
return retval.x + retval.y;
|
return retval.x + retval.y;
|
||||||
#elif cn == 3 || cn == 4 /* A is ignored */
|
#elif cn == 3
|
||||||
return retval.x + retval.y + retval.z;
|
return retval.x + retval.y + retval.z;
|
||||||
|
#elif cn == 4
|
||||||
|
return retval.x + retval.y + retval.z + retval.w;
|
||||||
#else
|
#else
|
||||||
#error "cn should be either 1, 2, 3 or 4"
|
#error "cn should be either 1, 2, 3 or 4"
|
||||||
#endif
|
#endif
|
||||||
@ -83,8 +85,10 @@ inline int calcDistUpDown(pixel_t down_value, pixel_t down_value_t, pixel_t up_v
|
|||||||
return retval;
|
return retval;
|
||||||
#elif cn == 2
|
#elif cn == 2
|
||||||
return retval.x + retval.y;
|
return retval.x + retval.y;
|
||||||
#elif cn == 3 || cn == 4 /* A is ignored */
|
#elif cn == 3
|
||||||
return retval.x + retval.y + retval.z;
|
return retval.x + retval.y + retval.z;
|
||||||
|
#elif cn == 4
|
||||||
|
return retval.x + retval.y + retval.z + retval.w;
|
||||||
#else
|
#else
|
||||||
#error "cn should be either 1, 2, 3 or 4"
|
#error "cn should be either 1, 2, 3 or 4"
|
||||||
#endif
|
#endif
|
||||||
@ -106,8 +110,8 @@ inline void calcFirstElementInRow(__global const sample_t * src, int src_step, i
|
|||||||
int dist = 0, value;
|
int dist = 0, value;
|
||||||
|
|
||||||
__global const pixel_t * src_template = (__global const pixel_t *)(src +
|
__global const pixel_t * src_template = (__global const pixel_t *)(src +
|
||||||
mad24(sy + i / SEARCH_SIZE, src_step, mad24(cn, sx + i % SEARCH_SIZE, src_offset)));
|
mad24(sy + i / SEARCH_SIZE, src_step, mad24(psz, sx + i % SEARCH_SIZE, src_offset)));
|
||||||
__global const pixel_t * src_current = (__global const pixel_t *)(src + mad24(y, src_step, mad24(cn, x, src_offset)));
|
__global const pixel_t * src_current = (__global const pixel_t *)(src + mad24(y, src_step, mad24(psz, x, src_offset)));
|
||||||
__global int * col_dists_current = col_dists + i * TEMPLATE_SIZE;
|
__global int * col_dists_current = col_dists + i * TEMPLATE_SIZE;
|
||||||
|
|
||||||
#pragma unroll
|
#pragma unroll
|
||||||
@ -148,9 +152,9 @@ inline void calcElementInFirstRow(__global const sample_t * src, int src_step, i
|
|||||||
|
|
||||||
for (int i = id; i < SEARCH_SIZE_SQ; i += CTA_SIZE)
|
for (int i = id; i < SEARCH_SIZE_SQ; i += CTA_SIZE)
|
||||||
{
|
{
|
||||||
__global const pixel_t * src_current = (__global const pixel_t *)(src + mad24(y, src_step, mad24(cn, x, src_offset)));
|
__global const pixel_t * src_current = (__global const pixel_t *)(src + mad24(y, src_step, mad24(psz, x, src_offset)));
|
||||||
__global const pixel_t * src_template = (__global const pixel_t *)(src +
|
__global const pixel_t * src_template = (__global const pixel_t *)(src +
|
||||||
mad24(sy + i / SEARCH_SIZE, src_step, mad24(cn, sx + i % SEARCH_SIZE, src_offset)));
|
mad24(sy + i / SEARCH_SIZE, src_step, mad24(psz, sx + i % SEARCH_SIZE, src_offset)));
|
||||||
__global int * col_dists_current = col_dists + TEMPLATE_SIZE * i;
|
__global int * col_dists_current = col_dists + TEMPLATE_SIZE * i;
|
||||||
|
|
||||||
int col_dist = 0;
|
int col_dist = 0;
|
||||||
@ -178,8 +182,8 @@ inline void calcElement(__global const sample_t * src, int src_step, int src_off
|
|||||||
int sy_up = y - TEMPLATE_SIZE2 - 1;
|
int sy_up = y - TEMPLATE_SIZE2 - 1;
|
||||||
int sy_down = y + TEMPLATE_SIZE2;
|
int sy_down = y + TEMPLATE_SIZE2;
|
||||||
|
|
||||||
pixel_t up_value = *(__global const pixel_t *)(src + mad24(sy_up, src_step, mad24(cn, sx, src_offset)));
|
pixel_t up_value = *(__global const pixel_t *)(src + mad24(sy_up, src_step, mad24(psz, sx, src_offset)));
|
||||||
pixel_t down_value = *(__global const pixel_t *)(src + mad24(sy_down, src_step, mad24(cn, sx, src_offset)));
|
pixel_t down_value = *(__global const pixel_t *)(src + mad24(sy_down, src_step, mad24(psz, sx, src_offset)));
|
||||||
|
|
||||||
sx -= SEARCH_SIZE2;
|
sx -= SEARCH_SIZE2;
|
||||||
sy_up -= SEARCH_SIZE2;
|
sy_up -= SEARCH_SIZE2;
|
||||||
@ -189,8 +193,8 @@ inline void calcElement(__global const sample_t * src, int src_step, int src_off
|
|||||||
{
|
{
|
||||||
int wx = i % SEARCH_SIZE, wy = i / SEARCH_SIZE;
|
int wx = i % SEARCH_SIZE, wy = i / SEARCH_SIZE;
|
||||||
|
|
||||||
pixel_t up_value_t = *(__global const pixel_t *)(src + mad24(sy_up + wy, src_step, mad24(cn, sx + wx, src_offset)));
|
pixel_t up_value_t = *(__global const pixel_t *)(src + mad24(sy_up + wy, src_step, mad24(psz, sx + wx, src_offset)));
|
||||||
pixel_t down_value_t = *(__global const pixel_t *)(src + mad24(sy_down + wy, src_step, mad24(cn, sx + wx, src_offset)));
|
pixel_t down_value_t = *(__global const pixel_t *)(src + mad24(sy_down + wy, src_step, mad24(psz, sx + wx, src_offset)));
|
||||||
|
|
||||||
__global int * col_dists_current = col_dists + mad24(i, TEMPLATE_SIZE, first);
|
__global int * col_dists_current = col_dists + mad24(i, TEMPLATE_SIZE, first);
|
||||||
__global int * up_col_dists_current = up_col_dists + mad24(x0, SEARCH_SIZE_SQ, i);
|
__global int * up_col_dists_current = up_col_dists + mad24(x0, SEARCH_SIZE_SQ, i);
|
||||||
@ -215,7 +219,7 @@ inline void convolveWindow(__global const sample_t * src, int src_step, int src_
|
|||||||
|
|
||||||
for (int i = id; i < SEARCH_SIZE_SQ; i += CTA_SIZE)
|
for (int i = id; i < SEARCH_SIZE_SQ; i += CTA_SIZE)
|
||||||
{
|
{
|
||||||
int src_index = mad24(sy + i / SEARCH_SIZE, src_step, mad24(i % SEARCH_SIZE + sx, cn, src_offset));
|
int src_index = mad24(sy + i / SEARCH_SIZE, src_step, mad24(i % SEARCH_SIZE + sx, psz, src_offset));
|
||||||
sum_t src_value = convert_sum_t(*(__global const pixel_t *)(src + src_index));
|
sum_t src_value = convert_sum_t(*(__global const pixel_t *)(src + src_index));
|
||||||
|
|
||||||
int almostAvgDist = dists[i] >> almostTemplateWindowSizeSqBinShift;
|
int almostAvgDist = dists[i] >> almostTemplateWindowSizeSqBinShift;
|
||||||
@ -242,7 +246,7 @@ inline void convolveWindow(__global const sample_t * src, int src_step, int src_
|
|||||||
|
|
||||||
if (id == 0)
|
if (id == 0)
|
||||||
{
|
{
|
||||||
int dst_index = mad24(y, dst_step, mad24(cn, x, dst_offset));
|
int dst_index = mad24(y, dst_step, mad24(psz, x, dst_offset));
|
||||||
sum_t weighted_sum_local_0 = weighted_sum_local[0] + weighted_sum_local[1] +
|
sum_t weighted_sum_local_0 = weighted_sum_local[0] + weighted_sum_local[1] +
|
||||||
weighted_sum_local[2] + weighted_sum_local[3];
|
weighted_sum_local[2] + weighted_sum_local[3];
|
||||||
weight_t weights_local_0 = weights_local[0] + weights_local[1] + weights_local[2] + weights_local[3];
|
weight_t weights_local_0 = weights_local[0] + weights_local[1] + weights_local[2] + weights_local[3];
|
||||||
|
Loading…
Reference in New Issue
Block a user