Added support for 16-bit input
This commit is contained in:
parent
3bde9e9365
commit
73663dcdd1
@ -28,12 +28,14 @@ static int divUp(int a, int b)
|
||||
return (a + b - 1) / b;
|
||||
}
|
||||
|
||||
template <typename FT>
|
||||
template <typename FT, typename ST, typename WT>
|
||||
static bool ocl_calcAlmostDist2Weight(UMat & almostDist2Weight, int searchWindowSize, int templateWindowSize, FT h, int cn,
|
||||
int & almostTemplateWindowSizeSqBinShift, bool abs)
|
||||
{
|
||||
const int maxEstimateSumValue = searchWindowSize * searchWindowSize * 255;
|
||||
int fixedPointMult = std::numeric_limits<int>::max() / maxEstimateSumValue;
|
||||
const WT maxEstimateSumValue = searchWindowSize * searchWindowSize *
|
||||
std::numeric_limits<ST>::max();
|
||||
int fixedPointMult = (int)std::min<WT>(std::numeric_limits<WT>::max() / maxEstimateSumValue,
|
||||
std::numeric_limits<int>::max());
|
||||
int depth = DataType<FT>::depth;
|
||||
bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0;
|
||||
|
||||
@ -48,7 +50,8 @@ static bool ocl_calcAlmostDist2Weight(UMat & almostDist2Weight, int searchWindow
|
||||
FT almostDist2ActualDistMultiplier = (FT)(1 << almostTemplateWindowSizeSqBinShift) / templateWindowSizeSq;
|
||||
|
||||
const FT WEIGHT_THRESHOLD = 1e-3f;
|
||||
int maxDist = abs ? 255 * cn : 255 * 255 * cn;
|
||||
int maxDist = abs ? std::numeric_limits<ST>::max() * cn :
|
||||
std::numeric_limits<ST>::max() * std::numeric_limits<ST>::max() * cn;
|
||||
int almostMaxDist = (int)(maxDist / almostDist2ActualDistMultiplier + 1);
|
||||
FT den = 1.0f / (h * h * cn);
|
||||
|
||||
@ -74,7 +77,7 @@ static bool ocl_fastNlMeansDenoising(InputArray _src, OutputArray _dst, float h,
|
||||
int ctaSize = ocl::Device::getDefault().isIntel() ? CTA_SIZE_INTEL : CTA_SIZE_DEFAULT;
|
||||
Size size = _src.size();
|
||||
|
||||
if ( type != CV_8UC1 && type != CV_8UC2 && type != CV_8UC3 )
|
||||
if (cn != 1 && cn != 2 && cn != 3 && depth != CV_8U && (!abs || depth != CV_16U))
|
||||
return false;
|
||||
|
||||
int templateWindowHalfWize = templateWindowSize / 2;
|
||||
@ -84,45 +87,60 @@ static bool ocl_fastNlMeansDenoising(InputArray _src, OutputArray _dst, float h,
|
||||
int nblocksx = divUp(size.width, BLOCK_COLS), nblocksy = divUp(size.height, BLOCK_ROWS);
|
||||
int almostTemplateWindowSizeSqBinShift = -1;
|
||||
|
||||
char cvt[2][40];
|
||||
char buf[4][40];
|
||||
String opts = format("-D OP_CALC_FASTNLMEANS -D TEMPLATE_SIZE=%d -D SEARCH_SIZE=%d"
|
||||
" -D sample_t=%s -D pixel_t=%s -D int_t=%s"
|
||||
" -D weight_t=%s -D sum_t=%s -D convert_sum_t=%s"
|
||||
" -D BLOCK_COLS=%d -D BLOCK_ROWS=%d"
|
||||
" -D CTA_SIZE=%d -D TEMPLATE_SIZE2=%d -D SEARCH_SIZE2=%d"
|
||||
" -D convert_int_t=%s -D cn=%d -D convert_pixel_t=%s%s",
|
||||
templateWindowSize, searchWindowSize,
|
||||
ocl::typeToStr(depth), ocl::typeToStr(type), ocl::typeToStr(CV_32SC(cn)),
|
||||
depth == CV_8U ? ocl::typeToStr(CV_32S) : "long",
|
||||
depth == CV_8U ? ocl::typeToStr(CV_32SC(cn)) :
|
||||
(sprintf(buf[0], "long%d", cn), buf[0]),
|
||||
depth == CV_8U ? ocl::convertTypeStr(depth, CV_32S, cn, buf[1]) :
|
||||
(sprintf(buf[1], "convert_long%d", cn), buf[1]),
|
||||
BLOCK_COLS, BLOCK_ROWS,
|
||||
ctaSize, templateWindowHalfWize, searchWindowHalfSize,
|
||||
ocl::convertTypeStr(CV_8U, CV_32S, cn, cvt[0]), type == CV_8UC3 ? 4 : cn,
|
||||
ocl::convertTypeStr(CV_32S, CV_8U, cn, cvt[1]), abs ? " -D ABS" : "");
|
||||
ocl::convertTypeStr(depth, CV_32S, cn, buf[2]), cn == 3 ? 4 : cn,
|
||||
ocl::convertTypeStr(CV_32S, depth, cn, buf[3]), abs ? " -D ABS" : "");
|
||||
|
||||
ocl::Kernel k("fastNlMeansDenoising", ocl::photo::nlmeans_oclsrc, opts);
|
||||
if (k.empty())
|
||||
return false;
|
||||
|
||||
UMat almostDist2Weight;
|
||||
if (!ocl_calcAlmostDist2Weight<float>(almostDist2Weight, searchWindowSize, templateWindowSize,
|
||||
h, cn, almostTemplateWindowSizeSqBinShift, abs))
|
||||
if ((depth == CV_8U &&
|
||||
!ocl_calcAlmostDist2Weight<float, uchar, int>(almostDist2Weight,
|
||||
searchWindowSize, templateWindowSize,
|
||||
h, cn,
|
||||
almostTemplateWindowSizeSqBinShift,
|
||||
abs)) ||
|
||||
(depth == CV_16U &&
|
||||
!ocl_calcAlmostDist2Weight<float, ushort, int64>(almostDist2Weight,
|
||||
searchWindowSize, templateWindowSize,
|
||||
h, cn,
|
||||
almostTemplateWindowSizeSqBinShift,
|
||||
abs)))
|
||||
return false;
|
||||
CV_Assert(almostTemplateWindowSizeSqBinShift >= 0);
|
||||
|
||||
UMat srcex;
|
||||
int borderSize = searchWindowHalfSize + templateWindowHalfWize;
|
||||
if (type == CV_8UC3) {
|
||||
Mat src_rgb = _src.getMat(), src_rgba(size, CV_8UC4);
|
||||
if (cn == 3) {
|
||||
UMat tmp(size, CV_MAKE_TYPE(depth, 4));
|
||||
int from_to[] = { 0,0, 1,1, 2,2 };
|
||||
mixChannels(&src_rgb, 1, &src_rgba, 1, from_to, 3);
|
||||
copyMakeBorder(src_rgba, srcex,
|
||||
borderSize, borderSize, borderSize, borderSize, BORDER_DEFAULT);
|
||||
mixChannels(std::vector<UMat>(1, _src.getUMat()), std::vector<UMat>(1, tmp), from_to, 3);
|
||||
copyMakeBorder(tmp, srcex, borderSize, borderSize, borderSize, borderSize, BORDER_DEFAULT);
|
||||
}
|
||||
else
|
||||
copyMakeBorder(_src, srcex, borderSize, borderSize, borderSize, borderSize, BORDER_DEFAULT);
|
||||
|
||||
_dst.create(size, type);
|
||||
UMat dst;
|
||||
if (type == CV_8UC3)
|
||||
dst.create(size, CV_8UC4);
|
||||
if (cn == 3)
|
||||
dst.create(size, CV_MAKE_TYPE(depth, 4));
|
||||
else
|
||||
dst = _dst.getUMat();
|
||||
|
||||
@ -139,10 +157,9 @@ static bool ocl_fastNlMeansDenoising(InputArray _src, OutputArray _dst, float h,
|
||||
size_t globalsize[2] = { nblocksx * ctaSize, nblocksy }, localsize[2] = { ctaSize, 1 };
|
||||
if (!k.run(2, globalsize, localsize, false)) return false;
|
||||
|
||||
if (type == CV_8UC3) {
|
||||
Mat dst_rgba = dst.getMat(ACCESS_READ), dst_rgb = _dst.getMat();
|
||||
if (cn == 3) {
|
||||
int from_to[] = { 0,0, 1,1, 2,2 };
|
||||
mixChannels(&dst_rgba, 1, &dst_rgb, 1, from_to, 3);
|
||||
mixChannels(std::vector<UMat>(1, dst), std::vector<UMat>(1, _dst.getUMat()), from_to, 3);
|
||||
}
|
||||
|
||||
return true;
|
||||
|
@ -206,22 +206,23 @@ inline void calcElement(__global const sample_t * src, int src_step, int src_off
|
||||
inline void convolveWindow(__global const sample_t * src, int src_step, int src_offset,
|
||||
__local int * dists, __global const int * almostDist2Weight,
|
||||
__global sample_t * dst, int dst_step, int dst_offset,
|
||||
int y, int x, int id, __local int * weights_local,
|
||||
__local int_t * weighted_sum_local, int almostTemplateWindowSizeSqBinShift)
|
||||
int y, int x, int id, __local weight_t * weights_local,
|
||||
__local sum_t * weighted_sum_local, int almostTemplateWindowSizeSqBinShift)
|
||||
{
|
||||
int sx = x - SEARCH_SIZE2, sy = y - SEARCH_SIZE2, weights = 0;
|
||||
int_t weighted_sum = (int_t)(0);
|
||||
int sx = x - SEARCH_SIZE2, sy = y - SEARCH_SIZE2;
|
||||
weight_t weights = 0;
|
||||
sum_t weighted_sum = (sum_t)(0);
|
||||
|
||||
for (int i = id; i < SEARCH_SIZE_SQ; i += CTA_SIZE)
|
||||
{
|
||||
int src_index = mad24(sy + i / SEARCH_SIZE, src_step, mad24(i % SEARCH_SIZE + sx, cn, src_offset));
|
||||
int_t src_value = convert_int_t(*(__global const pixel_t *)(src + src_index));
|
||||
sum_t src_value = convert_sum_t(*(__global const pixel_t *)(src + src_index));
|
||||
|
||||
int almostAvgDist = dists[i] >> almostTemplateWindowSizeSqBinShift;
|
||||
int weight = almostDist2Weight[almostAvgDist];
|
||||
|
||||
weights += weight;
|
||||
weighted_sum += (int_t)(weight) * src_value;
|
||||
weights += (weight_t)weight;
|
||||
weighted_sum += (sum_t)(weight) * src_value;
|
||||
}
|
||||
|
||||
weights_local[id] = weights;
|
||||
@ -242,11 +243,11 @@ inline void convolveWindow(__global const sample_t * src, int src_step, int src_
|
||||
if (id == 0)
|
||||
{
|
||||
int dst_index = mad24(y, dst_step, mad24(cn, x, dst_offset));
|
||||
int_t weighted_sum_local_0 = weighted_sum_local[0] + weighted_sum_local[1] +
|
||||
sum_t weighted_sum_local_0 = weighted_sum_local[0] + weighted_sum_local[1] +
|
||||
weighted_sum_local[2] + weighted_sum_local[3];
|
||||
int weights_local_0 = weights_local[0] + weights_local[1] + weights_local[2] + weights_local[3];
|
||||
weight_t weights_local_0 = weights_local[0] + weights_local[1] + weights_local[2] + weights_local[3];
|
||||
|
||||
*(__global pixel_t *)(dst + dst_index) = convert_pixel_t(weighted_sum_local_0 / (int_t)(weights_local_0));
|
||||
*(__global pixel_t *)(dst + dst_index) = convert_pixel_t(weighted_sum_local_0 / (sum_t)(weights_local_0));
|
||||
}
|
||||
}
|
||||
|
||||
@ -259,8 +260,9 @@ __kernel void fastNlMeansDenoising(__global const sample_t * src, int src_step,
|
||||
int block_y = get_group_id(1);
|
||||
int id = get_local_id(0), first;
|
||||
|
||||
__local int dists[SEARCH_SIZE_SQ], weights[CTA_SIZE];
|
||||
__local int_t weighted_sum[CTA_SIZE];
|
||||
__local int dists[SEARCH_SIZE_SQ];
|
||||
__local weight_t weights[CTA_SIZE];
|
||||
__local sum_t weighted_sum[CTA_SIZE];
|
||||
|
||||
int x0 = block_x * BLOCK_COLS, x1 = min(x0 + BLOCK_COLS, dst_cols);
|
||||
int y0 = block_y * BLOCK_ROWS, y1 = min(y0 + BLOCK_ROWS, dst_rows);
|
||||
@ -271,6 +273,11 @@ __kernel void fastNlMeansDenoising(__global const sample_t * src, int src_step,
|
||||
__global int * col_dists = (__global int *)(buffer + block_data_start * sizeof(int));
|
||||
__global int * up_col_dists = col_dists + SEARCH_SIZE_SQ * TEMPLATE_SIZE;
|
||||
|
||||
src_step /= sizeof(sample_t);
|
||||
src_offset /= sizeof(sample_t);
|
||||
dst_step /= sizeof(sample_t);
|
||||
dst_offset /= sizeof(sample_t);
|
||||
|
||||
for (int y = y0; y < y1; ++y)
|
||||
for (int x = x0; x < x1; ++x)
|
||||
{
|
||||
|
Loading…
Reference in New Issue
Block a user