Merge pull request #2560 from akarsakov:gaussianblur_integer
This commit is contained in:
commit
20aaa8fe77
@ -3280,12 +3280,15 @@ static bool ocl_filter2D( InputArray _src, OutputArray _dst, int ddepth,
|
||||
return k.run(2, globalsize, localsize, false);
|
||||
}
|
||||
|
||||
const int shift_bits = 8;
|
||||
|
||||
static bool ocl_sepRowFilter2D(const UMat & src, UMat & buf, const Mat & kernelX, int anchor,
|
||||
int borderType, int ddepth, bool fast8uc1)
|
||||
int borderType, int ddepth, bool fast8uc1, bool int_arithm)
|
||||
{
|
||||
int type = src.type(), cn = CV_MAT_CN(type), sdepth = CV_MAT_DEPTH(type);
|
||||
bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0;
|
||||
Size bufSize = buf.size();
|
||||
int buf_type = buf.type(), bdepth = CV_MAT_DEPTH(buf_type);
|
||||
|
||||
if (!doubleSupport && (sdepth == CV_64F || ddepth == CV_64F))
|
||||
return false;
|
||||
@ -3313,15 +3316,16 @@ static bool ocl_sepRowFilter2D(const UMat & src, UMat & buf, const Mat & kernelX
|
||||
|
||||
char cvt[40];
|
||||
cv::String build_options = cv::format("-D RADIUSX=%d -D LSIZE0=%d -D LSIZE1=%d -D CN=%d -D %s -D %s -D %s"
|
||||
" -D srcT=%s -D dstT=%s -D convertToDstT=%s -D srcT1=%s -D dstT1=%s%s",
|
||||
" -D srcT=%s -D dstT=%s -D convertToDstT=%s -D srcT1=%s -D dstT1=%s%s%s",
|
||||
radiusX, (int)localsize[0], (int)localsize[1], cn, btype,
|
||||
extra_extrapolation ? "EXTRA_EXTRAPOLATION" : "NO_EXTRA_EXTRAPOLATION",
|
||||
isolated ? "BORDER_ISOLATED" : "NO_BORDER_ISOLATED",
|
||||
ocl::typeToStr(type), ocl::typeToStr(CV_32FC(cn)),
|
||||
ocl::convertTypeStr(sdepth, CV_32F, cn, cvt),
|
||||
ocl::typeToStr(sdepth), ocl::typeToStr(CV_32F),
|
||||
doubleSupport ? " -D DOUBLE_SUPPORT" : "");
|
||||
build_options += ocl::kernelToStr(kernelX, CV_32F);
|
||||
ocl::typeToStr(type), ocl::typeToStr(buf_type),
|
||||
ocl::convertTypeStr(sdepth, bdepth, cn, cvt),
|
||||
ocl::typeToStr(sdepth), ocl::typeToStr(bdepth),
|
||||
doubleSupport ? " -D DOUBLE_SUPPORT" : "",
|
||||
int_arithm ? " -D INTEGER_ARITHMETIC" : "");
|
||||
build_options += ocl::kernelToStr(kernelX, bdepth);
|
||||
|
||||
Size srcWholeSize; Point srcOffset;
|
||||
src.locateROI(srcWholeSize, srcOffset);
|
||||
@ -3348,7 +3352,7 @@ static bool ocl_sepRowFilter2D(const UMat & src, UMat & buf, const Mat & kernelX
|
||||
return k.run(2, globalsize, localsize, false);
|
||||
}
|
||||
|
||||
static bool ocl_sepColFilter2D(const UMat & buf, UMat & dst, const Mat & kernelY, double delta, int anchor)
|
||||
static bool ocl_sepColFilter2D(const UMat & buf, UMat & dst, const Mat & kernelY, double delta, int anchor, bool int_arithm)
|
||||
{
|
||||
bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0;
|
||||
if (dst.depth() == CV_64F && !doubleSupport)
|
||||
@ -3363,6 +3367,7 @@ static bool ocl_sepColFilter2D(const UMat & buf, UMat & dst, const Mat & kernelY
|
||||
|
||||
int dtype = dst.type(), cn = CV_MAT_CN(dtype), ddepth = CV_MAT_DEPTH(dtype);
|
||||
Size sz = dst.size();
|
||||
int buf_type = buf.type(), bdepth = CV_MAT_DEPTH(buf_type);
|
||||
|
||||
globalsize[1] = DIVUP(sz.height, localsize[1]) * localsize[1];
|
||||
globalsize[0] = DIVUP(sz.width, localsize[0]) * localsize[0];
|
||||
@ -3370,13 +3375,14 @@ static bool ocl_sepColFilter2D(const UMat & buf, UMat & dst, const Mat & kernelY
|
||||
char cvt[40];
|
||||
cv::String build_options = cv::format("-D RADIUSY=%d -D LSIZE0=%d -D LSIZE1=%d -D CN=%d"
|
||||
" -D srcT=%s -D dstT=%s -D convertToDstT=%s"
|
||||
" -D srcT1=%s -D dstT1=%s%s",
|
||||
" -D srcT1=%s -D dstT1=%s -D SHIFT_BITS=%d%s%s",
|
||||
anchor, (int)localsize[0], (int)localsize[1], cn,
|
||||
ocl::typeToStr(buf.type()), ocl::typeToStr(dtype),
|
||||
ocl::convertTypeStr(CV_32F, ddepth, cn, cvt),
|
||||
ocl::typeToStr(CV_32F), ocl::typeToStr(ddepth),
|
||||
doubleSupport ? " -D DOUBLE_SUPPORT" : "");
|
||||
build_options += ocl::kernelToStr(kernelY, CV_32F);
|
||||
ocl::typeToStr(buf_type), ocl::typeToStr(dtype),
|
||||
ocl::convertTypeStr(bdepth, ddepth, cn, cvt),
|
||||
ocl::typeToStr(bdepth), ocl::typeToStr(ddepth),
|
||||
2*shift_bits, doubleSupport ? " -D DOUBLE_SUPPORT" : "",
|
||||
int_arithm ? " -D INTEGER_ARITHMETIC" : "");
|
||||
build_options += ocl::kernelToStr(kernelY, bdepth);
|
||||
|
||||
ocl::Kernel k("col_filter", cv::ocl::imgproc::filterSepCol_oclsrc,
|
||||
build_options);
|
||||
@ -3393,12 +3399,12 @@ const int optimizedSepFilterLocalSize = 16;
|
||||
|
||||
static bool ocl_sepFilter2D_SinglePass(InputArray _src, OutputArray _dst,
|
||||
Mat row_kernel, Mat col_kernel,
|
||||
double delta, int borderType, int ddepth)
|
||||
double delta, int borderType, int ddepth, int bdepth, bool int_arithm)
|
||||
{
|
||||
Size size = _src.size(), wholeSize;
|
||||
Point origin;
|
||||
int stype = _src.type(), sdepth = CV_MAT_DEPTH(stype), cn = CV_MAT_CN(stype),
|
||||
esz = CV_ELEM_SIZE(stype), wdepth = std::max(std::max(sdepth, ddepth), CV_32F),
|
||||
esz = CV_ELEM_SIZE(stype), wdepth = std::max(std::max(sdepth, ddepth), bdepth),
|
||||
dtype = CV_MAKE_TYPE(ddepth, cn);
|
||||
size_t src_step = _src.step(), src_offset = _src.offset();
|
||||
bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0;
|
||||
@ -3418,14 +3424,15 @@ static bool ocl_sepFilter2D_SinglePass(InputArray _src, OutputArray _dst,
|
||||
|
||||
String opts = cv::format("-D BLK_X=%d -D BLK_Y=%d -D RADIUSX=%d -D RADIUSY=%d%s%s"
|
||||
" -D srcT=%s -D convertToWT=%s -D WT=%s -D dstT=%s -D convertToDstT=%s"
|
||||
" -D %s -D srcT1=%s -D dstT1=%s -D CN=%d", (int)lt2[0], (int)lt2[1],
|
||||
row_kernel.cols / 2, col_kernel.cols / 2,
|
||||
ocl::kernelToStr(row_kernel, CV_32F, "KERNEL_MATRIX_X").c_str(),
|
||||
ocl::kernelToStr(col_kernel, CV_32F, "KERNEL_MATRIX_Y").c_str(),
|
||||
" -D %s -D srcT1=%s -D dstT1=%s -D WT1=%s -D CN=%d -D SHIFT_BITS=%d%s",
|
||||
(int)lt2[0], (int)lt2[1], row_kernel.cols / 2, col_kernel.cols / 2,
|
||||
ocl::kernelToStr(row_kernel, wdepth, "KERNEL_MATRIX_X").c_str(),
|
||||
ocl::kernelToStr(col_kernel, wdepth, "KERNEL_MATRIX_Y").c_str(),
|
||||
ocl::typeToStr(stype), ocl::convertTypeStr(sdepth, wdepth, cn, cvt[0]),
|
||||
ocl::typeToStr(CV_MAKE_TYPE(wdepth, cn)), ocl::typeToStr(dtype),
|
||||
ocl::convertTypeStr(wdepth, ddepth, cn, cvt[1]), borderMap[borderType],
|
||||
ocl::typeToStr(sdepth), ocl::typeToStr(ddepth), cn);
|
||||
ocl::typeToStr(sdepth), ocl::typeToStr(ddepth), ocl::typeToStr(wdepth),
|
||||
cn, 2*shift_bits, int_arithm ? " -D INTEGER_ARITHMETIC" : "");
|
||||
|
||||
ocl::Kernel k("sep_filter", ocl::imgproc::filterSep_singlePass_oclsrc, opts);
|
||||
if (k.empty())
|
||||
@ -3468,19 +3475,37 @@ static bool ocl_sepFilter2D( InputArray _src, OutputArray _dst, int ddepth,
|
||||
if (ddepth < 0)
|
||||
ddepth = sdepth;
|
||||
|
||||
CV_OCL_RUN_(kernelY.cols <= 21 && kernelX.cols <= 21 &&
|
||||
imgSize.width > optimizedSepFilterLocalSize + (kernelX.cols >> 1) &&
|
||||
imgSize.height > optimizedSepFilterLocalSize + (kernelY.cols >> 1) &&
|
||||
(!(borderType & BORDER_ISOLATED) || _src.offset() == 0) && anchor == Point(-1, -1) &&
|
||||
(d.isIntel() || (d.isAMD() && !d.hostUnifiedMemory())),
|
||||
ocl_sepFilter2D_SinglePass(_src, _dst, kernelX, kernelY, delta,
|
||||
borderType & ~BORDER_ISOLATED, ddepth), true)
|
||||
|
||||
if (anchor.x < 0)
|
||||
anchor.x = kernelX.cols >> 1;
|
||||
if (anchor.y < 0)
|
||||
anchor.y = kernelY.cols >> 1;
|
||||
|
||||
int rtype = getKernelType(kernelX,
|
||||
kernelX.rows == 1 ? Point(anchor.x, 0) : Point(0, anchor.x));
|
||||
int ctype = getKernelType(kernelY,
|
||||
kernelY.rows == 1 ? Point(anchor.y, 0) : Point(0, anchor.y));
|
||||
|
||||
int bdepth = CV_32F;
|
||||
bool int_arithm = false;
|
||||
if( sdepth == CV_8U && ddepth == CV_8U &&
|
||||
rtype == KERNEL_SMOOTH+KERNEL_SYMMETRICAL &&
|
||||
ctype == KERNEL_SMOOTH+KERNEL_SYMMETRICAL)
|
||||
{
|
||||
bdepth = CV_32S;
|
||||
kernelX.convertTo( kernelX, bdepth, 1 << shift_bits );
|
||||
kernelY.convertTo( kernelY, bdepth, 1 << shift_bits );
|
||||
int_arithm = true;
|
||||
}
|
||||
|
||||
CV_OCL_RUN_(kernelY.cols <= 21 && kernelX.cols <= 21 &&
|
||||
imgSize.width > optimizedSepFilterLocalSize + anchor.x &&
|
||||
imgSize.height > optimizedSepFilterLocalSize + anchor.y &&
|
||||
(!(borderType & BORDER_ISOLATED) || _src.offset() == 0) &&
|
||||
anchor == Point(kernelX.cols >> 1, kernelY.cols >> 1) &&
|
||||
(d.isIntel() || (d.isAMD() && !d.hostUnifiedMemory())),
|
||||
ocl_sepFilter2D_SinglePass(_src, _dst, kernelX, kernelY, delta,
|
||||
borderType & ~BORDER_ISOLATED, ddepth, bdepth, int_arithm), true)
|
||||
|
||||
UMat src = _src.getUMat();
|
||||
Size srcWholeSize; Point srcOffset;
|
||||
src.locateROI(srcWholeSize, srcOffset);
|
||||
@ -3490,14 +3515,14 @@ static bool ocl_sepFilter2D( InputArray _src, OutputArray _dst, int ddepth,
|
||||
|
||||
Size srcSize = src.size();
|
||||
Size bufSize(srcSize.width, srcSize.height + kernelY.cols - 1);
|
||||
UMat buf(bufSize, CV_32FC(cn));
|
||||
if (!ocl_sepRowFilter2D(src, buf, kernelX, anchor.x, borderType, ddepth, fast8uc1))
|
||||
UMat buf(bufSize, CV_MAKETYPE(bdepth, cn));
|
||||
if (!ocl_sepRowFilter2D(src, buf, kernelX, anchor.x, borderType, ddepth, fast8uc1, int_arithm))
|
||||
return false;
|
||||
|
||||
_dst.create(srcSize, CV_MAKETYPE(ddepth, cn));
|
||||
UMat dst = _dst.getUMat();
|
||||
|
||||
return ocl_sepColFilter2D(buf, dst, kernelY, delta, anchor.y);
|
||||
return ocl_sepColFilter2D(buf, dst, kernelY, delta, anchor.y, int_arithm);
|
||||
}
|
||||
|
||||
#endif
|
||||
|
@ -3,6 +3,7 @@
|
||||
//
|
||||
// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
|
||||
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
|
||||
// Copyright (C) 2014, Itseez, Inc, all rights reserved.
|
||||
// Third party copyrights are property of their respective owners.
|
||||
//
|
||||
// @Authors
|
||||
@ -60,7 +61,7 @@
|
||||
#endif
|
||||
|
||||
#define DIG(a) a,
|
||||
__constant float mat_kernel[] = { COEFF };
|
||||
__constant srcT1 mat_kernel[] = { COEFF };
|
||||
|
||||
__kernel void col_filter(__global const uchar * src, int src_step, int src_offset, int src_whole_rows, int src_whole_cols,
|
||||
__global uchar * dst, int dst_step, int dst_offset, int dst_rows, int dst_cols, float delta)
|
||||
@ -96,9 +97,17 @@ __kernel void col_filter(__global const uchar * src, int src_step, int src_offse
|
||||
{
|
||||
temp[0] = LDS_DAT[l_y + RADIUSY - i][l_x];
|
||||
temp[1] = LDS_DAT[l_y + RADIUSY + i][l_x];
|
||||
#ifndef INTEGER_ARITHMETIC
|
||||
sum += mad(temp[0], mat_kernel[RADIUSY - i], temp[1] * mat_kernel[RADIUSY + i]);
|
||||
#else
|
||||
sum += mad24(temp[0],mat_kernel[RADIUSY - i], temp[1] * mat_kernel[RADIUSY + i]);
|
||||
#endif
|
||||
}
|
||||
|
||||
#ifdef INTEGER_ARITHMETIC
|
||||
sum = (sum + (1 << (SHIFT_BITS-1))) >> SHIFT_BITS;
|
||||
#endif
|
||||
|
||||
// write the result to dst
|
||||
if (x < dst_cols && y < dst_rows)
|
||||
{
|
||||
|
@ -3,6 +3,7 @@
|
||||
//
|
||||
// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
|
||||
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
|
||||
// Copyright (C) 2014, Itseez, Inc, all rights reserved.
|
||||
// Third party copyrights are property of their respective owners.
|
||||
//
|
||||
// @Authors
|
||||
@ -138,7 +139,15 @@
|
||||
#endif
|
||||
|
||||
#define DIG(a) a,
|
||||
__constant float mat_kernel[] = { COEFF };
|
||||
__constant dstT1 mat_kernel[] = { COEFF };
|
||||
|
||||
#ifndef INTEGER_ARITHMETIC
|
||||
#define dstT4 float4
|
||||
#define convertDstVec convert_float4
|
||||
#else
|
||||
#define dstT4 int4
|
||||
#define convertDstVec convert_int4
|
||||
#endif
|
||||
|
||||
__kernel void row_filter_C1_D0(__global const uchar * src, int src_step_in_pixel, int src_offset_x, int src_offset_y,
|
||||
int src_cols, int src_rows, int src_whole_cols, int src_whole_rows,
|
||||
@ -155,7 +164,7 @@ __kernel void row_filter_C1_D0(__global const uchar * src, int src_step_in_pixel
|
||||
int start_y = y + src_offset_y - radiusy;
|
||||
int start_addr = mad24(start_y, src_step_in_pixel, start_x);
|
||||
|
||||
float4 sum;
|
||||
dstT4 sum;
|
||||
uchar4 temp[READ_TIMES_ROW];
|
||||
|
||||
__local uchar4 LDS_DAT[LSIZE1][READ_TIMES_ROW * LSIZE0 + 1];
|
||||
@ -249,19 +258,23 @@ __kernel void row_filter_C1_D0(__global const uchar * src, int src_step_in_pixel
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
// read pixels from lds and calculate the result
|
||||
sum = convert_float4(vload4(0,(__local uchar *)&LDS_DAT[l_y][l_x]+RADIUSX+offset)) * mat_kernel[RADIUSX];
|
||||
sum = convertDstVec(vload4(0,(__local uchar *)&LDS_DAT[l_y][l_x]+RADIUSX+offset)) * mat_kernel[RADIUSX];
|
||||
for (int i = 1; i <= RADIUSX; ++i)
|
||||
{
|
||||
temp[0] = vload4(0, (__local uchar*)&LDS_DAT[l_y][l_x] + RADIUSX + offset - i);
|
||||
temp[1] = vload4(0, (__local uchar*)&LDS_DAT[l_y][l_x] + RADIUSX + offset + i);
|
||||
sum += mad(convert_float4(temp[0]), mat_kernel[RADIUSX-i], convert_float4(temp[1]) * mat_kernel[RADIUSX + i]);
|
||||
#ifndef INTEGER_ARITHMETIC
|
||||
sum += mad(convertDstVec(temp[0]), mat_kernel[RADIUSX-i], convertDstVec(temp[1]) * mat_kernel[RADIUSX + i]);
|
||||
#else
|
||||
sum += mad24(convertDstVec(temp[0]), mat_kernel[RADIUSX-i], convertDstVec(temp[1]) * mat_kernel[RADIUSX + i]);
|
||||
#endif
|
||||
}
|
||||
|
||||
start_addr = mad24(y, dst_step_in_pixel, x);
|
||||
|
||||
// write the result to dst
|
||||
if ((x+3<dst_cols) & (y<dst_rows))
|
||||
*(__global float4*)&dst[start_addr] = sum;
|
||||
*(__global dstT4*)&dst[start_addr] = sum;
|
||||
else if ((x+2<dst_cols) && (y<dst_rows))
|
||||
{
|
||||
dst[start_addr] = sum.x;
|
||||
@ -355,7 +368,11 @@ __kernel void row_filter(__global const uchar * src, int src_step, int src_offse
|
||||
{
|
||||
temp[0] = LDS_DAT[l_y][l_x + RADIUSX - i];
|
||||
temp[1] = LDS_DAT[l_y][l_x + RADIUSX + i];
|
||||
#ifndef INTEGER_ARITHMETIC
|
||||
sum += mad(convertToDstT(temp[0]), mat_kernel[RADIUSX - i], convertToDstT(temp[1]) * mat_kernel[RADIUSX + i]);
|
||||
#else
|
||||
sum += mad24(convertToDstT(temp[0]), mat_kernel[RADIUSX - i], convertToDstT(temp[1]) * mat_kernel[RADIUSX + i]);
|
||||
#endif
|
||||
}
|
||||
|
||||
// write the result to dst
|
||||
|
@ -100,8 +100,8 @@
|
||||
// horizontal and vertical filter kernels
|
||||
// should be defined on host during compile time to avoid overhead
|
||||
#define DIG(a) a,
|
||||
__constant float mat_kernelX[] = { KERNEL_MATRIX_X };
|
||||
__constant float mat_kernelY[] = { KERNEL_MATRIX_Y };
|
||||
__constant WT1 mat_kernelX[] = { KERNEL_MATRIX_X };
|
||||
__constant WT1 mat_kernelY[] = { KERNEL_MATRIX_Y };
|
||||
|
||||
__kernel void sep_filter(__global uchar* Src, int src_step, int srcOffsetX, int srcOffsetY, int height, int width,
|
||||
__global uchar* Dst, int dst_step, int dst_offset, int dst_rows, int dst_cols, float delta)
|
||||
@ -124,8 +124,6 @@ __kernel void sep_filter(__global uchar* Src, int src_step, int srcOffsetX, int
|
||||
// calculate pixel position in source image taking image offset into account
|
||||
int srcX = x + srcOffsetX - RADIUSX;
|
||||
int srcY = y + srcOffsetY - RADIUSY;
|
||||
int xb = srcX;
|
||||
int yb = srcY;
|
||||
|
||||
// extrapolate coordinates, if needed
|
||||
// and read my own source pixel into local memory
|
||||
@ -159,12 +157,16 @@ __kernel void sep_filter(__global uchar* Src, int src_step, int srcOffsetX, int
|
||||
// do vertical filter pass
|
||||
// and store intermediate results to second local memory array
|
||||
int i, clocX = lix;
|
||||
WT sum = 0.0f;
|
||||
WT sum = (WT) 0;
|
||||
do
|
||||
{
|
||||
sum = 0.0f;
|
||||
sum = (WT) 0;
|
||||
for (i=0; i<=2*RADIUSY; i++)
|
||||
#ifndef INTEGER_ARITHMETIC
|
||||
sum = mad(lsmem[liy+i][clocX], mat_kernelY[i], sum);
|
||||
#else
|
||||
sum = mad24(lsmem[liy+i][clocX], mat_kernelY[i], sum);
|
||||
#endif
|
||||
lsmemDy[liy][clocX] = sum;
|
||||
clocX += BLK_X;
|
||||
}
|
||||
@ -180,7 +182,13 @@ __kernel void sep_filter(__global uchar* Src, int src_step, int srcOffsetX, int
|
||||
// and calculate final result
|
||||
sum = 0.0f;
|
||||
for (i=0; i<=2*RADIUSX; i++)
|
||||
#ifndef INTEGER_ARITHMETIC
|
||||
sum = mad(lsmemDy[liy][lix+i], mat_kernelX[i], sum);
|
||||
#else
|
||||
sum = mad24(lsmemDy[liy][lix+i], mat_kernelX[i], sum);
|
||||
|
||||
sum = (sum + (1 << (SHIFT_BITS-1))) >> SHIFT_BITS;
|
||||
#endif
|
||||
|
||||
// store result into destination image
|
||||
storepix(convertToDstT(sum + (WT)(delta)), Dst + mad24(y, dst_step, mad24(x, DSTSIZE, dst_offset)));
|
||||
|
@ -219,7 +219,7 @@ OCL_TEST_P(GaussianBlurTest, Mat)
|
||||
OCL_OFF(cv::GaussianBlur(src_roi, dst_roi, Size(ksize, ksize), sigma1, sigma2, borderType));
|
||||
OCL_ON(cv::GaussianBlur(usrc_roi, udst_roi, Size(ksize, ksize), sigma1, sigma2, borderType));
|
||||
|
||||
Near(CV_MAT_DEPTH(type) == CV_8U ? 3 : 5e-5, false);
|
||||
Near(CV_MAT_DEPTH(type) >= CV_32F ? 5e-5 : 1, false);
|
||||
}
|
||||
}
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user