Enabled integer arithmetic for filterSepSinglePass
This commit is contained in:
parent
fc10ffefb8
commit
c747426fc1
@ -3388,12 +3388,12 @@ const int optimizedSepFilterLocalSize = 16;
|
|||||||
|
|
||||||
static bool ocl_sepFilter2D_SinglePass(InputArray _src, OutputArray _dst,
|
static bool ocl_sepFilter2D_SinglePass(InputArray _src, OutputArray _dst,
|
||||||
Mat row_kernel, Mat col_kernel,
|
Mat row_kernel, Mat col_kernel,
|
||||||
double delta, int borderType, int ddepth)
|
double delta, int borderType, int ddepth, int bdepth, bool int_arithm)
|
||||||
{
|
{
|
||||||
Size size = _src.size(), wholeSize;
|
Size size = _src.size(), wholeSize;
|
||||||
Point origin;
|
Point origin;
|
||||||
int stype = _src.type(), sdepth = CV_MAT_DEPTH(stype), cn = CV_MAT_CN(stype),
|
int stype = _src.type(), sdepth = CV_MAT_DEPTH(stype), cn = CV_MAT_CN(stype),
|
||||||
esz = CV_ELEM_SIZE(stype), wdepth = std::max(std::max(sdepth, ddepth), CV_32F),
|
esz = CV_ELEM_SIZE(stype), wdepth = std::max(std::max(sdepth, ddepth), bdepth),
|
||||||
dtype = CV_MAKE_TYPE(ddepth, cn);
|
dtype = CV_MAKE_TYPE(ddepth, cn);
|
||||||
size_t src_step = _src.step(), src_offset = _src.offset();
|
size_t src_step = _src.step(), src_offset = _src.offset();
|
||||||
bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0;
|
bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0;
|
||||||
@ -3413,14 +3413,15 @@ static bool ocl_sepFilter2D_SinglePass(InputArray _src, OutputArray _dst,
|
|||||||
|
|
||||||
String opts = cv::format("-D BLK_X=%d -D BLK_Y=%d -D RADIUSX=%d -D RADIUSY=%d%s%s"
|
String opts = cv::format("-D BLK_X=%d -D BLK_Y=%d -D RADIUSX=%d -D RADIUSY=%d%s%s"
|
||||||
" -D srcT=%s -D convertToWT=%s -D WT=%s -D dstT=%s -D convertToDstT=%s"
|
" -D srcT=%s -D convertToWT=%s -D WT=%s -D dstT=%s -D convertToDstT=%s"
|
||||||
" -D %s -D srcT1=%s -D dstT1=%s -D CN=%d", (int)lt2[0], (int)lt2[1],
|
" -D %s -D srcT1=%s -D dstT1=%s -D CN=%d -D SHIFT_BITS=%d%s",
|
||||||
row_kernel.cols / 2, col_kernel.cols / 2,
|
(int)lt2[0], (int)lt2[1], row_kernel.cols / 2, col_kernel.cols / 2,
|
||||||
ocl::kernelToStr(row_kernel, CV_32F, "KERNEL_MATRIX_X").c_str(),
|
ocl::kernelToStr(row_kernel, wdepth, "KERNEL_MATRIX_X").c_str(),
|
||||||
ocl::kernelToStr(col_kernel, CV_32F, "KERNEL_MATRIX_Y").c_str(),
|
ocl::kernelToStr(col_kernel, wdepth, "KERNEL_MATRIX_Y").c_str(),
|
||||||
ocl::typeToStr(stype), ocl::convertTypeStr(sdepth, wdepth, cn, cvt[0]),
|
ocl::typeToStr(stype), ocl::convertTypeStr(sdepth, wdepth, cn, cvt[0]),
|
||||||
ocl::typeToStr(CV_MAKE_TYPE(wdepth, cn)), ocl::typeToStr(dtype),
|
ocl::typeToStr(CV_MAKE_TYPE(wdepth, cn)), ocl::typeToStr(dtype),
|
||||||
ocl::convertTypeStr(wdepth, ddepth, cn, cvt[1]), borderMap[borderType],
|
ocl::convertTypeStr(wdepth, ddepth, cn, cvt[1]), borderMap[borderType],
|
||||||
ocl::typeToStr(sdepth), ocl::typeToStr(ddepth), cn);
|
ocl::typeToStr(sdepth), ocl::typeToStr(ddepth), cn, 2*shift_bits,
|
||||||
|
int_arithm ? " -D INTEGER_ARITHMETIC" : "");
|
||||||
|
|
||||||
ocl::Kernel k("sep_filter", ocl::imgproc::filterSep_singlePass_oclsrc, opts);
|
ocl::Kernel k("sep_filter", ocl::imgproc::filterSep_singlePass_oclsrc, opts);
|
||||||
if (k.empty())
|
if (k.empty())
|
||||||
@ -3485,14 +3486,14 @@ static bool ocl_sepFilter2D( InputArray _src, OutputArray _dst, int ddepth,
|
|||||||
int_arithm = true;
|
int_arithm = true;
|
||||||
}
|
}
|
||||||
|
|
||||||
CV_OCL_RUN_(kernelY.cols <= 21 && kernelX.cols <= 21 && !int_arithm &&
|
CV_OCL_RUN_(kernelY.cols <= 21 && kernelX.cols <= 21 &&
|
||||||
imgSize.width > optimizedSepFilterLocalSize + anchor.x &&
|
imgSize.width > optimizedSepFilterLocalSize + anchor.x &&
|
||||||
imgSize.height > optimizedSepFilterLocalSize + anchor.y &&
|
imgSize.height > optimizedSepFilterLocalSize + anchor.y &&
|
||||||
(!(borderType & BORDER_ISOLATED) || _src.offset() == 0) &&
|
(!(borderType & BORDER_ISOLATED) || _src.offset() == 0) &&
|
||||||
anchor == Point(kernelX.cols >> 1, kernelY.cols >> 1) &&
|
anchor == Point(kernelX.cols >> 1, kernelY.cols >> 1) &&
|
||||||
(d.isIntel() || (d.isAMD() && !d.hostUnifiedMemory())),
|
(d.isIntel() || (d.isAMD() && !d.hostUnifiedMemory())),
|
||||||
ocl_sepFilter2D_SinglePass(_src, _dst, kernelX, kernelY, delta,
|
ocl_sepFilter2D_SinglePass(_src, _dst, kernelX, kernelY, delta,
|
||||||
borderType & ~BORDER_ISOLATED, ddepth), true)
|
borderType & ~BORDER_ISOLATED, ddepth, bdepth, int_arithm), true)
|
||||||
|
|
||||||
UMat src = _src.getUMat();
|
UMat src = _src.getUMat();
|
||||||
Size srcWholeSize; Point srcOffset;
|
Size srcWholeSize; Point srcOffset;
|
||||||
|
@ -100,8 +100,8 @@
|
|||||||
// horizontal and vertical filter kernels
|
// horizontal and vertical filter kernels
|
||||||
// should be defined on host during compile time to avoid overhead
|
// should be defined on host during compile time to avoid overhead
|
||||||
#define DIG(a) a,
|
#define DIG(a) a,
|
||||||
__constant float mat_kernelX[] = { KERNEL_MATRIX_X };
|
__constant WT mat_kernelX[] = { KERNEL_MATRIX_X };
|
||||||
__constant float mat_kernelY[] = { KERNEL_MATRIX_Y };
|
__constant WT mat_kernelY[] = { KERNEL_MATRIX_Y };
|
||||||
|
|
||||||
__kernel void sep_filter(__global uchar* Src, int src_step, int srcOffsetX, int srcOffsetY, int height, int width,
|
__kernel void sep_filter(__global uchar* Src, int src_step, int srcOffsetX, int srcOffsetY, int height, int width,
|
||||||
__global uchar* Dst, int dst_step, int dst_offset, int dst_rows, int dst_cols, float delta)
|
__global uchar* Dst, int dst_step, int dst_offset, int dst_rows, int dst_cols, float delta)
|
||||||
@ -159,12 +159,16 @@ __kernel void sep_filter(__global uchar* Src, int src_step, int srcOffsetX, int
|
|||||||
// do vertical filter pass
|
// do vertical filter pass
|
||||||
// and store intermediate results to second local memory array
|
// and store intermediate results to second local memory array
|
||||||
int i, clocX = lix;
|
int i, clocX = lix;
|
||||||
WT sum = 0.0f;
|
WT sum = (WT) 0;
|
||||||
do
|
do
|
||||||
{
|
{
|
||||||
sum = 0.0f;
|
sum = (WT) 0;
|
||||||
for (i=0; i<=2*RADIUSY; i++)
|
for (i=0; i<=2*RADIUSY; i++)
|
||||||
|
#ifndef INTEGER_ARITHMETIC
|
||||||
sum = mad(lsmem[liy+i][clocX], mat_kernelY[i], sum);
|
sum = mad(lsmem[liy+i][clocX], mat_kernelY[i], sum);
|
||||||
|
#else
|
||||||
|
sum = mad24(lsmem[liy+i][clocX], mat_kernelY[i], sum);
|
||||||
|
#endif
|
||||||
lsmemDy[liy][clocX] = sum;
|
lsmemDy[liy][clocX] = sum;
|
||||||
clocX += BLK_X;
|
clocX += BLK_X;
|
||||||
}
|
}
|
||||||
@ -180,8 +184,13 @@ __kernel void sep_filter(__global uchar* Src, int src_step, int srcOffsetX, int
|
|||||||
// and calculate final result
|
// and calculate final result
|
||||||
sum = 0.0f;
|
sum = 0.0f;
|
||||||
for (i=0; i<=2*RADIUSX; i++)
|
for (i=0; i<=2*RADIUSX; i++)
|
||||||
|
#ifndef INTEGER_ARITHMETIC
|
||||||
sum = mad(lsmemDy[liy][lix+i], mat_kernelX[i], sum);
|
sum = mad(lsmemDy[liy][lix+i], mat_kernelX[i], sum);
|
||||||
|
#else
|
||||||
|
sum = mad24(lsmemDy[liy][lix+i], mat_kernelX[i], sum);
|
||||||
|
|
||||||
|
sum = (sum + (1 << (SHIFT_BITS-1))) >> SHIFT_BITS;
|
||||||
|
#endif
|
||||||
// store result into destination image
|
// store result into destination image
|
||||||
storepix(convertToDstT(sum + (WT)(delta)), Dst + mad24(y, dst_step, mad24(x, DSTSIZE, dst_offset)));
|
storepix(convertToDstT(sum + (WT)(delta)), Dst + mad24(y, dst_step, mad24(x, DSTSIZE, dst_offset)));
|
||||||
}
|
}
|
||||||
|
Loading…
x
Reference in New Issue
Block a user