generalized OpenCL version of cv::sepFilter2D; removed some restrictions and added 3-channels support

This commit is contained in:
Ilya Lavrenov 2014-03-19 18:49:33 +04:00
parent b449b0bf71
commit 291458a859
6 changed files with 112 additions and 89 deletions

View File

@ -4317,8 +4317,8 @@ String kernelToStr(InputArray _kernel, int ddepth, const char * name)
if (ddepth != depth)
kernel.convertTo(kernel, ddepth);
typedef std::string (*func_t)(const Mat &);
static const func_t funcs[] = { kerToStr<uchar>, kerToStr<char>, kerToStr<ushort>,kerToStr<short>,
typedef std::string (* func_t)(const Mat &);
static const func_t funcs[] = { kerToStr<uchar>, kerToStr<char>, kerToStr<ushort>, kerToStr<short>,
kerToStr<int>, kerToStr<float>, kerToStr<double>, 0 };
const func_t func = funcs[depth];
CV_Assert(func != 0);

View File

@ -41,7 +41,6 @@
//M*/
#include "precomp.hpp"
#define CV_OPENCL_RUN_ASSERT
#include "opencl_kernels.hpp"
#include <sstream>
@ -3135,7 +3134,7 @@ template<typename ST, class CastOp, class VecOp> struct Filter2D : public BaseFi
// b e h b e h 0 0
// c f i c f i 0 0
template <typename T>
static int _prepareKernelFilter2D(std::vector<T>& data, const Mat &kernel)
static int _prepareKernelFilter2D(std::vector<T> & data, const Mat & kernel)
{
Mat _kernel; kernel.convertTo(_kernel, DataDepth<T>::value);
int size_y_aligned = ROUNDUP(kernel.rows * 2, 4);
@ -3318,11 +3317,16 @@ static bool ocl_filter2D( InputArray _src, OutputArray _dst, int ddepth,
return kernel.run(2, globalsize, localsize, true);
}
static bool ocl_sepRowFilter2D( UMat &src, UMat &buf, Mat &kernelX, int anchor, int borderType)
static bool ocl_sepRowFilter2D(const UMat & src, UMat & buf, const Mat & kernelX, int anchor,
int borderType, int ddepth, bool fast8uc1)
{
int type = src.type(), cn = CV_MAT_CN(type), sdepth = CV_MAT_DEPTH(type);
bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0;
Size bufSize = buf.size();
if (!doubleSupport && (sdepth == CV_64F || ddepth == CV_64F))
return false;
#ifdef ANDROID
size_t localsize[2] = {16, 10};
#else
@ -3330,7 +3334,7 @@ static bool ocl_sepRowFilter2D( UMat &src, UMat &buf, Mat &kernelX, int anchor,
#endif
size_t globalsize[2] = {DIVUP(bufSize.width, localsize[0]) * localsize[0], DIVUP(bufSize.height, localsize[1]) * localsize[1]};
if (type == CV_8UC1)
if (fast8uc1)
globalsize[0] = DIVUP((bufSize.width + 3) >> 2, localsize[0]) * localsize[0];
int radiusX = anchor, radiusY = (buf.rows - src.rows) >> 1;
@ -3346,20 +3350,21 @@ static bool ocl_sepRowFilter2D( UMat &src, UMat &buf, Mat &kernelX, int anchor,
char cvt[40];
cv::String build_options = cv::format("-D RADIUSX=%d -D LSIZE0=%d -D LSIZE1=%d -D CN=%d -D %s -D %s -D %s"
" -D srcT=%s -D dstT=%s -D convertToDstT=%s -D srcT1=%s -D dstT1=%s",
" -D srcT=%s -D dstT=%s -D convertToDstT=%s -D srcT1=%s -D dstT1=%s%s",
radiusX, (int)localsize[0], (int)localsize[1], cn, btype,
extra_extrapolation ? "EXTRA_EXTRAPOLATION" : "NO_EXTRA_EXTRAPOLATION",
isolated ? "BORDER_ISOLATED" : "NO_BORDER_ISOLATED",
ocl::typeToStr(type), ocl::typeToStr(CV_32FC(cn)),
ocl::convertTypeStr(sdepth, CV_32F, cn, cvt),
ocl::typeToStr(sdepth), ocl::typeToStr(CV_32F));
ocl::typeToStr(sdepth), ocl::typeToStr(CV_32F),
doubleSupport ? " -D DOUBLE_SUPPORT" : "");
build_options += ocl::kernelToStr(kernelX, CV_32F);
Size srcWholeSize; Point srcOffset;
src.locateROI(srcWholeSize, srcOffset);
String kernelName("row_filter");
if (type == CV_8UC1)
if (fast8uc1)
kernelName += "_C1_D0";
ocl::Kernel k(kernelName.c_str(), cv::ocl::imgproc::filterSepRow_oclsrc,
@ -3367,39 +3372,47 @@ static bool ocl_sepRowFilter2D( UMat &src, UMat &buf, Mat &kernelX, int anchor,
if (k.empty())
return false;
k.args(ocl::KernelArg::PtrReadOnly(src), (int)(src.step / src.elemSize()), srcOffset.x,
srcOffset.y, src.cols, src.rows, srcWholeSize.width, srcWholeSize.height,
ocl::KernelArg::PtrWriteOnly(buf), (int)(buf.step / buf.elemSize()),
buf.cols, buf.rows, radiusY);
if (fast8uc1)
k.args(ocl::KernelArg::PtrReadOnly(src), (int)(src.step / src.elemSize()), srcOffset.x,
srcOffset.y, src.cols, src.rows, srcWholeSize.width, srcWholeSize.height,
ocl::KernelArg::PtrWriteOnly(buf), (int)(buf.step / buf.elemSize()),
buf.cols, buf.rows, radiusY);
else
k.args(ocl::KernelArg::PtrReadOnly(src), (int)src.step, srcOffset.x,
srcOffset.y, src.cols, src.rows, srcWholeSize.width, srcWholeSize.height,
ocl::KernelArg::PtrWriteOnly(buf), (int)buf.step, buf.cols, buf.rows, radiusY);
return k.run(2, globalsize, localsize, false);
}
static bool ocl_sepColFilter2D(const UMat &buf, UMat &dst, Mat &kernelY, int anchor)
static bool ocl_sepColFilter2D(const UMat & buf, UMat & dst, const Mat & kernelY, int anchor)
{
bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0;
if (dst.depth() == CV_64F && !doubleSupport)
return false;
#ifdef ANDROID
size_t localsize[2] = {16, 10};
size_t localsize[2] = { 16, 10 };
#else
size_t localsize[2] = {16, 16};
size_t localsize[2] = { 16, 16 };
#endif
size_t globalsize[2] = {0, 0};
size_t globalsize[2] = { 0, 0 };
int dtype = dst.type(), cn = CV_MAT_CN(dtype), ddepth = CV_MAT_DEPTH(dtype);
Size sz = dst.size();
globalsize[1] = DIVUP(sz.height, localsize[1]) * localsize[1];
if (dtype == CV_8UC2)
globalsize[0] = DIVUP((sz.width + 1) / 2, localsize[0]) * localsize[0];
else
globalsize[0] = DIVUP(sz.width, localsize[0]) * localsize[0];
globalsize[0] = DIVUP(sz.width, localsize[0]) * localsize[0];
char cvt[40];
cv::String build_options = cv::format("-D RADIUSY=%d -D LSIZE0=%d -D LSIZE1=%d -D CN=%d"
" -D srcT=%s -D dstT=%s -D convertToDstT=%s",
" -D srcT=%s -D dstT=%s -D convertToDstT=%s"
" -D srcT1=%s -D dstT1=%s%s",
anchor, (int)localsize[0], (int)localsize[1], cn,
ocl::typeToStr(buf.type()), ocl::typeToStr(dtype),
ocl::convertTypeStr(CV_32F, ddepth, cn, cvt));
ocl::convertTypeStr(CV_32F, ddepth, cn, cvt),
ocl::typeToStr(CV_32F), ocl::typeToStr(ddepth),
doubleSupport ? " -D DOUBLE_SUPPORT" : "");
build_options += ocl::kernelToStr(kernelY, CV_32F);
ocl::Kernel k("col_filter", cv::ocl::imgproc::filterSepCol_oclsrc,
@ -3407,13 +3420,13 @@ static bool ocl_sepColFilter2D(const UMat &buf, UMat &dst, Mat &kernelY, int anc
if (k.empty())
return false;
k.args(ocl::KernelArg::PtrReadOnly(buf), (int)(buf.step / buf.elemSize()), buf.cols,
buf.rows, ocl::KernelArg::PtrWriteOnly(dst), (int)(dst.offset / dst.elemSize()),
(int)(dst.step / dst.elemSize()), dst.cols, dst.rows);
k.args(ocl::KernelArg::ReadOnly(buf), ocl::KernelArg::WriteOnly(dst));
return k.run(2, globalsize, localsize, false);
}
#if 0
const int optimizedSepFilterLocalSize = 16;
static bool ocl_sepFilter2D_SinglePass(InputArray _src, OutputArray _dst,
@ -3471,18 +3484,19 @@ static bool ocl_sepFilter2D_SinglePass(InputArray _src, OutputArray _dst,
return k.run(2, gt2, lt2, false);
}
#endif
static bool ocl_sepFilter2D( InputArray _src, OutputArray _dst, int ddepth,
InputArray _kernelX, InputArray _kernelY, Point anchor,
double delta, int borderType )
{
Size imgSize = _src.size();
// Size imgSize = _src.size();
if (abs(delta)> FLT_MIN)
return false;
int type = _src.type(), cn = CV_MAT_CN(type);
if ( !( (type == CV_8UC1 || type == CV_8UC4 || type == CV_32FC1 || type == CV_32FC4) &&
(ddepth == CV_32F || ddepth == CV_16S || ddepth == CV_8U || ddepth < 0) ) )
if (cn > 4)
return false;
Mat kernelX = _kernelX.getMat().reshape(1, 1);
@ -3501,9 +3515,6 @@ static bool ocl_sepFilter2D( InputArray _src, OutputArray _dst, int ddepth,
if (ddepth < 0)
ddepth = sdepth;
// printf("%d %d\n", imgSize.width, optimizedSepFilterLocalSize + (kernelX.rows >> 1));
// printf("%d %d\n", imgSize.height, optimizedSepFilterLocalSize + (kernelY.rows >> 1));
// CV_OCL_RUN_(kernelY.rows <= 21 && kernelX.rows <= 21 &&
// imgSize.width > optimizedSepFilterLocalSize + (kernelX.rows >> 1) &&
// imgSize.height > optimizedSepFilterLocalSize + (kernelY.rows >> 1),
@ -3512,20 +3523,19 @@ static bool ocl_sepFilter2D( InputArray _src, OutputArray _dst, int ddepth,
UMat src = _src.getUMat();
Size srcWholeSize; Point srcOffset;
src.locateROI(srcWholeSize, srcOffset);
if ( (0 != (srcOffset.x % 4)) ||
(0 != (src.cols % 4)) ||
(0 != ((src.step / src.elemSize()) % 4))
)
return false;
bool fast8uc1 = type == CV_8UC1 && srcOffset.x % 4 == 0 &&
src.cols % 4 == 0 && src.step % 4 == 0;
Size srcSize = src.size();
Size bufSize(srcSize.width, srcSize.height + kernelY.cols - 1);
UMat buf; buf.create(bufSize, CV_MAKETYPE(CV_32F, cn));
if (!ocl_sepRowFilter2D(src, buf, kernelX, anchor.x, borderType))
UMat buf(bufSize, CV_32FC(cn));
if (!ocl_sepRowFilter2D(src, buf, kernelX, anchor.x, borderType, ddepth, fast8uc1))
return false;
_dst.create(srcSize, CV_MAKETYPE(ddepth, cn));
UMat dst = _dst.getUMat();
return ocl_sepColFilter2D(buf, dst, kernelY, anchor.y);
}

View File

@ -34,29 +34,36 @@
//
//
#ifdef DOUBLE_SUPPORT
#ifdef cl_amd_fp64
#pragma OPENCL EXTENSION cl_amd_fp64:enable
#elif defined (cl_khr_fp64)
#pragma OPENCL EXTENSION cl_khr_fp64:enable
#endif
#endif
#define READ_TIMES_COL ((2*(RADIUSY+LSIZE1)-1)/LSIZE1)
#define RADIUS 1
#define noconvert
/**********************************************************************************
These kernels are written for separable filters such as Sobel, Scharr, GaussianBlur.
Now(6/29/2011) the kernels only support 8U data type and the anchor of the convovle
kernel must be in the center. ROI is not supported either.
Each kernels read 4 elements(not 4 pixels), save them to LDS and read the data needed
from LDS to calculate the result.
The length of the convovle kernel supported is only related to the MAX size of LDS,
which is HW related.
Niko
6/29/2011
The info above maybe obsolete.
***********************************************************************************/
#if CN != 3
#define loadpix(addr) *(__global const srcT *)(addr)
#define storepix(val, addr) *(__global dstT *)(addr) = val
#define SRCSIZE (int)sizeof(srcT)
#define DSTSIZE (int)sizeof(dstT)
#else
#define loadpix(addr) vload3(0, (__global const srcT1 *)(addr))
#define storepix(val, addr) vstore3(val, 0, (__global dstT1 *)(addr))
#define SRCSIZE (int)sizeof(srcT1)*3
#define DSTSIZE (int)sizeof(dstT1)*3
#endif
#define DIG(a) a,
__constant float mat_kernel[] = { COEFF };
__kernel void col_filter(__global const srcT * src, int src_step_in_pixel, int src_whole_cols, int src_whole_rows,
__global dstT * dst, int dst_offset_in_pixel, int dst_step_in_pixel, int dst_cols, int dst_rows)
__kernel void col_filter(__global const uchar * src, int src_step, int src_offset, int src_whole_rows, int src_whole_cols,
__global uchar * dst, int dst_step, int dst_offset, int dst_rows, int dst_cols)
{
int x = get_global_id(0);
int y = get_global_id(1);
@ -64,8 +71,8 @@ __kernel void col_filter(__global const srcT * src, int src_step_in_pixel, int s
int l_x = get_local_id(0);
int l_y = get_local_id(1);
int start_addr = mad24(y, src_step_in_pixel, x);
int end_addr = mad24(src_whole_rows - 1, src_step_in_pixel, src_whole_cols);
int start_addr = mad24(y, src_step, x * SRCSIZE);
int end_addr = mad24(src_whole_rows - 1, src_step, src_whole_cols * SRCSIZE);
srcT sum, temp[READ_TIMES_COL];
__local srcT LDS_DAT[LSIZE1 * READ_TIMES_COL][LSIZE0 + 1];
@ -73,9 +80,9 @@ __kernel void col_filter(__global const srcT * src, int src_step_in_pixel, int s
// read pixels from src
for (int i = 0; i < READ_TIMES_COL; ++i)
{
int current_addr = mad24(i, LSIZE1 * src_step_in_pixel, start_addr);
int current_addr = mad24(i, LSIZE1 * src_step, start_addr);
current_addr = current_addr < end_addr ? current_addr : 0;
temp[i] = src[current_addr];
temp[i] = loadpix(src + current_addr);
}
// save pixels to lds
@ -95,7 +102,7 @@ __kernel void col_filter(__global const srcT * src, int src_step_in_pixel, int s
// write the result to dst
if (x < dst_cols && y < dst_rows)
{
start_addr = mad24(y, dst_step_in_pixel, x + dst_offset_in_pixel);
dst[start_addr] = convertToDstT(sum);
start_addr = mad24(y, dst_step, mad24(DSTSIZE, x, dst_offset));
storepix(convertToDstT(sum), dst + start_addr);
}
}

View File

@ -34,6 +34,14 @@
//
//
#ifdef DOUBLE_SUPPORT
#ifdef cl_amd_fp64
#pragma OPENCL EXTENSION cl_amd_fp64:enable
#elif defined (cl_khr_fp64)
#pragma OPENCL EXTENSION cl_khr_fp64:enable
#endif
#endif
#define READ_TIMES_ROW ((2*(RADIUSX+LSIZE0)-1)/LSIZE0) //for c4 only
#define RADIUS 1
@ -117,16 +125,16 @@
#define noconvert
#if cn != 3
#if CN != 3
#define loadpix(addr) *(__global const srcT *)(addr)
#define storepix(val, addr) *(__global dstT *)(addr) = val
#define SRCSIZE ((int)sizeof(srcT))
#define DSTSIZE ((int)sizeof(dstT))
#define SRCSIZE (int)sizeof(srcT)
#define DSTSIZE (int)sizeof(dstT)
#else
#define loadpix(addr) vload3(0, (__global const srcT1 *)(addr))
#define storepix(val, addr) vstore3(val, 0, (__global dstT1 *)(addr))
#define SRCSIZE ((int)sizeof(srcT1)*3)
#define DSTSIZE ((int)sizeof(dstT1)*3)
#define SRCSIZE (int)sizeof(srcT1)*3
#define DSTSIZE (int)sizeof(dstT1)*3
#endif
#define DIG(a) a,
@ -269,32 +277,33 @@ __kernel void row_filter_C1_D0(__global const uchar * src, int src_step_in_pixel
dst[start_addr] = sum.x;
}
__kernel void row_filter(__global const srcT * src, int src_step_in_pixel, int src_offset_x, int src_offset_y,
__kernel void row_filter(__global const uchar * src, int src_step, int src_offset_x, int src_offset_y,
int src_cols, int src_rows, int src_whole_cols, int src_whole_rows,
__global dstT * dst, int dst_step_in_pixel, int dst_cols, int dst_rows,
__global uchar * dst, int dst_step, int dst_cols, int dst_rows,
int radiusy)
{
int x = get_global_id(0);
int y = get_global_id(1);
int l_x = get_local_id(0);
int l_y = get_local_id(1);
int start_x = x + src_offset_x - RADIUSX;
int start_y = y + src_offset_y - radiusy;
int start_addr = mad24(start_y, src_step_in_pixel, start_x);
int start_addr = mad24(start_y, src_step, start_x * SRCSIZE);
dstT sum;
srcT temp[READ_TIMES_ROW];
__local srcT LDS_DAT[LSIZE1][READ_TIMES_ROW * LSIZE0 + 1];
#ifdef BORDER_CONSTANT
int end_addr = mad24(src_whole_rows - 1, src_step_in_pixel, src_whole_cols);
int end_addr = mad24(src_whole_rows - 1, src_step, src_whole_cols * SRCSIZE);
// read pixels from src
for (int i = 0; i < READ_TIMES_ROW; i++)
{
int current_addr = mad24(i, LSIZE0, start_addr);
current_addr = current_addr < end_addr && current_addr > 0 ? current_addr : 0;
temp[i] = src[current_addr];
int current_addr = mad24(i, LSIZE0 * SRCSIZE, start_addr);
current_addr = current_addr < end_addr && current_addr >= 0 ? current_addr : 0;
temp[i] = loadpix(src + current_addr);
}
// judge if read out of boundary
@ -312,8 +321,7 @@ __kernel void row_filter(__global const srcT * src, int src_step_in_pixel, int s
}
#endif
#else
int index[READ_TIMES_ROW];
int s_x, s_y;
int index[READ_TIMES_ROW], s_x, s_y;
// judge if read out of boundary
for (int i = 0; i < READ_TIMES_ROW; ++i)
@ -328,12 +336,12 @@ __kernel void row_filter(__global const srcT * src, int src_step_in_pixel, int s
EXTRAPOLATE(s_x, 0, src_whole_cols);
EXTRAPOLATE(s_y, 0, src_whole_rows);
#endif
index[i] = mad24(s_y, src_step_in_pixel, s_x);
index[i] = mad24(s_y, src_step, s_x * SRCSIZE);
}
// read pixels from src
for (int i = 0; i < READ_TIMES_ROW; ++i)
temp[i] = src[index[i]];
temp[i] = loadpix(src + index[i]);
#endif // BORDER_CONSTANT
// save pixels to lds
@ -349,10 +357,11 @@ __kernel void row_filter(__global const srcT * src, int src_step_in_pixel, int s
temp[1] = LDS_DAT[l_y][l_x + RADIUSX + i];
sum += mad(convertToDstT(temp[0]), mat_kernel[RADIUSX - i], convertToDstT(temp[1]) * mat_kernel[RADIUSX + i]);
}
// write the result to dst
if (x < dst_cols && y < dst_rows)
{
start_addr = mad24(y, dst_step_in_pixel, x);
dst[start_addr] = sum;
start_addr = mad24(y, dst_step, x * DSTSIZE);
storepix(sum, dst + start_addr);
}
}

View File

@ -312,7 +312,7 @@ OCL_TEST_P(MorphologyEx, Mat)
(int)BORDER_REFLECT|BORDER_ISOLATED, (int)BORDER_WRAP|BORDER_ISOLATED, \
(int)BORDER_REFLECT_101|BORDER_ISOLATED*/) // WRAP and ISOLATED are not supported by cv:: version
#define FILTER_TYPES Values(CV_8UC1, CV_8UC2, CV_8UC4, CV_32FC1, CV_32FC4, CV_64FC1, CV_64FC4)
#define FILTER_TYPES Values(CV_8UC1, CV_8UC3, CV_8UC4, CV_32FC1, CV_32FC3, CV_32FC4)
OCL_INSTANTIATE_TEST_CASE_P(Filter, Bilateral, Combine(
Values((MatType)CV_8UC1),

View File

@ -75,9 +75,9 @@ PARAM_TEST_CASE(SepFilter2D, MatDepth, Channels, BorderType, bool, bool)
void random_roi()
{
Size ksize = randomSize(kernelMinSize, kernelMaxSize);
if (1 != (ksize.width % 2))
if (1 != ksize.width % 2)
ksize.width++;
if (1 != (ksize.height % 2))
if (1 != ksize.height % 2)
ksize.height++;
Mat temp = randomMat(Size(ksize.width, 1), CV_MAKE_TYPE(CV_32F, 1), -MAX_VALUE, MAX_VALUE);
@ -86,24 +86,22 @@ PARAM_TEST_CASE(SepFilter2D, MatDepth, Channels, BorderType, bool, bool)
cv::normalize(temp, kernelY, 1.0, 0.0, NORM_L1);
Size roiSize = randomSize(ksize.width + 16, MAX_VALUE, ksize.height + 20, MAX_VALUE);
std::cout << roiSize << std::endl;
int rest = roiSize.width % 4;
if (0 != rest)
if (rest != 0)
roiSize.width += (4 - rest);
Border srcBorder = randomBorder(0, useRoi ? MAX_VALUE : 0);
rest = srcBorder.lef % 4;
if (0 != rest)
if (rest != 0)
srcBorder.lef += (4 - rest);
rest = srcBorder.rig % 4;
if (0 != rest)
if (rest != 0)
srcBorder.rig += (4 - rest);
randomSubMat(src, src_roi, roiSize, srcBorder, type, -MAX_VALUE, MAX_VALUE);
Border dstBorder = randomBorder(0, useRoi ? MAX_VALUE : 0);
randomSubMat(dst, dst_roi, roiSize, dstBorder, type, -MAX_VALUE, MAX_VALUE);
anchor.x = -1;
anchor.y = -1;
anchor.x = anchor.y = -1;
UMAT_UPLOAD_INPUT_PARAMETER(src)
UMAT_UPLOAD_OUTPUT_PARAMETER(dst)
@ -128,11 +126,10 @@ OCL_TEST_P(SepFilter2D, Mat)
}
}
OCL_INSTANTIATE_TEST_CASE_P(ImageProc, SepFilter2D,
Combine(
Values(CV_8U, CV_32F),
Values(1, 4),
OCL_ALL_CHANNELS,
Values(
(BorderType)BORDER_CONSTANT,
(BorderType)BORDER_REPLICATE,