diff --git a/modules/core/src/ocl.cpp b/modules/core/src/ocl.cpp index b56f84c16..3a7c718d4 100644 --- a/modules/core/src/ocl.cpp +++ b/modules/core/src/ocl.cpp @@ -4317,8 +4317,8 @@ String kernelToStr(InputArray _kernel, int ddepth, const char * name) if (ddepth != depth) kernel.convertTo(kernel, ddepth); - typedef std::string (*func_t)(const Mat &); - static const func_t funcs[] = { kerToStr, kerToStr, kerToStr,kerToStr, + typedef std::string (* func_t)(const Mat &); + static const func_t funcs[] = { kerToStr, kerToStr, kerToStr, kerToStr, kerToStr, kerToStr, kerToStr, 0 }; const func_t func = funcs[depth]; CV_Assert(func != 0); diff --git a/modules/imgproc/src/filter.cpp b/modules/imgproc/src/filter.cpp index ba2e347af..c013a9b16 100644 --- a/modules/imgproc/src/filter.cpp +++ b/modules/imgproc/src/filter.cpp @@ -41,7 +41,6 @@ //M*/ #include "precomp.hpp" -#define CV_OPENCL_RUN_ASSERT #include "opencl_kernels.hpp" #include @@ -3135,7 +3134,7 @@ template struct Filter2D : public BaseFi // b e h b e h 0 0 // c f i c f i 0 0 template -static int _prepareKernelFilter2D(std::vector& data, const Mat &kernel) +static int _prepareKernelFilter2D(std::vector & data, const Mat & kernel) { Mat _kernel; kernel.convertTo(_kernel, DataDepth::value); int size_y_aligned = ROUNDUP(kernel.rows * 2, 4); @@ -3318,11 +3317,16 @@ static bool ocl_filter2D( InputArray _src, OutputArray _dst, int ddepth, return kernel.run(2, globalsize, localsize, true); } -static bool ocl_sepRowFilter2D( UMat &src, UMat &buf, Mat &kernelX, int anchor, int borderType) +static bool ocl_sepRowFilter2D(const UMat & src, UMat & buf, const Mat & kernelX, int anchor, + int borderType, int ddepth, bool fast8uc1) { int type = src.type(), cn = CV_MAT_CN(type), sdepth = CV_MAT_DEPTH(type); + bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0; Size bufSize = buf.size(); + if (!doubleSupport && (sdepth == CV_64F || ddepth == CV_64F)) + return false; + #ifdef ANDROID size_t localsize[2] = {16, 10}; #else @@ -3330,7 +3334,7 @@ static bool ocl_sepRowFilter2D( UMat &src, UMat &buf, Mat &kernelX, int anchor, #endif size_t globalsize[2] = {DIVUP(bufSize.width, localsize[0]) * localsize[0], DIVUP(bufSize.height, localsize[1]) * localsize[1]}; - if (type == CV_8UC1) + if (fast8uc1) globalsize[0] = DIVUP((bufSize.width + 3) >> 2, localsize[0]) * localsize[0]; int radiusX = anchor, radiusY = (buf.rows - src.rows) >> 1; @@ -3346,20 +3350,21 @@ static bool ocl_sepRowFilter2D( UMat &src, UMat &buf, Mat &kernelX, int anchor, char cvt[40]; cv::String build_options = cv::format("-D RADIUSX=%d -D LSIZE0=%d -D LSIZE1=%d -D CN=%d -D %s -D %s -D %s" - " -D srcT=%s -D dstT=%s -D convertToDstT=%s -D srcT1=%s -D dstT1=%s", + " -D srcT=%s -D dstT=%s -D convertToDstT=%s -D srcT1=%s -D dstT1=%s%s", radiusX, (int)localsize[0], (int)localsize[1], cn, btype, extra_extrapolation ? "EXTRA_EXTRAPOLATION" : "NO_EXTRA_EXTRAPOLATION", isolated ? "BORDER_ISOLATED" : "NO_BORDER_ISOLATED", ocl::typeToStr(type), ocl::typeToStr(CV_32FC(cn)), ocl::convertTypeStr(sdepth, CV_32F, cn, cvt), - ocl::typeToStr(sdepth), ocl::typeToStr(CV_32F)); + ocl::typeToStr(sdepth), ocl::typeToStr(CV_32F), + doubleSupport ? " -D DOUBLE_SUPPORT" : ""); build_options += ocl::kernelToStr(kernelX, CV_32F); Size srcWholeSize; Point srcOffset; src.locateROI(srcWholeSize, srcOffset); String kernelName("row_filter"); - if (type == CV_8UC1) + if (fast8uc1) kernelName += "_C1_D0"; ocl::Kernel k(kernelName.c_str(), cv::ocl::imgproc::filterSepRow_oclsrc, @@ -3367,39 +3372,47 @@ static bool ocl_sepRowFilter2D( UMat &src, UMat &buf, Mat &kernelX, int anchor, if (k.empty()) return false; - k.args(ocl::KernelArg::PtrReadOnly(src), (int)(src.step / src.elemSize()), srcOffset.x, - srcOffset.y, src.cols, src.rows, srcWholeSize.width, srcWholeSize.height, - ocl::KernelArg::PtrWriteOnly(buf), (int)(buf.step / buf.elemSize()), - buf.cols, buf.rows, radiusY); + if (fast8uc1) + k.args(ocl::KernelArg::PtrReadOnly(src), (int)(src.step / src.elemSize()), srcOffset.x, + srcOffset.y, src.cols, src.rows, srcWholeSize.width, srcWholeSize.height, + ocl::KernelArg::PtrWriteOnly(buf), (int)(buf.step / buf.elemSize()), + buf.cols, buf.rows, radiusY); + else + k.args(ocl::KernelArg::PtrReadOnly(src), (int)src.step, srcOffset.x, + srcOffset.y, src.cols, src.rows, srcWholeSize.width, srcWholeSize.height, + ocl::KernelArg::PtrWriteOnly(buf), (int)buf.step, buf.cols, buf.rows, radiusY); return k.run(2, globalsize, localsize, false); } -static bool ocl_sepColFilter2D(const UMat &buf, UMat &dst, Mat &kernelY, int anchor) +static bool ocl_sepColFilter2D(const UMat & buf, UMat & dst, const Mat & kernelY, int anchor) { + bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0; + if (dst.depth() == CV_64F && !doubleSupport) + return false; + #ifdef ANDROID - size_t localsize[2] = {16, 10}; + size_t localsize[2] = { 16, 10 }; #else - size_t localsize[2] = {16, 16}; + size_t localsize[2] = { 16, 16 }; #endif - size_t globalsize[2] = {0, 0}; + size_t globalsize[2] = { 0, 0 }; int dtype = dst.type(), cn = CV_MAT_CN(dtype), ddepth = CV_MAT_DEPTH(dtype); Size sz = dst.size(); globalsize[1] = DIVUP(sz.height, localsize[1]) * localsize[1]; - - if (dtype == CV_8UC2) - globalsize[0] = DIVUP((sz.width + 1) / 2, localsize[0]) * localsize[0]; - else - globalsize[0] = DIVUP(sz.width, localsize[0]) * localsize[0]; + globalsize[0] = DIVUP(sz.width, localsize[0]) * localsize[0]; char cvt[40]; cv::String build_options = cv::format("-D RADIUSY=%d -D LSIZE0=%d -D LSIZE1=%d -D CN=%d" - " -D srcT=%s -D dstT=%s -D convertToDstT=%s", + " -D srcT=%s -D dstT=%s -D convertToDstT=%s" + " -D srcT1=%s -D dstT1=%s%s", anchor, (int)localsize[0], (int)localsize[1], cn, ocl::typeToStr(buf.type()), ocl::typeToStr(dtype), - ocl::convertTypeStr(CV_32F, ddepth, cn, cvt)); + ocl::convertTypeStr(CV_32F, ddepth, cn, cvt), + ocl::typeToStr(CV_32F), ocl::typeToStr(ddepth), + doubleSupport ? " -D DOUBLE_SUPPORT" : ""); build_options += ocl::kernelToStr(kernelY, CV_32F); ocl::Kernel k("col_filter", cv::ocl::imgproc::filterSepCol_oclsrc, @@ -3407,13 +3420,13 @@ static bool ocl_sepColFilter2D(const UMat &buf, UMat &dst, Mat &kernelY, int anc if (k.empty()) return false; - k.args(ocl::KernelArg::PtrReadOnly(buf), (int)(buf.step / buf.elemSize()), buf.cols, - buf.rows, ocl::KernelArg::PtrWriteOnly(dst), (int)(dst.offset / dst.elemSize()), - (int)(dst.step / dst.elemSize()), dst.cols, dst.rows); + k.args(ocl::KernelArg::ReadOnly(buf), ocl::KernelArg::WriteOnly(dst)); return k.run(2, globalsize, localsize, false); } +#if 0 + const int optimizedSepFilterLocalSize = 16; static bool ocl_sepFilter2D_SinglePass(InputArray _src, OutputArray _dst, @@ -3471,18 +3484,19 @@ static bool ocl_sepFilter2D_SinglePass(InputArray _src, OutputArray _dst, return k.run(2, gt2, lt2, false); } +#endif + static bool ocl_sepFilter2D( InputArray _src, OutputArray _dst, int ddepth, InputArray _kernelX, InputArray _kernelY, Point anchor, double delta, int borderType ) { - Size imgSize = _src.size(); +// Size imgSize = _src.size(); if (abs(delta)> FLT_MIN) return false; int type = _src.type(), cn = CV_MAT_CN(type); - if ( !( (type == CV_8UC1 || type == CV_8UC4 || type == CV_32FC1 || type == CV_32FC4) && - (ddepth == CV_32F || ddepth == CV_16S || ddepth == CV_8U || ddepth < 0) ) ) + if (cn > 4) return false; Mat kernelX = _kernelX.getMat().reshape(1, 1); @@ -3501,9 +3515,6 @@ static bool ocl_sepFilter2D( InputArray _src, OutputArray _dst, int ddepth, if (ddepth < 0) ddepth = sdepth; -// printf("%d %d\n", imgSize.width, optimizedSepFilterLocalSize + (kernelX.rows >> 1)); -// printf("%d %d\n", imgSize.height, optimizedSepFilterLocalSize + (kernelY.rows >> 1)); - // CV_OCL_RUN_(kernelY.rows <= 21 && kernelX.rows <= 21 && // imgSize.width > optimizedSepFilterLocalSize + (kernelX.rows >> 1) && // imgSize.height > optimizedSepFilterLocalSize + (kernelY.rows >> 1), @@ -3512,20 +3523,19 @@ static bool ocl_sepFilter2D( InputArray _src, OutputArray _dst, int ddepth, UMat src = _src.getUMat(); Size srcWholeSize; Point srcOffset; src.locateROI(srcWholeSize, srcOffset); - if ( (0 != (srcOffset.x % 4)) || - (0 != (src.cols % 4)) || - (0 != ((src.step / src.elemSize()) % 4)) - ) - return false; + + bool fast8uc1 = type == CV_8UC1 && srcOffset.x % 4 == 0 && + src.cols % 4 == 0 && src.step % 4 == 0; Size srcSize = src.size(); Size bufSize(srcSize.width, srcSize.height + kernelY.cols - 1); - UMat buf; buf.create(bufSize, CV_MAKETYPE(CV_32F, cn)); - if (!ocl_sepRowFilter2D(src, buf, kernelX, anchor.x, borderType)) + UMat buf(bufSize, CV_32FC(cn)); + if (!ocl_sepRowFilter2D(src, buf, kernelX, anchor.x, borderType, ddepth, fast8uc1)) return false; _dst.create(srcSize, CV_MAKETYPE(ddepth, cn)); UMat dst = _dst.getUMat(); + return ocl_sepColFilter2D(buf, dst, kernelY, anchor.y); } diff --git a/modules/imgproc/src/opencl/filterSepCol.cl b/modules/imgproc/src/opencl/filterSepCol.cl index 05717c6ad..f5d270cf4 100644 --- a/modules/imgproc/src/opencl/filterSepCol.cl +++ b/modules/imgproc/src/opencl/filterSepCol.cl @@ -34,29 +34,36 @@ // // +#ifdef DOUBLE_SUPPORT +#ifdef cl_amd_fp64 +#pragma OPENCL EXTENSION cl_amd_fp64:enable +#elif defined (cl_khr_fp64) +#pragma OPENCL EXTENSION cl_khr_fp64:enable +#endif +#endif + #define READ_TIMES_COL ((2*(RADIUSY+LSIZE1)-1)/LSIZE1) #define RADIUS 1 #define noconvert -/********************************************************************************** -These kernels are written for separable filters such as Sobel, Scharr, GaussianBlur. -Now(6/29/2011) the kernels only support 8U data type and the anchor of the convovle -kernel must be in the center. ROI is not supported either. -Each kernels read 4 elements(not 4 pixels), save them to LDS and read the data needed -from LDS to calculate the result. -The length of the convovle kernel supported is only related to the MAX size of LDS, -which is HW related. -Niko -6/29/2011 -The info above maybe obsolete. -***********************************************************************************/ +#if CN != 3 +#define loadpix(addr) *(__global const srcT *)(addr) +#define storepix(val, addr) *(__global dstT *)(addr) = val +#define SRCSIZE (int)sizeof(srcT) +#define DSTSIZE (int)sizeof(dstT) +#else +#define loadpix(addr) vload3(0, (__global const srcT1 *)(addr)) +#define storepix(val, addr) vstore3(val, 0, (__global dstT1 *)(addr)) +#define SRCSIZE (int)sizeof(srcT1)*3 +#define DSTSIZE (int)sizeof(dstT1)*3 +#endif #define DIG(a) a, __constant float mat_kernel[] = { COEFF }; -__kernel void col_filter(__global const srcT * src, int src_step_in_pixel, int src_whole_cols, int src_whole_rows, - __global dstT * dst, int dst_offset_in_pixel, int dst_step_in_pixel, int dst_cols, int dst_rows) +__kernel void col_filter(__global const uchar * src, int src_step, int src_offset, int src_whole_rows, int src_whole_cols, + __global uchar * dst, int dst_step, int dst_offset, int dst_rows, int dst_cols) { int x = get_global_id(0); int y = get_global_id(1); @@ -64,8 +71,8 @@ __kernel void col_filter(__global const srcT * src, int src_step_in_pixel, int s int l_x = get_local_id(0); int l_y = get_local_id(1); - int start_addr = mad24(y, src_step_in_pixel, x); - int end_addr = mad24(src_whole_rows - 1, src_step_in_pixel, src_whole_cols); + int start_addr = mad24(y, src_step, x * SRCSIZE); + int end_addr = mad24(src_whole_rows - 1, src_step, src_whole_cols * SRCSIZE); srcT sum, temp[READ_TIMES_COL]; __local srcT LDS_DAT[LSIZE1 * READ_TIMES_COL][LSIZE0 + 1]; @@ -73,9 +80,9 @@ __kernel void col_filter(__global const srcT * src, int src_step_in_pixel, int s // read pixels from src for (int i = 0; i < READ_TIMES_COL; ++i) { - int current_addr = mad24(i, LSIZE1 * src_step_in_pixel, start_addr); + int current_addr = mad24(i, LSIZE1 * src_step, start_addr); current_addr = current_addr < end_addr ? current_addr : 0; - temp[i] = src[current_addr]; + temp[i] = loadpix(src + current_addr); } // save pixels to lds @@ -95,7 +102,7 @@ __kernel void col_filter(__global const srcT * src, int src_step_in_pixel, int s // write the result to dst if (x < dst_cols && y < dst_rows) { - start_addr = mad24(y, dst_step_in_pixel, x + dst_offset_in_pixel); - dst[start_addr] = convertToDstT(sum); + start_addr = mad24(y, dst_step, mad24(DSTSIZE, x, dst_offset)); + storepix(convertToDstT(sum), dst + start_addr); } } diff --git a/modules/imgproc/src/opencl/filterSepRow.cl b/modules/imgproc/src/opencl/filterSepRow.cl index 83968dfc1..726de448e 100644 --- a/modules/imgproc/src/opencl/filterSepRow.cl +++ b/modules/imgproc/src/opencl/filterSepRow.cl @@ -34,6 +34,14 @@ // // +#ifdef DOUBLE_SUPPORT +#ifdef cl_amd_fp64 +#pragma OPENCL EXTENSION cl_amd_fp64:enable +#elif defined (cl_khr_fp64) +#pragma OPENCL EXTENSION cl_khr_fp64:enable +#endif +#endif + #define READ_TIMES_ROW ((2*(RADIUSX+LSIZE0)-1)/LSIZE0) //for c4 only #define RADIUS 1 @@ -117,16 +125,16 @@ #define noconvert -#if cn != 3 +#if CN != 3 #define loadpix(addr) *(__global const srcT *)(addr) #define storepix(val, addr) *(__global dstT *)(addr) = val -#define SRCSIZE ((int)sizeof(srcT)) -#define DSTSIZE ((int)sizeof(dstT)) +#define SRCSIZE (int)sizeof(srcT) +#define DSTSIZE (int)sizeof(dstT) #else #define loadpix(addr) vload3(0, (__global const srcT1 *)(addr)) #define storepix(val, addr) vstore3(val, 0, (__global dstT1 *)(addr)) -#define SRCSIZE ((int)sizeof(srcT1)*3) -#define DSTSIZE ((int)sizeof(dstT1)*3) +#define SRCSIZE (int)sizeof(srcT1)*3 +#define DSTSIZE (int)sizeof(dstT1)*3 #endif #define DIG(a) a, @@ -269,32 +277,33 @@ __kernel void row_filter_C1_D0(__global const uchar * src, int src_step_in_pixel dst[start_addr] = sum.x; } -__kernel void row_filter(__global const srcT * src, int src_step_in_pixel, int src_offset_x, int src_offset_y, +__kernel void row_filter(__global const uchar * src, int src_step, int src_offset_x, int src_offset_y, int src_cols, int src_rows, int src_whole_cols, int src_whole_rows, - __global dstT * dst, int dst_step_in_pixel, int dst_cols, int dst_rows, + __global uchar * dst, int dst_step, int dst_cols, int dst_rows, int radiusy) { int x = get_global_id(0); int y = get_global_id(1); int l_x = get_local_id(0); int l_y = get_local_id(1); + int start_x = x + src_offset_x - RADIUSX; int start_y = y + src_offset_y - radiusy; - int start_addr = mad24(start_y, src_step_in_pixel, start_x); + int start_addr = mad24(start_y, src_step, start_x * SRCSIZE); dstT sum; srcT temp[READ_TIMES_ROW]; __local srcT LDS_DAT[LSIZE1][READ_TIMES_ROW * LSIZE0 + 1]; #ifdef BORDER_CONSTANT - int end_addr = mad24(src_whole_rows - 1, src_step_in_pixel, src_whole_cols); + int end_addr = mad24(src_whole_rows - 1, src_step, src_whole_cols * SRCSIZE); // read pixels from src for (int i = 0; i < READ_TIMES_ROW; i++) { - int current_addr = mad24(i, LSIZE0, start_addr); - current_addr = current_addr < end_addr && current_addr > 0 ? current_addr : 0; - temp[i] = src[current_addr]; + int current_addr = mad24(i, LSIZE0 * SRCSIZE, start_addr); + current_addr = current_addr < end_addr && current_addr >= 0 ? current_addr : 0; + temp[i] = loadpix(src + current_addr); } // judge if read out of boundary @@ -312,8 +321,7 @@ __kernel void row_filter(__global const srcT * src, int src_step_in_pixel, int s } #endif #else - int index[READ_TIMES_ROW]; - int s_x, s_y; + int index[READ_TIMES_ROW], s_x, s_y; // judge if read out of boundary for (int i = 0; i < READ_TIMES_ROW; ++i) @@ -328,12 +336,12 @@ __kernel void row_filter(__global const srcT * src, int src_step_in_pixel, int s EXTRAPOLATE(s_x, 0, src_whole_cols); EXTRAPOLATE(s_y, 0, src_whole_rows); #endif - index[i] = mad24(s_y, src_step_in_pixel, s_x); + index[i] = mad24(s_y, src_step, s_x * SRCSIZE); } // read pixels from src for (int i = 0; i < READ_TIMES_ROW; ++i) - temp[i] = src[index[i]]; + temp[i] = loadpix(src + index[i]); #endif // BORDER_CONSTANT // save pixels to lds @@ -349,10 +357,11 @@ __kernel void row_filter(__global const srcT * src, int src_step_in_pixel, int s temp[1] = LDS_DAT[l_y][l_x + RADIUSX + i]; sum += mad(convertToDstT(temp[0]), mat_kernel[RADIUSX - i], convertToDstT(temp[1]) * mat_kernel[RADIUSX + i]); } + // write the result to dst if (x < dst_cols && y < dst_rows) { - start_addr = mad24(y, dst_step_in_pixel, x); - dst[start_addr] = sum; + start_addr = mad24(y, dst_step, x * DSTSIZE); + storepix(sum, dst + start_addr); } } diff --git a/modules/imgproc/test/ocl/test_filters.cpp b/modules/imgproc/test/ocl/test_filters.cpp index fe16fe81d..04b330527 100644 --- a/modules/imgproc/test/ocl/test_filters.cpp +++ b/modules/imgproc/test/ocl/test_filters.cpp @@ -312,7 +312,7 @@ OCL_TEST_P(MorphologyEx, Mat) (int)BORDER_REFLECT|BORDER_ISOLATED, (int)BORDER_WRAP|BORDER_ISOLATED, \ (int)BORDER_REFLECT_101|BORDER_ISOLATED*/) // WRAP and ISOLATED are not supported by cv:: version -#define FILTER_TYPES Values(CV_8UC1, CV_8UC2, CV_8UC4, CV_32FC1, CV_32FC4, CV_64FC1, CV_64FC4) +#define FILTER_TYPES Values(CV_8UC1, CV_8UC3, CV_8UC4, CV_32FC1, CV_32FC3, CV_32FC4) OCL_INSTANTIATE_TEST_CASE_P(Filter, Bilateral, Combine( Values((MatType)CV_8UC1), diff --git a/modules/imgproc/test/ocl/test_sepfilter2D.cpp b/modules/imgproc/test/ocl/test_sepfilter2D.cpp index 09d01d157..05d46cc2b 100644 --- a/modules/imgproc/test/ocl/test_sepfilter2D.cpp +++ b/modules/imgproc/test/ocl/test_sepfilter2D.cpp @@ -75,9 +75,9 @@ PARAM_TEST_CASE(SepFilter2D, MatDepth, Channels, BorderType, bool, bool) void random_roi() { Size ksize = randomSize(kernelMinSize, kernelMaxSize); - if (1 != (ksize.width % 2)) + if (1 != ksize.width % 2) ksize.width++; - if (1 != (ksize.height % 2)) + if (1 != ksize.height % 2) ksize.height++; Mat temp = randomMat(Size(ksize.width, 1), CV_MAKE_TYPE(CV_32F, 1), -MAX_VALUE, MAX_VALUE); @@ -86,24 +86,22 @@ PARAM_TEST_CASE(SepFilter2D, MatDepth, Channels, BorderType, bool, bool) cv::normalize(temp, kernelY, 1.0, 0.0, NORM_L1); Size roiSize = randomSize(ksize.width + 16, MAX_VALUE, ksize.height + 20, MAX_VALUE); - std::cout << roiSize << std::endl; int rest = roiSize.width % 4; - if (0 != rest) + if (rest != 0) roiSize.width += (4 - rest); Border srcBorder = randomBorder(0, useRoi ? MAX_VALUE : 0); rest = srcBorder.lef % 4; - if (0 != rest) + if (rest != 0) srcBorder.lef += (4 - rest); rest = srcBorder.rig % 4; - if (0 != rest) + if (rest != 0) srcBorder.rig += (4 - rest); randomSubMat(src, src_roi, roiSize, srcBorder, type, -MAX_VALUE, MAX_VALUE); Border dstBorder = randomBorder(0, useRoi ? MAX_VALUE : 0); randomSubMat(dst, dst_roi, roiSize, dstBorder, type, -MAX_VALUE, MAX_VALUE); - anchor.x = -1; - anchor.y = -1; + anchor.x = anchor.y = -1; UMAT_UPLOAD_INPUT_PARAMETER(src) UMAT_UPLOAD_OUTPUT_PARAMETER(dst) @@ -128,11 +126,10 @@ OCL_TEST_P(SepFilter2D, Mat) } } - OCL_INSTANTIATE_TEST_CASE_P(ImageProc, SepFilter2D, Combine( Values(CV_8U, CV_32F), - Values(1, 4), + OCL_ALL_CHANNELS, Values( (BorderType)BORDER_CONSTANT, (BorderType)BORDER_REPLICATE,