diff --git a/modules/core/src/ocl.cpp b/modules/core/src/ocl.cpp
index 950fa4199..e3cfd8e7c 100644
--- a/modules/core/src/ocl.cpp
+++ b/modules/core/src/ocl.cpp
@@ -4379,7 +4379,7 @@ String kernelToStr(InputArray _kernel, int ddepth, const char * name)
     typedef std::string (* func_t)(const Mat &);
     static const func_t funcs[] = { kerToStr<uchar>, kerToStr<char>, kerToStr<ushort>, kerToStr<short>,
                                     kerToStr<int>, kerToStr<float>, kerToStr<double>, 0 };
-    const func_t func = funcs[depth];
+    const func_t func = funcs[ddepth];
     CV_Assert(func != 0);
 
     return cv::format(" -D %s=%s", name ? name : "COEFF", func(kernel).c_str());
diff --git a/modules/imgproc/src/filter.cpp b/modules/imgproc/src/filter.cpp
index 4861207c0..aa1631d5c 100644
--- a/modules/imgproc/src/filter.cpp
+++ b/modules/imgproc/src/filter.cpp
@@ -3191,11 +3191,10 @@ static bool ocl_filter2D( InputArray _src, OutputArray _dst, int ddepth,
                           "BORDER_WRAP", "BORDER_REFLECT_101" };
 
     cv::Mat kernelMat = _kernel.getMat();
-    std::vector<float> kernelMatDataFloat;
-    int kernel_size_y2_aligned = _prepareKernelFilter2D<float>(kernelMatDataFloat, kernelMat);
-
     cv::Size sz = _src.size(), wholeSize;
-    size_t globalsize[2] = { sz.width, sz.height }, localsize[2] = { 0, 1 };
+    size_t globalsize[2] = { sz.width, sz.height };
+    size_t localsize_general[2] = {0, 1};
+    size_t* localsize = NULL;
 
     ocl::Kernel k;
     UMat src = _src.getUMat();
@@ -3210,63 +3209,134 @@ static bool ocl_filter2D( InputArray _src, OutputArray _dst, int ddepth,
     size_t tryWorkItems = maxWorkItemSizes[0];
 
     char cvt[2][40];
-    String kerStr = ocl::kernelToStr(kernelMatDataFloat, CV_32F);
-
-    for ( ; ; )
+    // For smaller filter kernels, there is a special kernel that is more
+    // efficient than the general one.
+    UMat kernelDataUMat;
+    if (device.isIntel() && (device.type() & ocl::Device::TYPE_GPU) &&
+        ((ksize.width < 5 && ksize.height < 5) ||
+         (ksize.width == 5 && ksize.height == 5 && cn == 1)))
     {
-        size_t BLOCK_SIZE = tryWorkItems;
-        while (BLOCK_SIZE > 32 && BLOCK_SIZE >= (size_t)ksize.width * 2 && BLOCK_SIZE > (size_t)sz.width * 2)
-            BLOCK_SIZE /= 2;
-#if 1 // TODO Mode with several blocks requires a much more VGPRs, so this optimization is not actual for the current devices
-        size_t BLOCK_SIZE_Y = 1;
-#else
-        size_t BLOCK_SIZE_Y = 8; // TODO Check heuristic value on devices
-        while (BLOCK_SIZE_Y < BLOCK_SIZE / 8 && BLOCK_SIZE_Y * src.clCxt->getDeviceInfo().maxComputeUnits * 32 < (size_t)src.rows)
-            BLOCK_SIZE_Y *= 2;
-#endif
-
-        if ((size_t)ksize.width > BLOCK_SIZE)
-            return false;
-
-        int requiredTop = anchor.y;
-        int requiredLeft = (int)BLOCK_SIZE; // not this: anchor.x;
-        int requiredBottom = ksize.height - 1 - anchor.y;
-        int requiredRight = (int)BLOCK_SIZE; // not this: ksize.width - 1 - anchor.x;
+        kernelMat = kernelMat.reshape(0, 1);
+        String kerStr = ocl::kernelToStr(kernelMat, CV_32F);
         int h = isolated ? sz.height : wholeSize.height;
         int w = isolated ? sz.width : wholeSize.width;
-        bool extra_extrapolation = h < requiredTop || h < requiredBottom || w < requiredLeft || w < requiredRight;
 
         if ((w < ksize.width) || (h < ksize.height))
             return false;
 
-        String opts = format("-D LOCAL_SIZE=%d -D BLOCK_SIZE_Y=%d -D cn=%d "
-                             "-D ANCHOR_X=%d -D ANCHOR_Y=%d -D KERNEL_SIZE_X=%d -D KERNEL_SIZE_Y=%d "
-                             "-D KERNEL_SIZE_Y2_ALIGNED=%d -D %s -D %s -D %s%s%s "
-                             "-D srcT=%s -D srcT1=%s -D dstT=%s -D dstT1=%s -D WT=%s -D WT1=%s "
-                             "-D convertToWT=%s -D convertToDstT=%s",
-                             (int)BLOCK_SIZE, (int)BLOCK_SIZE_Y, cn, anchor.x, anchor.y,
-                             ksize.width, ksize.height, kernel_size_y2_aligned, borderMap[borderType],
-                             extra_extrapolation ? "EXTRA_EXTRAPOLATION" : "NO_EXTRA_EXTRAPOLATION",
"EXTRA_EXTRAPOLATION" : "NO_EXTRA_EXTRAPOLATION", - isolated ? "BORDER_ISOLATED" : "NO_BORDER_ISOLATED", - doubleSupport ? " -D DOUBLE_SUPPORT" : "", kerStr.c_str(), - ocl::typeToStr(type), ocl::typeToStr(sdepth), ocl::typeToStr(dtype), - ocl::typeToStr(ddepth), ocl::typeToStr(wtype), ocl::typeToStr(wdepth), - ocl::convertTypeStr(sdepth, wdepth, cn, cvt[0]), - ocl::convertTypeStr(wdepth, ddepth, cn, cvt[1])); + // Figure out what vector size to use for loading the pixels. + int pxLoadNumPixels = ((cn != 1) || sz.width % 4) ? 1 : 4; + int pxLoadVecSize = cn * pxLoadNumPixels; - localsize[0] = BLOCK_SIZE; - globalsize[0] = DIVUP(sz.width, BLOCK_SIZE - (ksize.width - 1)) * BLOCK_SIZE; - globalsize[1] = DIVUP(sz.height, BLOCK_SIZE_Y); + // Figure out how many pixels per work item to compute in X and Y + // directions. Too many and we run out of registers. + int pxPerWorkItemX = 1; + int pxPerWorkItemY = 1; + if (cn <= 2 && ksize.width <= 4 && ksize.height <= 4) + { + pxPerWorkItemX = sz.width % 8 ? sz.width % 4 ? sz.width % 2 ? 1 : 2 : 4 : 8; + pxPerWorkItemY = sz.width % 2 ? 1 : 2; + } + else if (cn < 4 || (ksize.width <= 4 && ksize.height <= 4)) + { + pxPerWorkItemX = sz.width % 2 ? 1 : 2; + pxPerWorkItemY = sz.width % 2 ? 1 : 2; + } + globalsize[0] = sz.width / pxPerWorkItemX; + globalsize[1] = sz.height / pxPerWorkItemY; - if (!k.create("filter2D", cv::ocl::imgproc::filter2D_oclsrc, opts)) + // Need some padding in the private array for pixels + int privDataWidth = ROUNDUP(pxPerWorkItemX + ksize.width - 1, pxLoadNumPixels); + + // Make the global size a nice round number so the runtime can pick + // from reasonable choices for the workgroup size + const int wgRound = 256; + globalsize[0] = ROUNDUP(globalsize[0], wgRound); + + char build_options[1024]; + sprintf(build_options, "-D cn=%d " + "-D ANCHOR_X=%d -D ANCHOR_Y=%d -D KERNEL_SIZE_X=%d -D KERNEL_SIZE_Y=%d " + "-D PX_LOAD_VEC_SIZE=%d -D PX_LOAD_NUM_PX=%d " + "-D PX_PER_WI_X=%d -D PX_PER_WI_Y=%d -D PRIV_DATA_WIDTH=%d -D %s -D %s " + "-D PX_LOAD_X_ITERATIONS=%d -D PX_LOAD_Y_ITERATIONS=%d " + "-D srcT=%s -D srcT1=%s -D dstT=%s -D dstT1=%s -D WT=%s -D WT1=%s " + "-D convertToWT=%s -D convertToDstT=%s %s", + cn, anchor.x, anchor.y, ksize.width, ksize.height, + pxLoadVecSize, pxLoadNumPixels, + pxPerWorkItemX, pxPerWorkItemY, privDataWidth, borderMap[borderType], + isolated ? 
"BORDER_ISOLATED" : "NO_BORDER_ISOLATED", + privDataWidth / pxLoadNumPixels, pxPerWorkItemY + ksize.height - 1, + ocl::typeToStr(type), ocl::typeToStr(sdepth), ocl::typeToStr(dtype), + ocl::typeToStr(ddepth), ocl::typeToStr(wtype), ocl::typeToStr(wdepth), + ocl::convertTypeStr(sdepth, wdepth, cn, cvt[0]), + ocl::convertTypeStr(wdepth, ddepth, cn, cvt[1]), kerStr.c_str()); + cv::String errmsg; + if (!k.create("filter2DSmall", cv::ocl::imgproc::filter2DSmall_oclsrc, build_options, &errmsg)) return false; + } + else + { + localsize = localsize_general; + std::vector kernelMatDataFloat; + int kernel_size_y2_aligned = _prepareKernelFilter2D(kernelMatDataFloat, kernelMat); + String kerStr = ocl::kernelToStr(kernelMatDataFloat, CV_32F); - size_t kernelWorkGroupSize = k.workGroupSize(); - if (localsize[0] <= kernelWorkGroupSize) - break; - if (BLOCK_SIZE < kernelWorkGroupSize) - return false; - tryWorkItems = kernelWorkGroupSize; + for ( ; ; ) + { + size_t BLOCK_SIZE = tryWorkItems; + while (BLOCK_SIZE > 32 && BLOCK_SIZE >= (size_t)ksize.width * 2 && BLOCK_SIZE > (size_t)sz.width * 2) + BLOCK_SIZE /= 2; + #if 1 // TODO Mode with several blocks requires a much more VGPRs, so this optimization is not actual for the current devices + size_t BLOCK_SIZE_Y = 1; + #else + size_t BLOCK_SIZE_Y = 8; // TODO Check heuristic value on devices + while (BLOCK_SIZE_Y < BLOCK_SIZE / 8 && BLOCK_SIZE_Y * src.clCxt->getDeviceInfo().maxComputeUnits * 32 < (size_t)src.rows) + BLOCK_SIZE_Y *= 2; + #endif + + if ((size_t)ksize.width > BLOCK_SIZE) + return false; + + int requiredTop = anchor.y; + int requiredLeft = (int)BLOCK_SIZE; // not this: anchor.x; + int requiredBottom = ksize.height - 1 - anchor.y; + int requiredRight = (int)BLOCK_SIZE; // not this: ksize.width - 1 - anchor.x; + int h = isolated ? sz.height : wholeSize.height; + int w = isolated ? sz.width : wholeSize.width; + bool extra_extrapolation = h < requiredTop || h < requiredBottom || w < requiredLeft || w < requiredRight; + + if ((w < ksize.width) || (h < ksize.height)) + return false; + + String opts = format("-D LOCAL_SIZE=%d -D BLOCK_SIZE_Y=%d -D cn=%d " + "-D ANCHOR_X=%d -D ANCHOR_Y=%d -D KERNEL_SIZE_X=%d -D KERNEL_SIZE_Y=%d " + "-D KERNEL_SIZE_Y2_ALIGNED=%d -D %s -D %s -D %s%s%s " + "-D srcT=%s -D srcT1=%s -D dstT=%s -D dstT1=%s -D WT=%s -D WT1=%s " + "-D convertToWT=%s -D convertToDstT=%s", + (int)BLOCK_SIZE, (int)BLOCK_SIZE_Y, cn, anchor.x, anchor.y, + ksize.width, ksize.height, kernel_size_y2_aligned, borderMap[borderType], + extra_extrapolation ? "EXTRA_EXTRAPOLATION" : "NO_EXTRA_EXTRAPOLATION", + isolated ? "BORDER_ISOLATED" : "NO_BORDER_ISOLATED", + doubleSupport ? 
" -D DOUBLE_SUPPORT" : "", kerStr.c_str(), + ocl::typeToStr(type), ocl::typeToStr(sdepth), ocl::typeToStr(dtype), + ocl::typeToStr(ddepth), ocl::typeToStr(wtype), ocl::typeToStr(wdepth), + ocl::convertTypeStr(sdepth, wdepth, cn, cvt[0]), + ocl::convertTypeStr(wdepth, ddepth, cn, cvt[1])); + + localsize[0] = BLOCK_SIZE; + globalsize[0] = DIVUP(sz.width, BLOCK_SIZE - (ksize.width - 1)) * BLOCK_SIZE; + globalsize[1] = DIVUP(sz.height, BLOCK_SIZE_Y); + + if (!k.create("filter2D", cv::ocl::imgproc::filter2D_oclsrc, opts)) + return false; + + size_t kernelWorkGroupSize = k.workGroupSize(); + if (localsize[0] <= kernelWorkGroupSize) + break; + if (BLOCK_SIZE < kernelWorkGroupSize) + return false; + tryWorkItems = kernelWorkGroupSize; + } } _dst.create(sz, dtype); @@ -3688,9 +3758,20 @@ void cv::filter2D( InputArray _src, OutputArray _dst, int ddepth, temp = dst; else temp.create(dst.size(), dst.type()); - crossCorr( src, kernel, temp, src.size(), - CV_MAKETYPE(ddepth, src.channels()), - anchor, delta, borderType ); + // crossCorr doesn't accept non-zero delta with multiple channels + if( src.channels() != 1 && delta != 0 ) + { + crossCorr( src, kernel, temp, src.size(), + CV_MAKETYPE(ddepth, src.channels()), + anchor, 0, borderType ); + add( temp, delta, temp ); + } + else + { + crossCorr( src, kernel, temp, src.size(), + CV_MAKETYPE(ddepth, src.channels()), + anchor, delta, borderType ); + } if( temp.data != dst.data ) temp.copyTo(dst); return; diff --git a/modules/imgproc/src/opencl/filter2DSmall.cl b/modules/imgproc/src/opencl/filter2DSmall.cl new file mode 100755 index 000000000..67edef277 --- /dev/null +++ b/modules/imgproc/src/opencl/filter2DSmall.cl @@ -0,0 +1,335 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2010-2013, Advanced Micro Devices, Inc., all rights reserved. +// Copyright (C) 2014, Intel Corporation, all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors as is and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. 
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifdef BORDER_REPLICATE
+//BORDER_REPLICATE: aaaaaa|abcdefgh|hhhhhhh
+#define ADDR_L(i, l_edge, r_edge) ((i) < (l_edge) ? (l_edge) : (i))
+#define ADDR_R(i, r_edge, addr) ((i) >= (r_edge) ? (r_edge)-1 : (addr))
+#define ADDR_H(i, t_edge, b_edge) ((i) < (t_edge) ? (t_edge) : (i))
+#define ADDR_B(i, b_edge, addr) ((i) >= (b_edge) ? (b_edge)-1 : (addr))
+#endif
+
+#ifdef BORDER_REFLECT
+//BORDER_REFLECT: fedcba|abcdefgh|hgfedcb
+#define ADDR_L(i, l_edge, r_edge) ((i) < (l_edge) ? -(i)-1 : (i))
+#define ADDR_R(i, r_edge, addr) ((i) >= (r_edge) ? -(i)-1+((r_edge)<<1) : (addr))
+#define ADDR_H(i, t_edge, b_edge) ((i) < (t_edge) ? -(i)-1 : (i))
+#define ADDR_B(i, b_edge, addr) ((i) >= (b_edge) ? -(i)-1+((b_edge)<<1) : (addr))
+#endif
+
+#ifdef BORDER_REFLECT_101
+//BORDER_REFLECT_101: gfedcb|abcdefgh|gfedcba
+#define ADDR_L(i, l_edge, r_edge) ((i) < (l_edge) ? -(i) : (i))
+#define ADDR_R(i, r_edge, addr) ((i) >= (r_edge) ? -(i)-2+((r_edge)<<1) : (addr))
+#define ADDR_H(i, t_edge, b_edge) ((i) < (t_edge) ? -(i) : (i))
+#define ADDR_B(i, b_edge, addr) ((i) >= (b_edge) ? -(i)-2+((b_edge)<<1) : (addr))
+#endif
+
+//blur function does not support BORDER_WRAP
+#ifdef BORDER_WRAP
+//BORDER_WRAP: cdefgh|abcdefgh|abcdefg
+#define ADDR_L(i, l_edge, r_edge) ((i) < (l_edge) ? (i)+(r_edge) : (i))
+#define ADDR_R(i, r_edge, addr) ((i) >= (r_edge) ? (i)-(r_edge) : (addr))
+#define ADDR_H(i, t_edge, b_edge) ((i) < (t_edge) ? (i)+(b_edge) : (i))
+#define ADDR_B(i, b_edge, addr) ((i) >= (b_edge) ? (i)-(b_edge) : (addr))
+#endif
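+
+// The ADDR_* macros remap an out-of-range coordinate back into the image for
+// the selected border mode. For example, with BORDER_REFLECT_101 and
+// l_edge = 0, ADDR_L(-2, 0, w) yields 2: the index mirrors around the first
+// pixel without repeating it.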
+
+#ifdef BORDER_ISOLATED
+#define ISOLATED_MIN(VAL) (VAL)
+#else
+#define ISOLATED_MIN(VAL) 0
+#endif
+
+#ifdef EXTRA_EXTRAPOLATION // border > src image size
+#ifdef BORDER_CONSTANT
+// None
+#elif defined BORDER_REPLICATE
+#define EXTRAPOLATE(x, y, minX, minY, maxX, maxY) \
+    { \
+        x = max(min(x, maxX - 1), minX); \
+        y = max(min(y, maxY - 1), minY); \
+    }
+#elif defined BORDER_WRAP
+#define EXTRAPOLATE(x, y, minX, minY, maxX, maxY) \
+    { \
+        if (x < minX) \
+            x -= ((x - maxX + 1) / maxX) * maxX; \
+        if (x >= maxX) \
+            x %= maxX; \
+        if (y < minY) \
+            y -= ((y - maxY + 1) / maxY) * maxY; \
+        if (y >= maxY) \
+            y %= maxY; \
+    }
+#elif defined(BORDER_REFLECT) || defined(BORDER_REFLECT_101)
+#define EXTRAPOLATE_(x, y, minX, minY, maxX, maxY, delta) \
+    { \
+        if (maxX - minX == 1) \
+            x = minX; \
+        else \
+            do \
+            { \
+                if (x < minX) \
+                    x = minX - (x - minX) - 1 + delta; \
+                else \
+                    x = maxX - 1 - (x - maxX) - delta; \
+            } \
+            while (x >= maxX || x < minX); \
+        \
+        if (maxY - minY == 1) \
+            y = minY; \
+        else \
+            do \
+            { \
+                if (y < minY) \
+                    y = minY - (y - minY) - 1 + delta; \
+                else \
+                    y = maxY - 1 - (y - maxY) - delta; \
+            } \
+            while (y >= maxY || y < minY); \
+    }
+#ifdef BORDER_REFLECT
+#define EXTRAPOLATE(x, y, minX, minY, maxX, maxY) EXTRAPOLATE_(x, y, minX, minY, maxX, maxY, 0)
+#elif defined(BORDER_REFLECT_101) || defined(BORDER_REFLECT101)
+#define EXTRAPOLATE(x, y, minX, minY, maxX, maxY) EXTRAPOLATE_(x, y, minX, minY, maxX, maxY, 1)
+#endif
+#else
+#error No extrapolation method
+#endif
+#else
+#define EXTRAPOLATE(x, y, minX, minY, maxX, maxY) \
+    { \
+        int _row = y - ISOLATED_MIN(minY), _col = x - ISOLATED_MIN(minX); \
+        _row = ADDR_H(_row, 0, maxY - ISOLATED_MIN(minY)); \
+        _row = ADDR_B(_row, maxY - ISOLATED_MIN(minY), _row); \
+        y = _row + ISOLATED_MIN(minY); \
+        \
+        _col = ADDR_L(_col, 0, maxX - ISOLATED_MIN(minX)); \
+        _col = ADDR_R(_col, maxX - ISOLATED_MIN(minX), _col); \
+        x = _col + ISOLATED_MIN(minX); \
+    }
+#endif
+
+#ifdef DOUBLE_SUPPORT
+#ifdef cl_amd_fp64
+#pragma OPENCL EXTENSION cl_amd_fp64:enable
+#elif defined (cl_khr_fp64)
+#pragma OPENCL EXTENSION cl_khr_fp64:enable
+#endif
+#endif
+
+#if cn != 3
+#define loadpix(addr) *(__global const srcT *)(addr)
+#define storepix(val, addr) *(__global dstT *)(addr) = val
+#define SRCSIZE (int)sizeof(srcT)
+#define DSTSIZE (int)sizeof(dstT)
+#else
+#define loadpix(addr) vload3(0, (__global const srcT1 *)(addr))
+#define storepix(val, addr) vstore3(val, 0, (__global dstT1 *)(addr))
+#define SRCSIZE (int)sizeof(srcT1) * cn
+#define DSTSIZE (int)sizeof(dstT1) * cn
+#endif
+
+#define noconvert
+
+struct RectCoords
+{
+    int x1, y1, x2, y2;
+};
+
+#ifdef BORDER_ISOLATED
+inline bool isBorder(const struct RectCoords bounds, int2 coord, int numPixels)
+{
+    return (coord.x < bounds.x1 || coord.y < bounds.y1 || coord.x + numPixels > bounds.x2 || coord.y >= bounds.y2);
+}
+#else
+inline bool isBorder(const struct RectCoords bounds, int2 coord, int numPixels)
+{
+    return (coord.x < 0 || coord.y < 0 || coord.x + numPixels > bounds.x2 || coord.y >= bounds.y2);
+}
+#endif
+
+WT getBorderPixel(const struct RectCoords bounds, int2 coord,
+                  __global const uchar* srcptr, int srcstep)
+{
+#ifdef BORDER_CONSTANT
+    return (WT)(0);
+#else
+    int selected_col = coord.x;
+    int selected_row = coord.y;
+
+    EXTRAPOLATE(selected_col, selected_row,
+                bounds.x1, bounds.y1,
+                bounds.x2, bounds.y2
+               );
+
+    coord = (int2)(selected_col, selected_row);
+    __global const uchar* ptr = srcptr + mul24(coord.y, srcstep) +
+                                coord.x * SRCSIZE;
+    return convertToWT(loadpix(ptr));
+#endif
+}
+
+inline WT readSrcPixelSingle(int2 pos, __global const uchar* srcptr,
+                             int srcstep, const struct RectCoords srcCoords)
+{
+    if (!isBorder(srcCoords, pos, 1))
+    {
+        __global const uchar* ptr = srcptr + mul24(pos.y, srcstep) +
+                                    pos.x * SRCSIZE;
+        return convertToWT(loadpix(ptr));
+    }
+    else
+    {
+        return getBorderPixel(srcCoords, pos, srcptr, srcstep);
+    }
+}
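+
+// Helpers for the vectorized fast path: a group of PX_LOAD_NUM_PX pixels that
+// lies entirely inside the image is fetched with a single vloadN through
+// readSrcPixelGroup below, while a group that touches the border falls back
+// to per-pixel loads via readSrcPixelSingle above.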
+
+#define __CAT(x, y) x##y
+#define CAT(x, y) __CAT(x, y)
+
+#define vload1(OFFSET, PTR) (*(PTR + OFFSET))
+#define PX_LOAD_VEC_TYPE CAT(srcT1, PX_LOAD_VEC_SIZE)
+#define PX_LOAD_FLOAT_VEC_TYPE CAT(WT1, PX_LOAD_VEC_SIZE)
+#define PX_LOAD_FLOAT_VEC_CONV CAT(convert_, PX_LOAD_FLOAT_VEC_TYPE)
+#define PX_LOAD CAT(vload, PX_LOAD_VEC_SIZE)
+#define float1 float
+
+inline PX_LOAD_FLOAT_VEC_TYPE readSrcPixelGroup(int2 pos, __global const uchar* srcptr,
+                                                int srcstep, const struct RectCoords srcCoords)
+{
+    __global const srcT1* ptr = (__global const srcT1*)
+                                (srcptr + mul24(pos.y, srcstep) +
+                                 pos.x * SRCSIZE);
+    return PX_LOAD_FLOAT_VEC_CONV(PX_LOAD(0, ptr));
+}
+
+// Macros to ensure unrolled loops
+#define LOOP1(VAR, STMT) (STMT); (VAR)++;
+#define LOOP2(VAR, STMT) LOOP1(VAR, STMT); (STMT); (VAR)++;
+#define LOOP3(VAR, STMT) LOOP2(VAR, STMT); (STMT); (VAR)++;
+#define LOOP4(VAR, STMT) LOOP3(VAR, STMT); (STMT); (VAR)++;
+#define LOOP5(VAR, STMT) LOOP4(VAR, STMT); (STMT); (VAR)++;
+#define LOOP6(VAR, STMT) LOOP5(VAR, STMT); (STMT); (VAR)++;
+#define LOOP7(VAR, STMT) LOOP6(VAR, STMT); (STMT); (VAR)++;
+#define LOOP8(VAR, STMT) LOOP7(VAR, STMT); (STMT); (VAR)++;
+#define LOOP9(VAR, STMT) LOOP8(VAR, STMT); (STMT); (VAR)++;
+#define LOOP10(VAR, STMT) LOOP9(VAR, STMT); (STMT); (VAR)++;
+#define LOOP11(VAR, STMT) LOOP10(VAR, STMT); (STMT); (VAR)++;
+#define LOOP12(VAR, STMT) LOOP11(VAR, STMT); (STMT); (VAR)++;
+#define LOOP13(VAR, STMT) LOOP12(VAR, STMT); (STMT); (VAR)++;
+
+#define LOOP(N, VAR, STMT) CAT(LOOP, N)((VAR), (STMT))
+
+#define DIG(a) a,
+__constant WT1 kernelData[] = { COEFF };
+
+__kernel void filter2DSmall(__global const uchar * srcptr, int src_step, int srcOffsetX, int srcOffsetY, int srcEndX, int srcEndY,
+                            __global uchar * dstptr, int dst_step, int dst_offset, int rows, int cols, float delta)
+{
+    const struct RectCoords srcCoords = { srcOffsetX, srcOffsetY, srcEndX, srcEndY }; // for non-isolated border: offsetX, offsetY, wholeX, wholeY
+
+    const int startX = get_global_id(0) * PX_PER_WI_X;
+    const int startY = get_global_id(1) * PX_PER_WI_Y;
+
+    if ((startX >= cols) || (startY >= rows))
+    {
+        return;
+    }
+
+    WT privateData[PX_PER_WI_Y + KERNEL_SIZE_Y - 1][PRIV_DATA_WIDTH];
+
+    // Load all of the pixels needed for the calculation
+    int py = 0;
+    LOOP(PX_LOAD_Y_ITERATIONS, py,
+    {
+        int y = startY + py;
+        int px = 0;
+        LOOP(PX_LOAD_X_ITERATIONS, px,
+        {
+            int x = startX + (px * PX_LOAD_NUM_PX);
+            int2 srcPos = (int2)(srcCoords.x1 + x - ANCHOR_X, srcCoords.y1 + y - ANCHOR_Y);
+
+            if (!isBorder(srcCoords, srcPos, PX_LOAD_NUM_PX))
+            {
+                PX_LOAD_FLOAT_VEC_TYPE p = readSrcPixelGroup(srcPos, srcptr, src_step, srcCoords);
+                *((PX_LOAD_FLOAT_VEC_TYPE*)&privateData[py][px * PX_LOAD_NUM_PX]) = p;
+            }
+            else
+            {
+                int lx = 0;
+                LOOP(PX_LOAD_NUM_PX, lx,
+                {
+                    WT p = readSrcPixelSingle(srcPos, srcptr, src_step, srcCoords);
+                    *((WT*)&privateData[py][px * PX_LOAD_NUM_PX + lx]) = p;
+                    srcPos.x++;
+                });
+            }
+        });
+    });
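+
+    // At this point privateData holds a (PX_PER_WI_Y + KERNEL_SIZE_Y - 1) x
+    // PRIV_DATA_WIDTH tile of input pixels (already converted to WT),
+    // anchored at (startX - ANCHOR_X, startY - ANCHOR_Y), so every
+    // privateData[py + sy][px + sx] access below stays in private memory.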
+
+    // Use the stored pixels to compute the results
+    py = 0;
+    LOOP(PX_PER_WI_Y, py,
+    {
+        int y = startY + py;
+        int px = 0;
+        LOOP(PX_PER_WI_X, px,
+        {
+            int x = startX + px;
+            WT total_sum = 0;
+            int sy = 0;
+            int kernelIndex = 0;
+            LOOP(KERNEL_SIZE_Y, sy,
+            {
+                int sx = 0;
+                LOOP(KERNEL_SIZE_X, sx,
+                {
+                    total_sum = mad(kernelData[kernelIndex++], privateData[py + sy][px + sx], total_sum);
+                });
+            });
+
+            __global dstT* dstPtr = (__global dstT*)(dstptr + y * dst_step + dst_offset + x * DSTSIZE); // Pointer can be out of bounds!
+            storepix(convertToDstT(total_sum + (WT)(delta)), dstPtr);
+        });
+    });
+}
diff --git a/modules/imgproc/test/ocl/test_filter2d.cpp b/modules/imgproc/test/ocl/test_filter2d.cpp
index 222990529..7da271802 100644
--- a/modules/imgproc/test/ocl/test_filter2d.cpp
+++ b/modules/imgproc/test/ocl/test_filter2d.cpp
@@ -51,7 +51,7 @@ namespace ocl {
 /////////////////////////////////////////////////////////////////////////////////////////////////
 // Filter2D
 
-PARAM_TEST_CASE(Filter2D, MatDepth, Channels, BorderType, bool, bool)
+PARAM_TEST_CASE(Filter2D, MatDepth, Channels, int, int, BorderType, bool, bool)
 {
     static const int kernelMinSize = 2;
     static const int kernelMaxSize = 10;
@@ -60,6 +60,7 @@ PARAM_TEST_CASE(Filter2D, MatDepth, Channels, BorderType, bool, bool)
     Size dsize;
     Point anchor;
     int borderType;
+    int widthMultiple;
     bool useRoi;
     Mat kernel;
     double delta;
@@ -70,27 +71,30 @@ PARAM_TEST_CASE(Filter2D, MatDepth, Channels, BorderType, bool, bool)
     virtual void SetUp()
     {
         type = CV_MAKE_TYPE(GET_PARAM(0), GET_PARAM(1));
-        borderType = GET_PARAM(2) | (GET_PARAM(3) ? BORDER_ISOLATED : 0);
-        useRoi = GET_PARAM(4);
+        Size ksize(GET_PARAM(2), GET_PARAM(2));
+        widthMultiple = GET_PARAM(3);
+        borderType = GET_PARAM(4) | (GET_PARAM(5) ? BORDER_ISOLATED : 0);
+        useRoi = GET_PARAM(6);
+
+        Mat temp = randomMat(ksize, CV_MAKE_TYPE(((CV_64F == CV_MAT_DEPTH(type)) ? CV_64F : CV_32F), 1), -MAX_VALUE, MAX_VALUE);
+        cv::normalize(temp, kernel, 1.0, 0.0, NORM_L1);
     }
 
     void random_roi()
     {
         dsize = randomSize(1, MAX_VALUE);
+        // Make the width a multiple of the requested value, but not a
+        // multiple of twice that value.
+        dsize.width &= ~((widthMultiple * 2) - 1);
+        dsize.width += widthMultiple;
 
-        Size ksize = randomSize(kernelMinSize, kernelMaxSize);
-        Mat temp = randomMat(ksize, CV_MAKE_TYPE(((CV_64F == CV_MAT_DEPTH(type)) ? CV_64F : CV_32F), 1), -MAX_VALUE, MAX_VALUE);
-        cv::normalize(temp, kernel, 1.0, 0.0, NORM_L1);
-
-        Size roiSize = randomSize(ksize.width, MAX_VALUE, ksize.height, MAX_VALUE);
+        Size roiSize = randomSize(kernel.size[0], MAX_VALUE, kernel.size[1], MAX_VALUE);
         Border srcBorder = randomBorder(0, useRoi ? MAX_VALUE : 0);
         randomSubMat(src, src_roi, roiSize, srcBorder, type, -MAX_VALUE, MAX_VALUE);
 
         Border dstBorder = randomBorder(0, useRoi ? MAX_VALUE : 0);
         randomSubMat(dst, dst_roi, dsize, dstBorder, type, -MAX_VALUE, MAX_VALUE);
 
-        anchor.x = randomInt(-1, ksize.width);
-        anchor.y = randomInt(-1, ksize.height);
+        anchor.x = randomInt(-1, kernel.size[0]);
+        anchor.y = randomInt(-1, kernel.size[1]);
 
         delta = randomDouble(-100, 100);
@@ -122,6 +126,8 @@ OCL_INSTANTIATE_TEST_CASE_P(ImageProc, Filter2D, Combine(
                             Values(CV_8U, CV_16U, CV_32F),
                             OCL_ALL_CHANNELS,
+                            Values(3, 5, 9), // Kernel size
+                            Values(1, 4, 8), // Width multiple
                             Values((BorderType)BORDER_CONSTANT,
                                    (BorderType)BORDER_REPLICATE,
                                    (BorderType)BORDER_REFLECT,