Merge pull request #2009 from krodyush:pullreq/2.4-opt-131211-sepFilterSinglePass_final
This commit is contained in:
		@@ -287,7 +287,7 @@ ocl::createSeparableLinearFilter_GPU
 | 
			
		||||
----------------------------------------
 | 
			
		||||
Creates a separable linear filter engine.
 | 
			
		||||
 | 
			
		||||
.. ocv:function:: Ptr<FilterEngine_GPU> ocl::createSeparableLinearFilter_GPU(int srcType, int dstType, const Mat &rowKernel, const Mat &columnKernel, const Point &anchor = Point(-1, -1), double delta = 0.0, int bordertype = BORDER_DEFAULT)
 | 
			
		||||
.. ocv:function:: Ptr<FilterEngine_GPU> ocl::createSeparableLinearFilter_GPU(int srcType, int dstType, const Mat &rowKernel, const Mat &columnKernel, const Point &anchor = Point(-1, -1), double delta = 0.0, int bordertype = BORDER_DEFAULT, Size imgSize = Size(-1,-1) )
 | 
			
		||||
 | 
			
		||||
    :param srcType: Source array type.  ``CV_8UC1`` , ``CV_8UC4`` , ``CV_16SC1`` , ``CV_16SC2`` , ``CV_16SC3`` , ``CV_32SC1`` , ``CV_32FC1``  source types are supported.
 | 
			
		||||
 | 
			
		||||
@@ -303,6 +303,8 @@ Creates a separable linear filter engine.
 | 
			
		||||
 | 
			
		||||
    :param bordertype: Pixel extrapolation method.
 | 
			
		||||
 | 
			
		||||
    :param imgSize: Source image size to choose optimal method for processing.
 | 
			
		||||
 | 
			
		||||
.. seealso:: :ocv:func:`ocl::getLinearRowFilter_GPU`, :ocv:func:`ocl::getLinearColumnFilter_GPU`, :ocv:func:`createSeparableLinearFilter`
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@@ -334,7 +336,7 @@ ocl::createDerivFilter_GPU
 | 
			
		||||
------------------------------
 | 
			
		||||
Creates a filter engine for the generalized Sobel operator.
 | 
			
		||||
 | 
			
		||||
.. ocv:function:: Ptr<FilterEngine_GPU> ocl::createDerivFilter_GPU( int srcType, int dstType, int dx, int dy, int ksize, int borderType = BORDER_DEFAULT )
 | 
			
		||||
.. ocv:function:: Ptr<FilterEngine_GPU> ocl::createDerivFilter_GPU( int srcType, int dstType, int dx, int dy, int ksize, int borderType = BORDER_DEFAULT, Size imgSize = Size(-1,-1) )
 | 
			
		||||
 | 
			
		||||
    :param srcType: Source image type.  ``CV_8UC1`` , ``CV_8UC4`` , ``CV_16SC1`` , ``CV_16SC2`` , ``CV_16SC3`` , ``CV_32SC1`` , ``CV_32FC1``  source types are supported.
 | 
			
		||||
 | 
			
		||||
@@ -348,6 +350,8 @@ Creates a filter engine for the generalized Sobel operator.
 | 
			
		||||
 | 
			
		||||
    :param borderType: Pixel extrapolation method. For details, see  :ocv:func:`borderInterpolate`.
 | 
			
		||||
 | 
			
		||||
    :param imgSize: Source image size to choose optimal method for processing.
 | 
			
		||||
 | 
			
		||||
.. seealso:: :ocv:func:`ocl::createSeparableLinearFilter_GPU`, :ocv:func:`createDerivFilter`
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@@ -405,7 +409,7 @@ ocl::createGaussianFilter_GPU
 | 
			
		||||
---------------------------------
 | 
			
		||||
Creates a Gaussian filter engine.
 | 
			
		||||
 | 
			
		||||
.. ocv:function:: Ptr<FilterEngine_GPU> ocl::createGaussianFilter_GPU(int type, Size ksize, double sigma1, double sigma2 = 0, int bordertype = BORDER_DEFAULT)
 | 
			
		||||
.. ocv:function:: Ptr<FilterEngine_GPU> ocl::createGaussianFilter_GPU(int type, Size ksize, double sigma1, double sigma2 = 0, int bordertype = BORDER_DEFAULT, Size imgSize = Size(-1,-1) )
 | 
			
		||||
 | 
			
		||||
    :param type: Source and destination image type.  ``CV_8UC1`` , ``CV_8UC4`` , ``CV_16SC1`` , ``CV_16SC2`` , ``CV_16SC3`` , ``CV_32SC1`` , ``CV_32FC1`` are supported.
 | 
			
		||||
 | 
			
		||||
@@ -417,6 +421,8 @@ Creates a Gaussian filter engine.
 | 
			
		||||
 | 
			
		||||
    :param bordertype: Pixel extrapolation method. For details, see  :ocv:func:`borderInterpolate`.
 | 
			
		||||
 | 
			
		||||
    :param imgSize: Source image size to choose optimal method for processing.
 | 
			
		||||
 | 
			
		||||
.. seealso:: :ocv:func:`ocl::createSeparableLinearFilter_GPU`, :ocv:func:`createGaussianFilter`
 | 
			
		||||
 | 
			
		||||
ocl::GaussianBlur
 | 
			
		||||
 
 | 
			
		||||
@@ -706,17 +706,17 @@ namespace cv
 | 
			
		||||
 | 
			
		||||
        //! returns the separable linear filter engine
 | 
			
		||||
        CV_EXPORTS Ptr<FilterEngine_GPU> createSeparableLinearFilter_GPU(int srcType, int dstType, const Mat &rowKernel,
 | 
			
		||||
                const Mat &columnKernel, const Point &anchor = Point(-1, -1), double delta = 0.0, int bordertype = BORDER_DEFAULT);
 | 
			
		||||
                const Mat &columnKernel, const Point &anchor = Point(-1, -1), double delta = 0.0, int bordertype = BORDER_DEFAULT, Size imgSize = Size(-1,-1));
 | 
			
		||||
 | 
			
		||||
        //! returns the separable filter engine with the specified filters
 | 
			
		||||
        CV_EXPORTS Ptr<FilterEngine_GPU> createSeparableFilter_GPU(const Ptr<BaseRowFilter_GPU> &rowFilter,
 | 
			
		||||
                const Ptr<BaseColumnFilter_GPU> &columnFilter);
 | 
			
		||||
 | 
			
		||||
        //! returns the Gaussian filter engine
 | 
			
		||||
        CV_EXPORTS Ptr<FilterEngine_GPU> createGaussianFilter_GPU(int type, Size ksize, double sigma1, double sigma2 = 0, int bordertype = BORDER_DEFAULT);
 | 
			
		||||
        CV_EXPORTS Ptr<FilterEngine_GPU> createGaussianFilter_GPU(int type, Size ksize, double sigma1, double sigma2 = 0, int bordertype = BORDER_DEFAULT, Size imgSize = Size(-1,-1));
 | 
			
		||||
 | 
			
		||||
        //! returns filter engine for the generalized Sobel operator
 | 
			
		||||
        CV_EXPORTS Ptr<FilterEngine_GPU> createDerivFilter_GPU( int srcType, int dstType, int dx, int dy, int ksize, int borderType = BORDER_DEFAULT );
 | 
			
		||||
        CV_EXPORTS Ptr<FilterEngine_GPU> createDerivFilter_GPU( int srcType, int dstType, int dx, int dy, int ksize, int borderType = BORDER_DEFAULT, Size imgSize = Size(-1,-1) );
 | 
			
		||||
 | 
			
		||||
        //! applies Laplacian operator to the image
 | 
			
		||||
        // supports only ksize = 1 and ksize = 3
 | 
			
		||||
@@ -869,7 +869,6 @@ namespace cv
 | 
			
		||||
        CV_EXPORTS void cornerMinEigenVal(const oclMat &src, oclMat &dst, int blockSize, int ksize, int bordertype = cv::BORDER_DEFAULT);
 | 
			
		||||
        CV_EXPORTS void cornerMinEigenVal_dxdy(const oclMat &src, oclMat &dst, oclMat &Dx, oclMat &Dy,
 | 
			
		||||
            int blockSize, int ksize, int bordertype = cv::BORDER_DEFAULT);
 | 
			
		||||
 | 
			
		||||
        /////////////////////////////////// ML ///////////////////////////////////////////
 | 
			
		||||
 | 
			
		||||
        //! Compute closest centers for each lines in source and lable it after center's index
 | 
			
		||||
 
 | 
			
		||||
@@ -739,6 +739,135 @@ void cv::ocl::filter2D(const oclMat &src, oclMat &dst, int ddepth, const Mat &ke
 | 
			
		||||
    f->apply(src, dst);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
const int optimizedSepFilterLocalSize = 16;
 | 
			
		||||
static void sepFilter2D_SinglePass(const oclMat &src, oclMat &dst,
 | 
			
		||||
                                   const Mat &row_kernel, const Mat &col_kernel, int bordertype = BORDER_DEFAULT)
 | 
			
		||||
{
 | 
			
		||||
    size_t lt2[3] = {optimizedSepFilterLocalSize, optimizedSepFilterLocalSize, 1};
 | 
			
		||||
    size_t gt2[3] = {lt2[0]*(1 + (src.cols-1) / lt2[0]), lt2[1]*(1 + (src.rows-1) / lt2[1]), 1};
 | 
			
		||||
 | 
			
		||||
    unsigned int src_pitch = src.step;
 | 
			
		||||
    unsigned int dst_pitch = dst.step;
 | 
			
		||||
 | 
			
		||||
    int src_offset_x = (src.offset % src.step) / src.elemSize();
 | 
			
		||||
    int src_offset_y = src.offset / src.step;
 | 
			
		||||
 | 
			
		||||
    std::vector<std::pair<size_t , const void *> > args;
 | 
			
		||||
    args.push_back( std::make_pair( sizeof(cl_mem)  , (void *)&src.data ));
 | 
			
		||||
    args.push_back( std::make_pair( sizeof(cl_uint) , (void *)&src_pitch ));
 | 
			
		||||
 | 
			
		||||
    args.push_back( std::make_pair( sizeof(cl_int)  , (void *)&src_offset_x ));
 | 
			
		||||
    args.push_back( std::make_pair( sizeof(cl_int)  , (void *)&src_offset_y ));
 | 
			
		||||
 | 
			
		||||
    args.push_back( std::make_pair( sizeof(cl_mem)  , (void *)&dst.data ));
 | 
			
		||||
    args.push_back( std::make_pair( sizeof(cl_int)  , (void *)&dst.offset ));
 | 
			
		||||
    args.push_back( std::make_pair( sizeof(cl_uint) , (void *)&dst_pitch ));
 | 
			
		||||
 | 
			
		||||
    args.push_back( std::make_pair( sizeof(cl_int)  , (void *)&src.wholecols ));
 | 
			
		||||
    args.push_back( std::make_pair( sizeof(cl_int)  , (void *)&src.wholerows ));
 | 
			
		||||
 | 
			
		||||
    args.push_back( std::make_pair( sizeof(cl_int)  , (void *)&dst.cols ));
 | 
			
		||||
    args.push_back( std::make_pair( sizeof(cl_int)  , (void *)&dst.rows ));
 | 
			
		||||
 | 
			
		||||
    string option = cv::format("-D BLK_X=%d -D BLK_Y=%d -D RADIUSX=%d -D RADIUSY=%d",(int)lt2[0], (int)lt2[1],
 | 
			
		||||
        row_kernel.rows / 2, col_kernel.rows / 2 );
 | 
			
		||||
 | 
			
		||||
    option += " -D KERNEL_MATRIX_X=";
 | 
			
		||||
    for(int i=0; i<row_kernel.rows; i++)
 | 
			
		||||
        option += cv::format("0x%x,", *reinterpret_cast<const unsigned int*>( &row_kernel.at<float>(i) ) );
 | 
			
		||||
    option += "0x0";
 | 
			
		||||
 | 
			
		||||
    option += " -D KERNEL_MATRIX_Y=";
 | 
			
		||||
    for(int i=0; i<col_kernel.rows; i++)
 | 
			
		||||
        option += cv::format("0x%x,", *reinterpret_cast<const unsigned int*>( &col_kernel.at<float>(i) ) );
 | 
			
		||||
    option += "0x0";
 | 
			
		||||
 | 
			
		||||
    switch(src.type())
 | 
			
		||||
    {
 | 
			
		||||
    case CV_8UC1:
 | 
			
		||||
        option += " -D SRCTYPE=uchar -D CONVERT_SRCTYPE=convert_float -D WORKTYPE=float";
 | 
			
		||||
        break;
 | 
			
		||||
    case CV_32FC1:
 | 
			
		||||
        option += " -D SRCTYPE=float -D CONVERT_SRCTYPE= -D WORKTYPE=float";
 | 
			
		||||
        break;
 | 
			
		||||
    case CV_8UC2:
 | 
			
		||||
        option += " -D SRCTYPE=uchar2 -D CONVERT_SRCTYPE=convert_float2 -D WORKTYPE=float2";
 | 
			
		||||
        break;
 | 
			
		||||
    case CV_32FC2:
 | 
			
		||||
        option += " -D SRCTYPE=float2 -D CONVERT_SRCTYPE= -D WORKTYPE=float2";
 | 
			
		||||
        break;
 | 
			
		||||
    case CV_8UC3:
 | 
			
		||||
        option += " -D SRCTYPE=uchar3 -D CONVERT_SRCTYPE=convert_float3 -D WORKTYPE=float3";
 | 
			
		||||
        break;
 | 
			
		||||
    case CV_32FC3:
 | 
			
		||||
        option += " -D SRCTYPE=float3 -D CONVERT_SRCTYPE= -D WORKTYPE=float3";
 | 
			
		||||
        break;
 | 
			
		||||
    case CV_8UC4:
 | 
			
		||||
        option += " -D SRCTYPE=uchar4 -D CONVERT_SRCTYPE=convert_float4 -D WORKTYPE=float4";
 | 
			
		||||
        break;
 | 
			
		||||
    case CV_32FC4:
 | 
			
		||||
        option += " -D SRCTYPE=float4 -D CONVERT_SRCTYPE= -D WORKTYPE=float4";
 | 
			
		||||
        break;
 | 
			
		||||
    default:
 | 
			
		||||
        CV_Error(CV_StsUnsupportedFormat, "Image type is not supported!");
 | 
			
		||||
        break;
 | 
			
		||||
    }
 | 
			
		||||
    switch(dst.type())
 | 
			
		||||
    {
 | 
			
		||||
    case CV_8UC1:
 | 
			
		||||
        option += " -D DSTTYPE=uchar -D CONVERT_DSTTYPE=convert_uchar_sat";
 | 
			
		||||
        break;
 | 
			
		||||
    case CV_8UC2:
 | 
			
		||||
        option += " -D DSTTYPE=uchar2 -D CONVERT_DSTTYPE=convert_uchar2_sat";
 | 
			
		||||
        break;
 | 
			
		||||
    case CV_8UC3:
 | 
			
		||||
        option += " -D DSTTYPE=uchar3 -D CONVERT_DSTTYPE=convert_uchar3_sat";
 | 
			
		||||
        break;
 | 
			
		||||
    case CV_8UC4:
 | 
			
		||||
        option += " -D DSTTYPE=uchar4 -D CONVERT_DSTTYPE=convert_uchar4_sat";
 | 
			
		||||
        break;
 | 
			
		||||
    case CV_32FC1:
 | 
			
		||||
        option += " -D DSTTYPE=float -D CONVERT_DSTTYPE=";
 | 
			
		||||
        break;
 | 
			
		||||
    case CV_32FC2:
 | 
			
		||||
        option += " -D DSTTYPE=float2 -D CONVERT_DSTTYPE=";
 | 
			
		||||
        break;
 | 
			
		||||
    case CV_32FC3:
 | 
			
		||||
        option += " -D DSTTYPE=float3 -D CONVERT_DSTTYPE=";
 | 
			
		||||
        break;
 | 
			
		||||
    case CV_32FC4:
 | 
			
		||||
        option += " -D DSTTYPE=float4 -D CONVERT_DSTTYPE=";
 | 
			
		||||
        break;
 | 
			
		||||
    default:
 | 
			
		||||
        CV_Error(CV_StsUnsupportedFormat, "Image type is not supported!");
 | 
			
		||||
        break;
 | 
			
		||||
    }
 | 
			
		||||
    switch(bordertype)
 | 
			
		||||
    {
 | 
			
		||||
    case cv::BORDER_CONSTANT:
 | 
			
		||||
        option += " -D BORDER_CONSTANT";
 | 
			
		||||
        break;
 | 
			
		||||
    case cv::BORDER_REPLICATE:
 | 
			
		||||
        option += " -D BORDER_REPLICATE";
 | 
			
		||||
        break;
 | 
			
		||||
    case cv::BORDER_REFLECT:
 | 
			
		||||
        option += " -D BORDER_REFLECT";
 | 
			
		||||
        break;
 | 
			
		||||
    case cv::BORDER_REFLECT101:
 | 
			
		||||
        option += " -D BORDER_REFLECT_101";
 | 
			
		||||
        break;
 | 
			
		||||
    case cv::BORDER_WRAP:
 | 
			
		||||
        option += " -D BORDER_WRAP";
 | 
			
		||||
        break;
 | 
			
		||||
    default:
 | 
			
		||||
        CV_Error(CV_StsBadFlag, "BORDER type is not supported!");
 | 
			
		||||
        break;
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    openCLExecuteKernel(src.clCxt, &filtering_sep_filter_singlepass, "sep_filter_singlepass", gt2, lt2, args,
 | 
			
		||||
        -1, -1, option.c_str() );
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
////////////////////////////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
// SeparableFilter
 | 
			
		||||
 | 
			
		||||
@@ -788,6 +917,35 @@ Ptr<FilterEngine_GPU> cv::ocl::createSeparableFilter_GPU(const Ptr<BaseRowFilter
 | 
			
		||||
    return Ptr<FilterEngine_GPU>(new SeparableFilterEngine_GPU(rowFilter, columnFilter));
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
namespace
 | 
			
		||||
{
 | 
			
		||||
class SingleStepSeparableFilterEngine_GPU : public FilterEngine_GPU
 | 
			
		||||
{
 | 
			
		||||
public:
 | 
			
		||||
    SingleStepSeparableFilterEngine_GPU( const Mat &rowKernel_, const Mat &columnKernel_, const int btype )
 | 
			
		||||
    {
 | 
			
		||||
        bordertype = btype;
 | 
			
		||||
        rowKernel = rowKernel_;
 | 
			
		||||
        columnKernel = columnKernel_;
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    virtual void apply(const oclMat &src, oclMat &dst, Rect roi = Rect(0, 0, -1, -1))
 | 
			
		||||
    {
 | 
			
		||||
        normalizeROI(roi, Size(rowKernel.rows, columnKernel.rows), Point(-1,-1), src.size());
 | 
			
		||||
 | 
			
		||||
        oclMat srcROI = src(roi);
 | 
			
		||||
        oclMat dstROI = dst(roi);
 | 
			
		||||
 | 
			
		||||
        sepFilter2D_SinglePass(src, dst, rowKernel, columnKernel, bordertype);
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    Mat rowKernel;
 | 
			
		||||
    Mat columnKernel;
 | 
			
		||||
    int bordertype;
 | 
			
		||||
};
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
static void GPUFilterBox(const oclMat &src, oclMat &dst,
 | 
			
		||||
                         Size &ksize, const Point anchor, const int borderType)
 | 
			
		||||
{
 | 
			
		||||
@@ -1241,17 +1399,32 @@ Ptr<BaseColumnFilter_GPU> cv::ocl::getLinearColumnFilter_GPU(int /*bufType*/, in
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
Ptr<FilterEngine_GPU> cv::ocl::createSeparableLinearFilter_GPU(int srcType, int dstType,
 | 
			
		||||
        const Mat &rowKernel, const Mat &columnKernel, const Point &anchor, double delta, int bordertype)
 | 
			
		||||
        const Mat &rowKernel, const Mat &columnKernel, const Point &anchor, double delta, int bordertype, Size imgSize )
 | 
			
		||||
{
 | 
			
		||||
    int sdepth = CV_MAT_DEPTH(srcType), ddepth = CV_MAT_DEPTH(dstType);
 | 
			
		||||
    int cn = CV_MAT_CN(srcType);
 | 
			
		||||
    int bdepth = std::max(std::max(sdepth, ddepth), CV_32F);
 | 
			
		||||
    int bufType = CV_MAKETYPE(bdepth, cn);
 | 
			
		||||
    Context* clCxt = Context::getContext();
 | 
			
		||||
 | 
			
		||||
    //if image size is non-degenerate and large enough
 | 
			
		||||
    //and if filter support is reasonable to satisfy larger local memory requirements,
 | 
			
		||||
    //then we can use single pass routine to avoid extra runtime calls overhead
 | 
			
		||||
    if( clCxt && clCxt->supportsFeature(FEATURE_CL_INTEL_DEVICE) &&
 | 
			
		||||
        rowKernel.rows <= 21 && columnKernel.rows <= 21 &&
 | 
			
		||||
        (rowKernel.rows & 1) == 1 && (columnKernel.rows & 1) == 1 &&
 | 
			
		||||
        imgSize.width > optimizedSepFilterLocalSize + (rowKernel.rows>>1) &&
 | 
			
		||||
        imgSize.height > optimizedSepFilterLocalSize + (columnKernel.rows>>1) )
 | 
			
		||||
    {
 | 
			
		||||
        return Ptr<FilterEngine_GPU>(new SingleStepSeparableFilterEngine_GPU(rowKernel, columnKernel, bordertype));
 | 
			
		||||
    }
 | 
			
		||||
    else
 | 
			
		||||
    {
 | 
			
		||||
        Ptr<BaseRowFilter_GPU> rowFilter = getLinearRowFilter_GPU(srcType, bufType, rowKernel, anchor.x, bordertype);
 | 
			
		||||
        Ptr<BaseColumnFilter_GPU> columnFilter = getLinearColumnFilter_GPU(bufType, dstType, columnKernel, anchor.y, bordertype, delta);
 | 
			
		||||
 | 
			
		||||
        return createSeparableFilter_GPU(rowFilter, columnFilter);
 | 
			
		||||
    }
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
void cv::ocl::sepFilter2D(const oclMat &src, oclMat &dst, int ddepth, const Mat &kernelX, const Mat &kernelY, Point anchor, double delta, int bordertype)
 | 
			
		||||
@@ -1275,16 +1448,16 @@ void cv::ocl::sepFilter2D(const oclMat &src, oclMat &dst, int ddepth, const Mat
 | 
			
		||||
 | 
			
		||||
    dst.create(src.size(), CV_MAKETYPE(ddepth, src.channels()));
 | 
			
		||||
 | 
			
		||||
    Ptr<FilterEngine_GPU> f = createSeparableLinearFilter_GPU(src.type(), dst.type(), kernelX, kernelY, anchor, delta, bordertype);
 | 
			
		||||
    Ptr<FilterEngine_GPU> f = createSeparableLinearFilter_GPU(src.type(), dst.type(), kernelX, kernelY, anchor, delta, bordertype, src.size());
 | 
			
		||||
    f->apply(src, dst);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
Ptr<FilterEngine_GPU> cv::ocl::createDerivFilter_GPU(int srcType, int dstType, int dx, int dy, int ksize, int borderType)
 | 
			
		||||
Ptr<FilterEngine_GPU> cv::ocl::createDerivFilter_GPU(int srcType, int dstType, int dx, int dy, int ksize, int borderType, Size imgSize )
 | 
			
		||||
{
 | 
			
		||||
    Mat kx, ky;
 | 
			
		||||
    getDerivKernels(kx, ky, dx, dy, ksize, false, CV_32F);
 | 
			
		||||
    return createSeparableLinearFilter_GPU(srcType, dstType,
 | 
			
		||||
                                           kx, ky, Point(-1, -1), 0, borderType);
 | 
			
		||||
                                           kx, ky, Point(-1, -1), 0, borderType, imgSize);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
////////////////////////////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
@@ -1354,7 +1527,7 @@ void cv::ocl::Laplacian(const oclMat &src, oclMat &dst, int ddepth, int ksize, d
 | 
			
		||||
////////////////////////////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
// Gaussian Filter
 | 
			
		||||
 | 
			
		||||
Ptr<FilterEngine_GPU> cv::ocl::createGaussianFilter_GPU(int type, Size ksize, double sigma1, double sigma2, int bordertype)
 | 
			
		||||
Ptr<FilterEngine_GPU> cv::ocl::createGaussianFilter_GPU(int type, Size ksize, double sigma1, double sigma2, int bordertype, Size imgSize)
 | 
			
		||||
{
 | 
			
		||||
    int depth = CV_MAT_DEPTH(type);
 | 
			
		||||
 | 
			
		||||
@@ -1381,7 +1554,7 @@ Ptr<FilterEngine_GPU> cv::ocl::createGaussianFilter_GPU(int type, Size ksize, do
 | 
			
		||||
    else
 | 
			
		||||
        ky = getGaussianKernel(ksize.height, sigma2, std::max(depth, CV_32F));
 | 
			
		||||
 | 
			
		||||
    return createSeparableLinearFilter_GPU(type, type, kx, ky, Point(-1, -1), 0.0, bordertype);
 | 
			
		||||
    return createSeparableLinearFilter_GPU(type, type, kx, ky, Point(-1, -1), 0.0, bordertype, imgSize);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
void cv::ocl::GaussianBlur(const oclMat &src, oclMat &dst, Size ksize, double sigma1, double sigma2, int bordertype)
 | 
			
		||||
@@ -1417,7 +1590,7 @@ void cv::ocl::GaussianBlur(const oclMat &src, oclMat &dst, Size ksize, double si
 | 
			
		||||
 | 
			
		||||
    dst.create(src.size(), src.type());
 | 
			
		||||
 | 
			
		||||
    Ptr<FilterEngine_GPU> f = createGaussianFilter_GPU(src.type(), ksize, sigma1, sigma2, bordertype);
 | 
			
		||||
    Ptr<FilterEngine_GPU> f = createGaussianFilter_GPU(src.type(), ksize, sigma1, sigma2, bordertype, src.size());
 | 
			
		||||
    f->apply(src, dst);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
 
 | 
			
		||||
							
								
								
									
										185
									
								
								modules/ocl/src/opencl/filtering_sep_filter_singlepass.cl
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										185
									
								
								modules/ocl/src/opencl/filtering_sep_filter_singlepass.cl
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,185 @@
 | 
			
		||||
/*M///////////////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
//
 | 
			
		||||
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
 | 
			
		||||
//
 | 
			
		||||
//  By downloading, copying, installing or using the software you agree to this license.
 | 
			
		||||
//  If you do not agree to this license, do not download, install,
 | 
			
		||||
//  copy or use the software.
 | 
			
		||||
//
 | 
			
		||||
//
 | 
			
		||||
//                           License Agreement
 | 
			
		||||
//                For Open Source Computer Vision Library
 | 
			
		||||
//
 | 
			
		||||
// Copyright (C) 2013, Intel Corporation, all rights reserved.
 | 
			
		||||
// Third party copyrights are property of their respective owners.
 | 
			
		||||
//
 | 
			
		||||
// Redistribution and use in source and binary forms, with or without modification,
 | 
			
		||||
// are permitted provided that the following conditions are met:
 | 
			
		||||
//
 | 
			
		||||
//   * Redistribution's of source code must retain the above copyright notice,
 | 
			
		||||
//     this list of conditions and the following disclaimer.
 | 
			
		||||
//
 | 
			
		||||
//   * Redistribution's in binary form must reproduce the above copyright notice,
 | 
			
		||||
//     this list of conditions and the following disclaimer in the documentation
 | 
			
		||||
//     and/or other materials provided with the distribution.
 | 
			
		||||
//
 | 
			
		||||
//   * The name of the copyright holders may not be used to endorse or promote products
 | 
			
		||||
//     derived from this software without specific prior written permission.
 | 
			
		||||
//
 | 
			
		||||
// This software is provided by the copyright holders and contributors "as is" and
 | 
			
		||||
// any express or implied warranties, including, but not limited to, the implied
 | 
			
		||||
// warranties of merchantability and fitness for a particular purpose are disclaimed.
 | 
			
		||||
// In no event shall the Intel Corporation or contributors be liable for any direct,
 | 
			
		||||
// indirect, incidental, special, exemplary, or consequential damages
 | 
			
		||||
// (including, but not limited to, procurement of substitute goods or services;
 | 
			
		||||
// loss of use, data, or profits; or business interruption) however caused
 | 
			
		||||
// and on any theory of liability, whether in contract, strict liability,
 | 
			
		||||
// or tort (including negligence or otherwise) arising in any way out of
 | 
			
		||||
// the use of this software, even if advised of the possibility of such damage.
 | 
			
		||||
//
 | 
			
		||||
//M*/
 | 
			
		||||
///////////////////////////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
/////////////////////////////////Macro for border type////////////////////////////////////////////
 | 
			
		||||
/////////////////////////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
 | 
			
		||||
#ifdef BORDER_CONSTANT
 | 
			
		||||
//CCCCCC|abcdefgh|CCCCCCC
 | 
			
		||||
#define EXTRAPOLATE(x, maxV)
 | 
			
		||||
#elif defined BORDER_REPLICATE
 | 
			
		||||
//aaaaaa|abcdefgh|hhhhhhh
 | 
			
		||||
#define EXTRAPOLATE(x, maxV) \
 | 
			
		||||
    { \
 | 
			
		||||
        (x) = max(min((x), (maxV) - 1), 0); \
 | 
			
		||||
    }
 | 
			
		||||
#elif defined BORDER_WRAP
 | 
			
		||||
//cdefgh|abcdefgh|abcdefg
 | 
			
		||||
#define EXTRAPOLATE(x, maxV) \
 | 
			
		||||
    { \
 | 
			
		||||
        (x) = ( (x) + (maxV) ) % (maxV); \
 | 
			
		||||
    }
 | 
			
		||||
#elif defined BORDER_REFLECT
 | 
			
		||||
//fedcba|abcdefgh|hgfedcb
 | 
			
		||||
#define EXTRAPOLATE(x, maxV) \
 | 
			
		||||
    { \
 | 
			
		||||
        (x) = min(((maxV)-1)*2-(x)+1, max((x),-(x)-1) ); \
 | 
			
		||||
    }
 | 
			
		||||
#elif defined BORDER_REFLECT_101
 | 
			
		||||
//gfedcb|abcdefgh|gfedcba
 | 
			
		||||
#define EXTRAPOLATE(x, maxV) \
 | 
			
		||||
    { \
 | 
			
		||||
        (x) = min(((maxV)-1)*2-(x), max((x),-(x)) ); \
 | 
			
		||||
    }
 | 
			
		||||
#else
 | 
			
		||||
#error No extrapolation method
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
#define SRC(_x,_y) CONVERT_SRCTYPE(((global SRCTYPE*)(Src+(_y)*SrcPitch))[_x])
 | 
			
		||||
 | 
			
		||||
#ifdef BORDER_CONSTANT
 | 
			
		||||
//CCCCCC|abcdefgh|CCCCCCC
 | 
			
		||||
#define ELEM(_x,_y,r_edge,t_edge,const_v) (_x)<0 | (_x) >= (r_edge) | (_y)<0 | (_y) >= (t_edge) ? (const_v) : SRC((_x),(_y))
 | 
			
		||||
#else
 | 
			
		||||
#define ELEM(_x,_y,r_edge,t_edge,const_v) SRC((_x),(_y))
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
#define DST(_x,_y) (((global DSTTYPE*)(Dst+DstOffset+(_y)*DstPitch))[_x])
 | 
			
		||||
 | 
			
		||||
//horizontal and vertical filter kernels
 | 
			
		||||
//should be defined on host during compile time to avoid overhead
 | 
			
		||||
__constant uint mat_kernelX[] = {KERNEL_MATRIX_X};
 | 
			
		||||
__constant uint mat_kernelY[] = {KERNEL_MATRIX_Y};
 | 
			
		||||
 | 
			
		||||
__kernel __attribute__((reqd_work_group_size(BLK_X,BLK_Y,1))) void sep_filter_singlepass
 | 
			
		||||
        (
 | 
			
		||||
        __global uchar* Src,
 | 
			
		||||
        const uint      SrcPitch,
 | 
			
		||||
        const int       srcOffsetX,
 | 
			
		||||
        const int       srcOffsetY,
 | 
			
		||||
        __global uchar* Dst,
 | 
			
		||||
        const int       DstOffset,
 | 
			
		||||
        const uint      DstPitch,
 | 
			
		||||
        int             width,
 | 
			
		||||
        int             height,
 | 
			
		||||
        int             dstWidth,
 | 
			
		||||
        int             dstHeight
 | 
			
		||||
        )
 | 
			
		||||
{
 | 
			
		||||
    //RADIUSX, RADIUSY are filter dimensions
 | 
			
		||||
    //BLK_X, BLK_Y are local wrogroup sizes
 | 
			
		||||
    //all these should be defined on host during compile time
 | 
			
		||||
    //first lsmem array for source pixels used in first pass,
 | 
			
		||||
    //second lsmemDy for storing first pass results
 | 
			
		||||
    __local WORKTYPE lsmem[BLK_Y+2*RADIUSY][BLK_X+2*RADIUSX];
 | 
			
		||||
    __local WORKTYPE lsmemDy[BLK_Y][BLK_X+2*RADIUSX];
 | 
			
		||||
 | 
			
		||||
    //get local and global ids - used as image and local memory array indexes
 | 
			
		||||
    int lix = get_local_id(0);
 | 
			
		||||
    int liy = get_local_id(1);
 | 
			
		||||
 | 
			
		||||
    int x = (int)get_global_id(0);
 | 
			
		||||
    int y = (int)get_global_id(1);
 | 
			
		||||
 | 
			
		||||
    //calculate pixel position in source image taking image offset into account
 | 
			
		||||
    int srcX = x + srcOffsetX - RADIUSX;
 | 
			
		||||
    int srcY = y + srcOffsetY - RADIUSY;
 | 
			
		||||
    int xb = srcX;
 | 
			
		||||
    int yb = srcY;
 | 
			
		||||
 | 
			
		||||
    //extrapolate coordinates, if needed
 | 
			
		||||
    //and read my own source pixel into local memory
 | 
			
		||||
    //with account for extra border pixels, which will be read by starting workitems
 | 
			
		||||
    int clocY = liy;
 | 
			
		||||
    int cSrcY = srcY;
 | 
			
		||||
    do
 | 
			
		||||
    {
 | 
			
		||||
        int yb = cSrcY;
 | 
			
		||||
        EXTRAPOLATE(yb, (height));
 | 
			
		||||
 | 
			
		||||
        int clocX = lix;
 | 
			
		||||
        int cSrcX = srcX;
 | 
			
		||||
        do
 | 
			
		||||
        {
 | 
			
		||||
            int xb = cSrcX;
 | 
			
		||||
            EXTRAPOLATE(xb,(width));
 | 
			
		||||
            lsmem[clocY][clocX] = ELEM(xb, yb, (width), (height), 0 );
 | 
			
		||||
 | 
			
		||||
            clocX += BLK_X;
 | 
			
		||||
            cSrcX += BLK_X;
 | 
			
		||||
        }
 | 
			
		||||
        while(clocX < BLK_X+(RADIUSX*2));
 | 
			
		||||
 | 
			
		||||
        clocY += BLK_Y;
 | 
			
		||||
        cSrcY += BLK_Y;
 | 
			
		||||
    }
 | 
			
		||||
    while(clocY < BLK_Y+(RADIUSY*2));
 | 
			
		||||
    barrier(CLK_LOCAL_MEM_FENCE);
 | 
			
		||||
 | 
			
		||||
    //do vertical filter pass
 | 
			
		||||
    //and store intermediate results to second local memory array
 | 
			
		||||
    int i;
 | 
			
		||||
    WORKTYPE sum = 0.0f;
 | 
			
		||||
    int clocX = lix;
 | 
			
		||||
    do
 | 
			
		||||
    {
 | 
			
		||||
        sum = 0.0f;
 | 
			
		||||
        for(i=0; i<=2*RADIUSY; i++)
 | 
			
		||||
            sum = mad(lsmem[liy+i][clocX], as_float(mat_kernelY[i]), sum);
 | 
			
		||||
        lsmemDy[liy][clocX] = sum;
 | 
			
		||||
        clocX += BLK_X;
 | 
			
		||||
    }
 | 
			
		||||
    while(clocX < BLK_X+(RADIUSX*2));
 | 
			
		||||
    barrier(CLK_LOCAL_MEM_FENCE);
 | 
			
		||||
 | 
			
		||||
    //if this pixel happened to be out of image borders because of global size rounding,
 | 
			
		||||
    //then just return
 | 
			
		||||
    if( x >= dstWidth || y >=dstHeight )  return;
 | 
			
		||||
 | 
			
		||||
    //do second horizontal filter pass
 | 
			
		||||
    //and calculate final result
 | 
			
		||||
    sum = 0.0f;
 | 
			
		||||
    for(i=0; i<=2*RADIUSX; i++)
 | 
			
		||||
        sum = mad(lsmemDy[liy][lix+i], as_float(mat_kernelX[i]), sum);
 | 
			
		||||
 | 
			
		||||
    //store result into destination image
 | 
			
		||||
    DST(x,y) = CONVERT_DSTTYPE(sum);
 | 
			
		||||
}
 | 
			
		||||
		Reference in New Issue
	
	Block a user