moved mulSpectrums, dft and convolve to gpuarithm

2013-04-09 15:49:56 +04:00
parent c56bdbc1c5
commit d569e72ad4
15 changed files with 1014 additions and 993 deletions
--- a/modules/gpu/src/cuda/imgproc.cu
+++ b/modules/gpu/src/cuda/imgproc.cu
@@ -582,119 +582,6 @@ namespace cv { namespace gpu { namespace cudev
                cudaSafeCall(cudaDeviceSynchronize());
        }

-        //////////////////////////////////////////////////////////////////////////
-        // mulSpectrums
-
-        __global__ void mulSpectrumsKernel(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b, PtrStepSz<cufftComplex> c)
-        {
-            const int x = blockIdx.x * blockDim.x + threadIdx.x;
-            const int y = blockIdx.y * blockDim.y + threadIdx.y;
-
-            if (x < c.cols && y < c.rows)
-            {
-                c.ptr(y)[x] = cuCmulf(a.ptr(y)[x], b.ptr(y)[x]);
-            }
-        }
-
-
-        void mulSpectrums(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b, PtrStepSz<cufftComplex> c, cudaStream_t stream)
-        {
-            dim3 threads(256);
-            dim3 grid(divUp(c.cols, threads.x), divUp(c.rows, threads.y));
-
-            mulSpectrumsKernel<<<grid, threads, 0, stream>>>(a, b, c);
-            cudaSafeCall( cudaGetLastError() );
-
-            if (stream == 0)
-                cudaSafeCall( cudaDeviceSynchronize() );
-        }
-
-
-        //////////////////////////////////////////////////////////////////////////
-        // mulSpectrums_CONJ
-
-        __global__ void mulSpectrumsKernel_CONJ(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b, PtrStepSz<cufftComplex> c)
-        {
-            const int x = blockIdx.x * blockDim.x + threadIdx.x;
-            const int y = blockIdx.y * blockDim.y + threadIdx.y;
-
-            if (x < c.cols && y < c.rows)
-            {
-                c.ptr(y)[x] = cuCmulf(a.ptr(y)[x], cuConjf(b.ptr(y)[x]));
-            }
-        }
-
-
-        void mulSpectrums_CONJ(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b, PtrStepSz<cufftComplex> c, cudaStream_t stream)
-        {
-            dim3 threads(256);
-            dim3 grid(divUp(c.cols, threads.x), divUp(c.rows, threads.y));
-
-            mulSpectrumsKernel_CONJ<<<grid, threads, 0, stream>>>(a, b, c);
-            cudaSafeCall( cudaGetLastError() );
-
-            if (stream == 0)
-                cudaSafeCall( cudaDeviceSynchronize() );
-        }
-
-
-        //////////////////////////////////////////////////////////////////////////
-        // mulAndScaleSpectrums
-
-        __global__ void mulAndScaleSpectrumsKernel(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b, float scale, PtrStepSz<cufftComplex> c)
-        {
-            const int x = blockIdx.x * blockDim.x + threadIdx.x;
-            const int y = blockIdx.y * blockDim.y + threadIdx.y;
-
-            if (x < c.cols && y < c.rows)
-            {
-                cufftComplex v = cuCmulf(a.ptr(y)[x], b.ptr(y)[x]);
-                c.ptr(y)[x] = make_cuFloatComplex(cuCrealf(v) * scale, cuCimagf(v) * scale);
-            }
-        }
-
-
-        void mulAndScaleSpectrums(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b, float scale, PtrStepSz<cufftComplex> c, cudaStream_t stream)
-        {
-            dim3 threads(256);
-            dim3 grid(divUp(c.cols, threads.x), divUp(c.rows, threads.y));
-
-            mulAndScaleSpectrumsKernel<<<grid, threads, 0, stream>>>(a, b, scale, c);
-            cudaSafeCall( cudaGetLastError() );
-
-            if (stream)
-                cudaSafeCall( cudaDeviceSynchronize() );
-        }
-
-
-        //////////////////////////////////////////////////////////////////////////
-        // mulAndScaleSpectrums_CONJ
-
-        __global__ void mulAndScaleSpectrumsKernel_CONJ(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b, float scale, PtrStepSz<cufftComplex> c)
-        {
-            const int x = blockIdx.x * blockDim.x + threadIdx.x;
-            const int y = blockIdx.y * blockDim.y + threadIdx.y;
-
-            if (x < c.cols && y < c.rows)
-            {
-                cufftComplex v = cuCmulf(a.ptr(y)[x], cuConjf(b.ptr(y)[x]));
-                c.ptr(y)[x] = make_cuFloatComplex(cuCrealf(v) * scale, cuCimagf(v) * scale);
-            }
-        }
-
-
-        void mulAndScaleSpectrums_CONJ(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b, float scale, PtrStepSz<cufftComplex> c, cudaStream_t stream)
-        {
-            dim3 threads(256);
-            dim3 grid(divUp(c.cols, threads.x), divUp(c.rows, threads.y));
-
-            mulAndScaleSpectrumsKernel_CONJ<<<grid, threads, 0, stream>>>(a, b, scale, c);
-            cudaSafeCall( cudaGetLastError() );
-
-            if (stream == 0)
-                cudaSafeCall( cudaDeviceSynchronize() );
-        }
-
        //////////////////////////////////////////////////////////////////////////
        // buildWarpMaps

--- a/modules/gpu/src/cuda/safe_call.hpp
+++ b/modules/gpu/src/cuda/safe_call.hpp
@@ -45,21 +45,6 @@

 #include <cufft.h>

-#if defined(__GNUC__)
-    #define cufftSafeCall(expr)  ___cufftSafeCall(expr, __FILE__, __LINE__, __func__)
-#else /* defined(__CUDACC__) || defined(__MSVC__) */
-    #define cufftSafeCall(expr)  ___cufftSafeCall(expr, __FILE__, __LINE__)
-#endif

-namespace cv { namespace gpu
-{
-    void cufftError(int err, const char *file, const int line, const char *func = "");
-}}
-
-static inline void ___cufftSafeCall(cufftResult_t err, const char *file, const int line, const char *func = "")
-{
-    if (CUFFT_SUCCESS != err)
-        cv::gpu::cufftError(err, file, line, func);
-}

 #endif /* __OPENCV_CUDA_SAFE_CALL_HPP__ */
--- a/modules/gpu/src/error.cpp
+++ b/modules/gpu/src/error.cpp
@@ -43,65 +43,3 @@

 using namespace cv;
 using namespace cv::gpu;
-
-#ifdef HAVE_CUDA
-
-namespace
-{
-    #define error_entry(entry)  { entry, #entry }
-
-    struct ErrorEntry
-    {
-        int code;
-        const char* str;
-    };
-
-    struct ErrorEntryComparer
-    {
-        int code;
-        ErrorEntryComparer(int code_) : code(code_) {}
-        bool operator()(const ErrorEntry& e) const { return e.code == code; }
-    };
-
-    String getErrorString(int code, const ErrorEntry* errors, size_t n)
-    {
-        size_t idx = std::find_if(errors, errors + n, ErrorEntryComparer(code)) - errors;
-
-        const char* msg = (idx != n) ? errors[idx].str : "Unknown error code";
-        String str = cv::format("%s [Code = %d]", msg, code);
-
-        return str;
-    }
-
-    //////////////////////////////////////////////////////////////////////////
-    // CUFFT errors
-
-    const ErrorEntry cufft_errors[] =
-    {
-        error_entry( CUFFT_INVALID_PLAN ),
-        error_entry( CUFFT_ALLOC_FAILED ),
-        error_entry( CUFFT_INVALID_TYPE ),
-        error_entry( CUFFT_INVALID_VALUE ),
-        error_entry( CUFFT_INTERNAL_ERROR ),
-        error_entry( CUFFT_EXEC_FAILED ),
-        error_entry( CUFFT_SETUP_FAILED ),
-        error_entry( CUFFT_INVALID_SIZE ),
-        error_entry( CUFFT_UNALIGNED_DATA )
-    };
-
-    const int cufft_error_num = sizeof(cufft_errors) / sizeof(cufft_errors[0]);
-}
-
-namespace cv
-{
-    namespace gpu
-    {
-        void cufftError(int code, const char* file, const int line, const char* func)
-        {
-            String msg = getErrorString(code, cufft_errors, cufft_error_num);
-            cv::error(cv::Error::GpuApiCallError, msg, func, file, line);
-        }
-    }
-}
-
-#endif
--- a/modules/gpu/src/imgproc.cpp
+++ b/modules/gpu/src/imgproc.cpp
@@ -73,12 +73,6 @@ void cv::gpu::cornerHarris(const GpuMat&, GpuMat&, GpuMat&, GpuMat&, GpuMat&, in
 void cv::gpu::cornerMinEigenVal(const GpuMat&, GpuMat&, int, int, int) { throw_no_cuda(); }
 void cv::gpu::cornerMinEigenVal(const GpuMat&, GpuMat&, GpuMat&, GpuMat&, int, int, int) { throw_no_cuda(); }
 void cv::gpu::cornerMinEigenVal(const GpuMat&, GpuMat&, GpuMat&, GpuMat&, GpuMat&, int, int, int, Stream&) { throw_no_cuda(); }
-void cv::gpu::mulSpectrums(const GpuMat&, const GpuMat&, GpuMat&, int, bool, Stream&) { throw_no_cuda(); }
-void cv::gpu::mulAndScaleSpectrums(const GpuMat&, const GpuMat&, GpuMat&, int, float, bool, Stream&) { throw_no_cuda(); }
-void cv::gpu::dft(const GpuMat&, GpuMat&, Size, int, Stream&) { throw_no_cuda(); }
-void cv::gpu::ConvolveBuf::create(Size, Size) { throw_no_cuda(); }
-void cv::gpu::convolve(const GpuMat&, const GpuMat&, GpuMat&, bool) { throw_no_cuda(); }
-void cv::gpu::convolve(const GpuMat&, const GpuMat&, GpuMat&, bool, ConvolveBuf&, Stream&) { throw_no_cuda(); }
 void cv::gpu::Canny(const GpuMat&, GpuMat&, double, double, int, bool) { throw_no_cuda(); }
 void cv::gpu::Canny(const GpuMat&, CannyBuf&, GpuMat&, double, double, int, bool) { throw_no_cuda(); }
 void cv::gpu::Canny(const GpuMat&, const GpuMat&, GpuMat&, double, double, bool) { throw_no_cuda(); }
@@ -848,299 +842,6 @@ void cv::gpu::cornerMinEigenVal(const GpuMat& src, GpuMat& dst, GpuMat& Dx, GpuM
    cornerMinEigenVal_gpu(blockSize, Dx, Dy, dst, gpuBorderType, StreamAccessor::getStream(stream));
 }

-//////////////////////////////////////////////////////////////////////////////
-// mulSpectrums
-
-namespace cv { namespace gpu { namespace cudev
-{
-    namespace imgproc
-    {
-        void mulSpectrums(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b, PtrStepSz<cufftComplex> c, cudaStream_t stream);
-
-        void mulSpectrums_CONJ(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b, PtrStepSz<cufftComplex> c, cudaStream_t stream);
-    }
-}}}
-
-void cv::gpu::mulSpectrums(const GpuMat& a, const GpuMat& b, GpuMat& c, int flags, bool conjB, Stream& stream)
-{
-    (void)flags;
-    using namespace ::cv::gpu::cudev::imgproc;
-
-    typedef void (*Caller)(const PtrStep<cufftComplex>, const PtrStep<cufftComplex>, PtrStepSz<cufftComplex>, cudaStream_t stream);
-
-    static Caller callers[] = { cudev::imgproc::mulSpectrums, cudev::imgproc::mulSpectrums_CONJ };
-
-    CV_Assert(a.type() == b.type() && a.type() == CV_32FC2);
-    CV_Assert(a.size() == b.size());
-
-    c.create(a.size(), CV_32FC2);
-
-    Caller caller = callers[(int)conjB];
-    caller(a, b, c, StreamAccessor::getStream(stream));
-}
-
-//////////////////////////////////////////////////////////////////////////////
-// mulAndScaleSpectrums
-
-namespace cv { namespace gpu { namespace cudev
-{
-    namespace imgproc
-    {
-        void mulAndScaleSpectrums(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b, float scale, PtrStepSz<cufftComplex> c, cudaStream_t stream);
-
-        void mulAndScaleSpectrums_CONJ(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b, float scale, PtrStepSz<cufftComplex> c, cudaStream_t stream);
-    }
-}}}
-
-void cv::gpu::mulAndScaleSpectrums(const GpuMat& a, const GpuMat& b, GpuMat& c, int flags, float scale, bool conjB, Stream& stream)
-{
-    (void)flags;
-    using namespace ::cv::gpu::cudev::imgproc;
-
-    typedef void (*Caller)(const PtrStep<cufftComplex>, const PtrStep<cufftComplex>, float scale, PtrStepSz<cufftComplex>, cudaStream_t stream);
-    static Caller callers[] = { cudev::imgproc::mulAndScaleSpectrums, cudev::imgproc::mulAndScaleSpectrums_CONJ };
-
-    CV_Assert(a.type() == b.type() && a.type() == CV_32FC2);
-    CV_Assert(a.size() == b.size());
-
-    c.create(a.size(), CV_32FC2);
-
-    Caller caller = callers[(int)conjB];
-    caller(a, b, scale, c, StreamAccessor::getStream(stream));
-}
-
-//////////////////////////////////////////////////////////////////////////////
-// dft
-
-void cv::gpu::dft(const GpuMat& src, GpuMat& dst, Size dft_size, int flags, Stream& stream)
-{
-#ifndef HAVE_CUFFT
-
-    OPENCV_GPU_UNUSED(src);
-    OPENCV_GPU_UNUSED(dst);
-    OPENCV_GPU_UNUSED(dft_size);
-    OPENCV_GPU_UNUSED(flags);
-    OPENCV_GPU_UNUSED(stream);
-
-    throw_no_cuda();
-
-#else
-
-    CV_Assert(src.type() == CV_32F || src.type() == CV_32FC2);
-
-    // We don't support unpacked output (in the case of real input)
-    CV_Assert(!(flags & DFT_COMPLEX_OUTPUT));
-
-    bool is_1d_input = (dft_size.height == 1) || (dft_size.width == 1);
-    int is_row_dft = flags & DFT_ROWS;
-    int is_scaled_dft = flags & DFT_SCALE;
-    int is_inverse = flags & DFT_INVERSE;
-    bool is_complex_input = src.channels() == 2;
-    bool is_complex_output = !(flags & DFT_REAL_OUTPUT);
-
-    // We don't support real-to-real transform
-    CV_Assert(is_complex_input || is_complex_output);
-
-    GpuMat src_data;
-
-    // Make sure here we work with the continuous input,
-    // as CUFFT can't handle gaps
-    src_data = src;
-    createContinuous(src.rows, src.cols, src.type(), src_data);
-    if (src_data.data != src.data)
-        src.copyTo(src_data);
-
-    Size dft_size_opt = dft_size;
-    if (is_1d_input && !is_row_dft)
-    {
-        // If the source matrix is single column handle it as single row
-        dft_size_opt.width = std::max(dft_size.width, dft_size.height);
-        dft_size_opt.height = std::min(dft_size.width, dft_size.height);
-    }
-
-    cufftType dft_type = CUFFT_R2C;
-    if (is_complex_input)
-        dft_type = is_complex_output ? CUFFT_C2C : CUFFT_C2R;
-
-    CV_Assert(dft_size_opt.width > 1);
-
-    cufftHandle plan;
-    if (is_1d_input || is_row_dft)
-        cufftPlan1d(&plan, dft_size_opt.width, dft_type, dft_size_opt.height);
-    else
-        cufftPlan2d(&plan, dft_size_opt.height, dft_size_opt.width, dft_type);
-
-    cufftSafeCall( cufftSetStream(plan, StreamAccessor::getStream(stream)) );
-
-    if (is_complex_input)
-    {
-        if (is_complex_output)
-        {
-            createContinuous(dft_size, CV_32FC2, dst);
-            cufftSafeCall(cufftExecC2C(
-                    plan, src_data.ptr<cufftComplex>(), dst.ptr<cufftComplex>(),
-                    is_inverse ? CUFFT_INVERSE : CUFFT_FORWARD));
-        }
-        else
-        {
-            createContinuous(dft_size, CV_32F, dst);
-            cufftSafeCall(cufftExecC2R(
-                    plan, src_data.ptr<cufftComplex>(), dst.ptr<cufftReal>()));
-        }
-    }
-    else
-    {
-        // We could swap dft_size for efficiency. Here we must reflect it
-        if (dft_size == dft_size_opt)
-            createContinuous(Size(dft_size.width / 2 + 1, dft_size.height), CV_32FC2, dst);
-        else
-            createContinuous(Size(dft_size.width, dft_size.height / 2 + 1), CV_32FC2, dst);
-
-        cufftSafeCall(cufftExecR2C(
-                plan, src_data.ptr<cufftReal>(), dst.ptr<cufftComplex>()));
-    }
-
-    cufftSafeCall(cufftDestroy(plan));
-
-    if (is_scaled_dft)
-        multiply(dst, Scalar::all(1. / dft_size.area()), dst, 1, -1, stream);
-
-#endif
-}
-
-//////////////////////////////////////////////////////////////////////////////
-// convolve
-
-void cv::gpu::ConvolveBuf::create(Size image_size, Size templ_size)
-{
-    result_size = Size(image_size.width - templ_size.width + 1,
-                       image_size.height - templ_size.height + 1);
-
-    block_size = user_block_size;
-    if (user_block_size.width == 0 || user_block_size.height == 0)
-        block_size = estimateBlockSize(result_size, templ_size);
-
-    dft_size.width = 1 << int(ceil(std::log(block_size.width + templ_size.width - 1.) / std::log(2.)));
-    dft_size.height = 1 << int(ceil(std::log(block_size.height + templ_size.height - 1.) / std::log(2.)));
-
-    // CUFFT has hard-coded kernels for power-of-2 sizes (up to 8192),
-    // see CUDA Toolkit 4.1 CUFFT Library Programming Guide
-    if (dft_size.width > 8192)
-        dft_size.width = getOptimalDFTSize(block_size.width + templ_size.width - 1);
-    if (dft_size.height > 8192)
-        dft_size.height = getOptimalDFTSize(block_size.height + templ_size.height - 1);
-
-    // To avoid wasting time doing small DFTs
-    dft_size.width = std::max(dft_size.width, 512);
-    dft_size.height = std::max(dft_size.height, 512);
-
-    createContinuous(dft_size, CV_32F, image_block);
-    createContinuous(dft_size, CV_32F, templ_block);
-    createContinuous(dft_size, CV_32F, result_data);
-
-    spect_len = dft_size.height * (dft_size.width / 2 + 1);
-    createContinuous(1, spect_len, CV_32FC2, image_spect);
-    createContinuous(1, spect_len, CV_32FC2, templ_spect);
-    createContinuous(1, spect_len, CV_32FC2, result_spect);
-
-    // Use maximum result matrix block size for the estimated DFT block size
-    block_size.width = std::min(dft_size.width - templ_size.width + 1, result_size.width);
-    block_size.height = std::min(dft_size.height - templ_size.height + 1, result_size.height);
-}
-
-
-Size cv::gpu::ConvolveBuf::estimateBlockSize(Size result_size, Size /*templ_size*/)
-{
-    int width = (result_size.width + 2) / 3;
-    int height = (result_size.height + 2) / 3;
-    width = std::min(width, result_size.width);
-    height = std::min(height, result_size.height);
-    return Size(width, height);
-}
-
-
-void cv::gpu::convolve(const GpuMat& image, const GpuMat& templ, GpuMat& result, bool ccorr)
-{
-    ConvolveBuf buf;
-    convolve(image, templ, result, ccorr, buf);
-}
-
-void cv::gpu::convolve(const GpuMat& image, const GpuMat& templ, GpuMat& result, bool ccorr, ConvolveBuf& buf, Stream& stream)
-{
-    using namespace ::cv::gpu::cudev::imgproc;
-
-#ifndef HAVE_CUFFT
-    throw_no_cuda();
-#else
-    CV_Assert(image.type() == CV_32F);
-    CV_Assert(templ.type() == CV_32F);
-
-    buf.create(image.size(), templ.size());
-    result.create(buf.result_size, CV_32F);
-
-    Size& block_size = buf.block_size;
-    Size& dft_size = buf.dft_size;
-
-    GpuMat& image_block = buf.image_block;
-    GpuMat& templ_block = buf.templ_block;
-    GpuMat& result_data = buf.result_data;
-
-    GpuMat& image_spect = buf.image_spect;
-    GpuMat& templ_spect = buf.templ_spect;
-    GpuMat& result_spect = buf.result_spect;
-
-    cufftHandle planR2C, planC2R;
-    cufftSafeCall(cufftPlan2d(&planC2R, dft_size.height, dft_size.width, CUFFT_C2R));
-    cufftSafeCall(cufftPlan2d(&planR2C, dft_size.height, dft_size.width, CUFFT_R2C));
-
-    cufftSafeCall( cufftSetStream(planR2C, StreamAccessor::getStream(stream)) );
-    cufftSafeCall( cufftSetStream(planC2R, StreamAccessor::getStream(stream)) );
-
-    GpuMat templ_roi(templ.size(), CV_32F, templ.data, templ.step);
-    copyMakeBorder(templ_roi, templ_block, 0, templ_block.rows - templ_roi.rows, 0,
-                   templ_block.cols - templ_roi.cols, 0, Scalar(), stream);
-
-    cufftSafeCall(cufftExecR2C(planR2C, templ_block.ptr<cufftReal>(),
-                               templ_spect.ptr<cufftComplex>()));
-
-    // Process all blocks of the result matrix
-    for (int y = 0; y < result.rows; y += block_size.height)
-    {
-        for (int x = 0; x < result.cols; x += block_size.width)
-        {
-            Size image_roi_size(std::min(x + dft_size.width, image.cols) - x,
-                                std::min(y + dft_size.height, image.rows) - y);
-            GpuMat image_roi(image_roi_size, CV_32F, (void*)(image.ptr<float>(y) + x),
-                             image.step);
-            copyMakeBorder(image_roi, image_block, 0, image_block.rows - image_roi.rows,
-                           0, image_block.cols - image_roi.cols, 0, Scalar(), stream);
-
-            cufftSafeCall(cufftExecR2C(planR2C, image_block.ptr<cufftReal>(),
-                                       image_spect.ptr<cufftComplex>()));
-            mulAndScaleSpectrums(image_spect, templ_spect, result_spect, 0,
-                                 1.f / dft_size.area(), ccorr, stream);
-            cufftSafeCall(cufftExecC2R(planC2R, result_spect.ptr<cufftComplex>(),
-                                       result_data.ptr<cufftReal>()));
-
-            Size result_roi_size(std::min(x + block_size.width, result.cols) - x,
-                                 std::min(y + block_size.height, result.rows) - y);
-            GpuMat result_roi(result_roi_size, result.type(),
-                              (void*)(result.ptr<float>(y) + x), result.step);
-            GpuMat result_block(result_roi_size, result_data.type(),
-                                result_data.ptr(), result_data.step);
-
-            if (stream)
-                stream.enqueueCopy(result_block, result_roi);
-            else
-                result_block.copyTo(result_roi);
-        }
-    }
-
-    cufftSafeCall(cufftDestroy(planR2C));
-    cufftSafeCall(cufftDestroy(planC2R));
-#endif
-}
-

 //////////////////////////////////////////////////////////////////////////////
 // Canny