remove softcascade host dependencies on gpu module

marina.kolpakova 2013-03-03 13:01:17 +04:00
parent 5120322cea
commit 6daf17f974
8 changed files with 274 additions and 18 deletions

View File

@ -50,6 +50,141 @@
namespace cv { namespace gpu
{
//////////////////////////////// CudaMem ////////////////////////////////
// CudaMem is a limited cv::Mat with page-locked memory allocation.
// Page-locked memory is only needed for async and faster copying to the GPU.
// It is convertible to a cv::Mat header without reference counting,
// so you can use it with other OpenCV functions.
// Page-locks the matrix m memory and maps it for the device(s)
CV_EXPORTS void registerPageLocked(Mat& m);
// Unmaps the memory of matrix m, and makes it pageable again.
CV_EXPORTS void unregisterPageLocked(Mat& m);
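A hedged illustration of how these two functions are used (the buffer name and size below are made up, not from the diff):

// Illustrative only: pin an existing cv::Mat allocation for fast/async transfers.
cv::Mat frame(480, 640, CV_8UC3);
cv::gpu::registerPageLocked(frame);    // page-lock the existing host allocation
// ... enqueue async uploads that read from 'frame' ...
cv::gpu::unregisterPageLocked(frame);  // make the memory pageable again before it is destroyed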
class CV_EXPORTS CudaMem
{
public:
enum { ALLOC_PAGE_LOCKED = 1, ALLOC_ZEROCOPY = 2, ALLOC_WRITE_COMBINED = 4 };
CudaMem();
CudaMem(const CudaMem& m);
CudaMem(int rows, int cols, int type, int _alloc_type = ALLOC_PAGE_LOCKED);
CudaMem(Size size, int type, int alloc_type = ALLOC_PAGE_LOCKED);
//! creates from cv::Mat, copying the data
explicit CudaMem(const Mat& m, int alloc_type = ALLOC_PAGE_LOCKED);
~CudaMem();
CudaMem& operator = (const CudaMem& m);
//! returns deep copy of the matrix, i.e. the data is copied
CudaMem clone() const;
//! allocates new matrix data unless the matrix already has specified size and type.
void create(int rows, int cols, int type, int alloc_type = ALLOC_PAGE_LOCKED);
void create(Size size, int type, int alloc_type = ALLOC_PAGE_LOCKED);
//! decrements reference counter and releases memory if needed.
void release();
//! returns matrix header with disabled reference counting for CudaMem data.
Mat createMatHeader() const;
operator Mat() const;
//! maps host memory into device address space and returns GpuMat header for it. Throws exception if not supported by hardware.
GpuMat createGpuMatHeader() const;
operator GpuMat() const;
// returns true if host memory can be mapped to the gpu address space
static bool canMapHostMemory();
// Please see cv::Mat for descriptions
bool isContinuous() const;
size_t elemSize() const;
size_t elemSize1() const;
int type() const;
int depth() const;
int channels() const;
size_t step1() const;
Size size() const;
bool empty() const;
// Please see cv::Mat for descriptions
int flags;
int rows, cols;
size_t step;
uchar* data;
int* refcount;
uchar* datastart;
uchar* dataend;
int alloc_type;
};
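To make the host-side role of CudaMem concrete, a minimal usage sketch (illustrative names and sizes, not part of this commit) could look like:

// Allocate page-locked host memory, view it through a cv::Mat header, upload it to the device.
cv::gpu::CudaMem h_buf(480, 640, CV_8UC1, cv::gpu::CudaMem::ALLOC_PAGE_LOCKED);
cv::Mat header = h_buf.createMatHeader();  // no copy, reference counting disabled
header.setTo(cv::Scalar::all(0));          // fill through the ordinary cv::Mat API
cv::gpu::GpuMat d_img;
d_img.upload(header);                      // the page-locked source keeps the transfer fast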
//////////////////////////////// CudaStream ////////////////////////////////
// Encapsulates a Cuda Stream. Provides an interface for async copying.
// Passed to each function that supports async kernel execution.
// Reference counting is enabled
class CV_EXPORTS Stream
{
public:
Stream();
~Stream();
Stream(const Stream&);
Stream& operator =(const Stream&);
bool queryIfComplete();
void waitForCompletion();
//! downloads asynchronously
// Warning! cv::Mat must point to page locked memory (i.e. to CudaMem data or to its subMat)
void enqueueDownload(const GpuMat& src, CudaMem& dst);
void enqueueDownload(const GpuMat& src, Mat& dst);
//! uploads asynchronously
// Warning! cv::Mat must point to page locked memory (i.e. to CudaMem data or to its ROI)
void enqueueUpload(const CudaMem& src, GpuMat& dst);
void enqueueUpload(const Mat& src, GpuMat& dst);
//! copy asynchronously
void enqueueCopy(const GpuMat& src, GpuMat& dst);
//! memory set asynchronously
void enqueueMemSet(GpuMat& src, Scalar val);
void enqueueMemSet(GpuMat& src, Scalar val, const GpuMat& mask);
//! converts matrix type, e.g. from float to uchar, depending on dtype
void enqueueConvert(const GpuMat& src, GpuMat& dst, int dtype, double a = 1, double b = 0);
//! adds a callback to be called on the host after all currently enqueued items in the stream have completed
typedef void (*StreamCallback)(Stream& stream, int status, void* userData);
void enqueueHostCallback(StreamCallback callback, void* userData);
static Stream& Null();
operator bool() const;
private:
struct Impl;
explicit Stream(Impl* impl);
void create();
void release();
Impl *impl;
friend struct StreamAccessor;
};
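A sketch of the intended asynchronous pipeline, assuming the enqueue* calls allocate their destinations as in the 2.4 API (buffer names are illustrative):

cv::gpu::Stream stream;
cv::gpu::CudaMem h_src(1080, 1920, CV_8UC1);      // page-locked host input
cv::gpu::CudaMem h_dst;                           // filled by the download below
cv::gpu::GpuMat  d_img;
stream.enqueueUpload(h_src, d_img);               // async host-to-device copy
stream.enqueueMemSet(d_img, cv::Scalar::all(0));  // async device-side fill
stream.enqueueDownload(d_img, h_dst);             // async device-to-host copy
stream.waitForCompletion();                       // host blocks until the stream drains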
//////////////////////////////// Initialization & Info ////////////////////////
//! This is the only function that does not throw exceptions if the library is compiled without Cuda.

View File

@ -0,0 +1,64 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#ifndef __OPENCV_GPU_STREAM_ACCESSOR_HPP__
#define __OPENCV_GPU_STREAM_ACCESSOR_HPP__
#include "opencv2/gpu/gpu.hpp"
#include "cuda_runtime_api.h"
namespace cv
{
namespace gpu
{
// This is the only header file that depends on Cuda. All other headers are independent.
// So if you use OpenCV binaries you do not need to install the Cuda Toolkit.
// But if you want to use the GPU yourself, you may get a cuda stream instance using the class below.
// In this case you have to install the Cuda Toolkit.
struct StreamAccessor
{
CV_EXPORTS static cudaStream_t getStream(const Stream& stream);
};
}
}
#endif /* __OPENCV_GPU_STREAM_ACCESSOR_HPP__ */
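As the comment above notes, StreamAccessor is the one CUDA-dependent entry point; a hedged sketch of mixing an OpenCV Stream with a raw runtime call (the helper name below is invented for illustration) might be:

#include "opencv2/gpu/stream_accessor.hpp"

// Hypothetical helper: issue a raw CUDA runtime call on the same queue OpenCV uses.
// Assumes 'm' owns a contiguous allocation; error checking omitted for brevity.
void zeroAsyncOnOpencvStream(cv::gpu::GpuMat& m, cv::gpu::Stream& s)
{
    cudaStream_t raw = cv::gpu::StreamAccessor::getStream(s);
    cudaMemsetAsync(m.data, 0, m.step * m.rows, raw);  // ordered with OpenCV's async ops on 's'
}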

View File

@ -72,6 +72,33 @@ void cv::gpu::Stream::release() { throw_nogpu(); }
#include "opencv2/gpu/stream_accessor.hpp"
namespace
{
#if defined(__GNUC__)
#define cudaSafeCall(expr) ___cudaSafeCall(expr, __FILE__, __LINE__, __func__)
#define nppSafeCall(expr) ___nppSafeCall(expr, __FILE__, __LINE__, __func__)
#else /* defined(__CUDACC__) || defined(__MSVC__) */
#define cudaSafeCall(expr) ___cudaSafeCall(expr, __FILE__, __LINE__)
#define nppSafeCall(expr) ___nppSafeCall(expr, __FILE__, __LINE__)
#endif
inline void ___cudaSafeCall(cudaError_t err, const char *file, const int line, const char *func = "")
{
if (cudaSuccess != err)
cv::gpu::error(cudaGetErrorString(err), file, line, func);
}
inline void ___nppSafeCall(int err, const char *file, const int line, const char *func = "")
{
if (err < 0)
{
std::ostringstream msg;
msg << "NPP API Call Error: " << err;
cv::gpu::error(msg.str().c_str(), file, line, func);
}
}
}
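These locally duplicated helpers are what let the file drop the gpu module's internal error utilities; their use is simply wrapping each runtime status (illustrative call):

// Illustrative: every CUDA runtime status is routed through cv::gpu::error
// with file/line context (and the function name when built with GCC).
void syncChecked(cudaStream_t stream)
{
    cudaSafeCall( cudaStreamSynchronize(stream) );  // throws on any non-cudaSuccess status
}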
namespace cv { namespace gpu
{
void copyWithMask(const GpuMat& src, GpuMat& dst, const GpuMat& mask, cudaStream_t stream);

View File

@ -41,6 +41,7 @@
//M*/
#include "precomp.hpp"
#include "opencv2/core/gpumat.hpp"
using namespace cv;
using namespace cv::gpu;
@ -178,7 +179,7 @@ bool cv::gpu::CudaMem::empty() const
return data == 0;
}
#if !defined (HAVE_CUDA) || defined (CUDA_DISABLER)
#if !defined (HAVE_CUDA)
void cv::gpu::registerPageLocked(Mat&) { throw_nogpu(); }
void cv::gpu::unregisterPageLocked(Mat&) { throw_nogpu(); }
@ -188,6 +189,33 @@ void cv::gpu::CudaMem::release() { throw_nogpu(); }
GpuMat cv::gpu::CudaMem::createGpuMatHeader () const { throw_nogpu(); return GpuMat(); }
#else /* !defined (HAVE_CUDA) */
#include <cuda_runtime_api.h>
namespace
{
#if defined(__GNUC__)
#define cudaSafeCall(expr) ___cudaSafeCall(expr, __FILE__, __LINE__, __func__)
#define nppSafeCall(expr) ___nppSafeCall(expr, __FILE__, __LINE__, __func__)
#else /* defined(__CUDACC__) || defined(__MSVC__) */
#define cudaSafeCall(expr) ___cudaSafeCall(expr, __FILE__, __LINE__)
#define nppSafeCall(expr) ___nppSafeCall(expr, __FILE__, __LINE__)
#endif
inline void ___cudaSafeCall(cudaError_t err, const char *file, const int line, const char *func = "")
{
if (cudaSuccess != err)
cv::gpu::error(cudaGetErrorString(err), file, line, func);
}
inline void ___nppSafeCall(int err, const char *file, const int line, const char *func = "")
{
if (err < 0)
{
std::ostringstream msg;
msg << "NPP API Call Error: " << err;
cv::gpu::error(msg.str().c_str(), file, line, func);
}
}
}
void cv::gpu::registerPageLocked(Mat& m)
{

View File

@ -56,7 +56,7 @@ namespace cv
// In this case you have to install Cuda Toolkit.
struct StreamAccessor
{
CV_EXPORTS static cudaStream_t getStream(const Stream& stream);
CV_EXPORTS static cudaStream_t getStream(const cv::gpu::Stream& stream);
};
}
}

View File

@ -9,7 +9,7 @@ macro(ocv_glob_cuda_powered_module_sources)
set(lib_device_hdrs "")
if (HAVE_CUDA AND lib_device_srcs)
ocv_include_directories(${CUDA_INCLUDE_DIRS})
ocv_include_directories(${CUDA_INCLUDE_DIRS} "${OpenCV_SOURCE_DIR}/modules/gpu/include")
file(GLOB_RECURSE lib_device_hdrs "src/cuda/*.hpp")
ocv_cuda_compile(device_objs ${lib_device_srcs})
@ -35,7 +35,7 @@ set(cuda_include "")
if (HAVE_CUDA)
set(cuda_deps ${CUDA_LIBRARIES} ${CUDA_npp_LIBRARY})
endif()
ocv_add_module(softcascade opencv_core opencv_imgproc opencv_ml OPTIONAL opencv_gpu ${cuda_deps})
ocv_add_module(softcascade opencv_core opencv_imgproc opencv_ml OPTIONAL ${cuda_deps})
if(HAVE_CUDA)
ocv_module_include_directories(${CUDA_INCLUDE_DIRS})

View File

@ -49,6 +49,8 @@
namespace cv { namespace softcascade { namespace device {
typedef unsigned char uchar;
template <int FACTOR>
__device__ __forceinline__ uchar shrink(const uchar* ptr, const int pitch, const int y, const int x)
{

View File

@ -480,8 +480,8 @@ void cv::softcascade::SCascade::detect(InputArray _image, InputArray _rois, Outp
flds.mask.create( rois.cols / shr, rois.rows / shr, rois.type());
cv::gpu::resize(rois, flds.genRoiTmp, cv::Size(), 1.f / shr, 1.f / shr, CV_INTER_AREA, s);
cv::gpu::transpose(flds.genRoiTmp, flds.mask, s);
//cv::gpu::resize(rois, flds.genRoiTmp, cv::Size(), 1.f / shr, 1.f / shr, CV_INTER_AREA, s);
//cv::gpu::transpose(flds.genRoiTmp, flds.mask, s);
if (type == CV_8UC3)
{
@ -491,7 +491,7 @@ void cv::softcascade::SCascade::detect(InputArray _image, InputArray _rois, Outp
flds.createLevels(image.rows, image.cols);
flds.preprocessor->apply(image, flds.shrunk);
cv::gpu::integralBuffered(flds.shrunk, flds.hogluv, flds.integralBuffer, s);
//cv::gpu::integralBuffered(flds.shrunk, flds.hogluv, flds.integralBuffer, s);
}
else
{
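For orientation, the disabled calls above can be approximated on the host roughly as follows (a CPU sketch with standard cv:: functions; names and the function itself are illustrative, not from the commit):

// CPU approximation of the disabled steps: shrink the ROI mask, transpose it,
// and build the integral image of the shrunk channels (needs opencv2/imgproc).
void cpuEquivalent(const cv::Mat& rois, const cv::Mat& shrunk, int shr,
                   cv::Mat& mask, cv::Mat& hogluv)
{
    cv::Mat tmp;
    cv::resize(rois, tmp, cv::Size(), 1.0 / shr, 1.0 / shr, cv::INTER_AREA);
    cv::transpose(tmp, mask);
    cv::integral(shrunk, hogluv, CV_32S);
}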
@ -546,12 +546,12 @@ struct GenricPreprocessor : public cv::softcascade::ChannelsProcessor
channels.create(frame.rows * (4 + bins), frame.cols, CV_8UC1);
setZero(channels, s);
cv::gpu::cvtColor(frame, gray, CV_BGR2GRAY, s);
//cv::gpu::cvtColor(frame, gray, CV_BGR2GRAY, s);
createHogBins(s);
createLuvBins(frame, s);
cv::gpu::resize(channels, shrunk, cv::Size(), 1.f / shrinkage, 1.f / shrinkage, CV_INTER_AREA, s);
//cv::gpu::resize(channels, shrunk, cv::Size(), 1.f / shrinkage, 1.f / shrinkage, CV_INTER_AREA, s);
}
private:
@ -566,20 +566,20 @@ private:
cv::gpu::GpuMat dfdx(fplane, cv::Rect(0, 0, fw, fh));
cv::gpu::GpuMat dfdy(fplane, cv::Rect(0, fh, fw, fh));
cv::gpu::Sobel(gray, dfdx, CV_32F, 1, 0, sobelBuf, 3, 1, cv::BORDER_DEFAULT, -1, s);
cv::gpu::Sobel(gray, dfdy, CV_32F, 0, 1, sobelBuf, 3, 1, cv::BORDER_DEFAULT, -1, s);
//cv::gpu::Sobel(gray, dfdx, CV_32F, 1, 0, sobelBuf, 3, 1, cv::BORDER_DEFAULT, -1, s);
//cv::gpu::Sobel(gray, dfdy, CV_32F, 0, 1, sobelBuf, 3, 1, cv::BORDER_DEFAULT, -1, s);
cv::gpu::GpuMat mag(fplane, cv::Rect(0, 2 * fh, fw, fh));
cv::gpu::GpuMat ang(fplane, cv::Rect(0, 3 * fh, fw, fh));
cv::gpu::cartToPolar(dfdx, dfdy, mag, ang, true, s);
//cv::gpu::cartToPolar(dfdx, dfdy, mag, ang, true, s);
// normalize magnitude to uchar interval and angles to 6 bins
cv::gpu::GpuMat nmag(fplane, cv::Rect(0, 4 * fh, fw, fh));
cv::gpu::GpuMat nang(fplane, cv::Rect(0, 5 * fh, fw, fh));
cv::gpu::multiply(mag, cv::Scalar::all(1.f / (8 *::log(2.0f))), nmag, 1, -1, s);
cv::gpu::multiply(ang, cv::Scalar::all(1.f / 60.f), nang, 1, -1, s);
//cv::gpu::multiply(mag, cv::Scalar::all(1.f / (8 *::log(2.0f))), nmag, 1, -1, s);
//cv::gpu::multiply(ang, cv::Scalar::all(1.f / 60.f), nang, 1, -1, s);
//create uchar magnitude
cv::gpu::GpuMat cmag(channels, cv::Rect(0, fh * HOG_BINS, fw, fh));
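The commented-out sequence above is the HOG channel math (derivatives, magnitude and angle, normalization into 6 bins); a host-side sketch of the same computation, using standard CPU functions and illustrative names, might be:

// CPU sketch of the gradient-channel computation the disabled gpu calls performed
// (needs opencv2/imgproc and <cmath>; 'gray' is a CV_8UC1 frame).
void hogChannelsCpu(const cv::Mat& gray, cv::Mat& nmag, cv::Mat& nang)
{
    cv::Mat dfdx, dfdy, mag, ang;
    cv::Sobel(gray, dfdx, CV_32F, 1, 0, 3);
    cv::Sobel(gray, dfdy, CV_32F, 0, 1, 3);
    cv::cartToPolar(dfdx, dfdy, mag, ang, true);              // angle in degrees, 0..360
    mag.convertTo(nmag, CV_32F, 1.0 / (8 * std::log(2.0)));   // normalize magnitude to uchar range
    ang.convertTo(nang, CV_32F, 1.0 / 60.0);                  // 360 / 60 -> 6 orientation bins
}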
@ -597,7 +597,7 @@ private:
static const int fw = colored.cols;
static const int fh = colored.rows;
cv::gpu::cvtColor(colored, luv, CV_BGR2Luv, s);
//cv::gpu::cvtColor(colored, luv, CV_BGR2Luv, s);
std::vector<cv::gpu::GpuMat> splited;
for(int i = 0; i < LUV_BINS; ++i)
@ -605,7 +605,7 @@ private:
splited.push_back(cv::gpu::GpuMat(channels, cv::Rect(0, fh * (7 + i), fw, fh)));
}
cv::gpu::split(luv, splited, s);
//cv::gpu::split(luv, splited, s);
}
enum {HOG_BINS = 6, LUV_BINS = 3};
@ -631,7 +631,7 @@ struct SeparablePreprocessor : public cv::softcascade::ChannelsProcessor
virtual void apply(InputArray _frame, OutputArray _shrunk, cv::gpu::Stream& s = cv::gpu::Stream::Null())
{
const cv::gpu::GpuMat frame = _frame.getGpuMat();
cv::gpu::GaussianBlur(frame, bgr, cv::Size(3, 3), -1.0);
//cv::gpu::GaussianBlur(frame, bgr, cv::Size(3, 3), -1.0);
_shrunk.create(frame.rows * (4 + bins) / shrinkage, frame.cols / shrinkage, CV_8UC1);
cv::gpu::GpuMat shrunk = _shrunk.getGpuMat();
@ -639,7 +639,7 @@ struct SeparablePreprocessor : public cv::softcascade::ChannelsProcessor
channels.create(frame.rows * (4 + bins), frame.cols, CV_8UC1);
setZero(channels, s);
cv::gpu::cvtColor(bgr, gray, CV_BGR2GRAY);
//cv::gpu::cvtColor(bgr, gray, CV_BGR2GRAY);
cv::softcascade::device::gray2hog(gray, channels(cv::Rect(0, 0, bgr.cols, bgr.rows * (bins + 1))), bins);
cv::gpu::GpuMat luv(channels, cv::Rect(0, bgr.rows * (bins + 1), bgr.cols, bgr.rows * 3));