From 6daf17f9740b3a6ca0f9144f07ed32e5199e24e3 Mon Sep 17 00:00:00 2001
From: "marina.kolpakova" <marina.kolpakova@itseez.com>
Date: Sun, 3 Mar 2013 13:01:17 +0400
Subject: [PATCH] remove softcascade host dependencies on gpu module

---
 modules/core/include/opencv2/core/gpumat.hpp  | 135 ++++++++++++++++++
 .../include/opencv2/core/stream_accessor.hpp  |  64 +++++++++
 modules/{gpu => core}/src/cudastream.cpp      |  27 ++++
 .../{gpu => core}/src/matrix_operations.cpp   |  30 +++-
 .../include/opencv2/gpu/stream_accessor.hpp   |   2 +-
 modules/softcascade/CMakeLists.txt            |   4 +-
 modules/softcascade/src/cuda/icf-sc.cu        |   2 +
 modules/softcascade/src/detector_cuda.cpp     |  28 ++--
 8 files changed, 274 insertions(+), 18 deletions(-)
 create mode 100644 modules/core/include/opencv2/core/stream_accessor.hpp
 rename modules/{gpu => core}/src/cudastream.cpp (92%)
 rename modules/{gpu => core}/src/matrix_operations.cpp (89%)

diff --git a/modules/core/include/opencv2/core/gpumat.hpp b/modules/core/include/opencv2/core/gpumat.hpp
index be757a935..bb436ec55 100644
--- a/modules/core/include/opencv2/core/gpumat.hpp
+++ b/modules/core/include/opencv2/core/gpumat.hpp
@@ -50,6 +50,141 @@
 
 namespace cv { namespace gpu
 {
+    //////////////////////////////// CudaMem ////////////////////////////////
+    // CudaMem is limited cv::Mat with page locked memory allocation.
+    // Page locked memory is only needed for async and faster coping to GPU.
+    // It is convertable to cv::Mat header without reference counting
+    // so you can use it with other opencv functions.
+
+    // Page-locks the matrix m memory and maps it for the device(s)
+    CV_EXPORTS void registerPageLocked(Mat& m);
+    // Unmaps the memory of matrix m, and makes it pageable again.
+    CV_EXPORTS void unregisterPageLocked(Mat& m);
+
+    class CV_EXPORTS CudaMem
+    {
+    public:
+        enum  { ALLOC_PAGE_LOCKED = 1, ALLOC_ZEROCOPY = 2, ALLOC_WRITE_COMBINED = 4 };
+
+        CudaMem();
+        CudaMem(const CudaMem& m);
+
+        CudaMem(int rows, int cols, int type, int _alloc_type = ALLOC_PAGE_LOCKED);
+        CudaMem(Size size, int type, int alloc_type = ALLOC_PAGE_LOCKED);
+
+
+        //! creates from cv::Mat with coping data
+        explicit CudaMem(const Mat& m, int alloc_type = ALLOC_PAGE_LOCKED);
+
+        ~CudaMem();
+
+        CudaMem& operator = (const CudaMem& m);
+
+        //! returns deep copy of the matrix, i.e. the data is copied
+        CudaMem clone() const;
+
+        //! allocates new matrix data unless the matrix already has specified size and type.
+        void create(int rows, int cols, int type, int alloc_type = ALLOC_PAGE_LOCKED);
+        void create(Size size, int type, int alloc_type = ALLOC_PAGE_LOCKED);
+
+        //! decrements reference counter and released memory if needed.
+        void release();
+
+        //! returns matrix header with disabled reference counting for CudaMem data.
+        Mat createMatHeader() const;
+        operator Mat() const;
+
+        //! maps host memory into device address space and returns GpuMat header for it. Throws exception if not supported by hardware.
+        GpuMat createGpuMatHeader() const;
+        operator GpuMat() const;
+
+        //returns if host memory can be mapperd to gpu address space;
+        static bool canMapHostMemory();
+
+        // Please see cv::Mat for descriptions
+        bool isContinuous() const;
+        size_t elemSize() const;
+        size_t elemSize1() const;
+        int type() const;
+        int depth() const;
+        int channels() const;
+        size_t step1() const;
+        Size size() const;
+        bool empty() const;
+
+
+        // Please see cv::Mat for descriptions
+        int flags;
+        int rows, cols;
+        size_t step;
+
+        uchar* data;
+        int* refcount;
+
+        uchar* datastart;
+        uchar* dataend;
+
+        int alloc_type;
+    };
+
+
+    //////////////////////////////// CudaStream ////////////////////////////////
+    // Encapculates Cuda Stream. Provides interface for async coping.
+    // Passed to each function that supports async kernel execution.
+    // Reference counting is enabled
+
+    class CV_EXPORTS Stream
+    {
+    public:
+        Stream();
+        ~Stream();
+
+        Stream(const Stream&);
+        Stream& operator =(const Stream&);
+
+        bool queryIfComplete();
+        void waitForCompletion();
+
+        //! downloads asynchronously
+        // Warning! cv::Mat must point to page locked memory (i.e. to CudaMem data or to its subMat)
+        void enqueueDownload(const GpuMat& src, CudaMem& dst);
+        void enqueueDownload(const GpuMat& src, Mat& dst);
+
+        //! uploads asynchronously
+        // Warning! cv::Mat must point to page locked memory (i.e. to CudaMem data or to its ROI)
+        void enqueueUpload(const CudaMem& src, GpuMat& dst);
+        void enqueueUpload(const Mat& src, GpuMat& dst);
+
+        //! copy asynchronously
+        void enqueueCopy(const GpuMat& src, GpuMat& dst);
+
+        //! memory set asynchronously
+        void enqueueMemSet(GpuMat& src, Scalar val);
+        void enqueueMemSet(GpuMat& src, Scalar val, const GpuMat& mask);
+
+        //! converts matrix type, ex from float to uchar depending on type
+        void enqueueConvert(const GpuMat& src, GpuMat& dst, int dtype, double a = 1, double b = 0);
+
+        //! adds a callback to be called on the host after all currently enqueued items in the stream have completed
+        typedef void (*StreamCallback)(Stream& stream, int status, void* userData);
+        void enqueueHostCallback(StreamCallback callback, void* userData);
+
+        static Stream& Null();
+
+        operator bool() const;
+
+    private:
+        struct Impl;
+
+        explicit Stream(Impl* impl);
+        void create();
+        void release();
+
+        Impl *impl;
+
+        friend struct StreamAccessor;
+    };
+
     //////////////////////////////// Initialization & Info ////////////////////////
 
     //! This is the only function that do not throw exceptions if the library is compiled without Cuda.
diff --git a/modules/core/include/opencv2/core/stream_accessor.hpp b/modules/core/include/opencv2/core/stream_accessor.hpp
new file mode 100644
index 000000000..6a1a0bddd
--- /dev/null
+++ b/modules/core/include/opencv2/core/stream_accessor.hpp
@@ -0,0 +1,64 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other GpuMaterials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef __OPENCV_GPU_STREAM_ACCESSOR_HPP__
+#define __OPENCV_GPU_STREAM_ACCESSOR_HPP__
+
+#include "opencv2/gpu/gpu.hpp"
+#include "cuda_runtime_api.h"
+
+namespace cv
+{
+    namespace gpu
+    {
+        // This is only header file that depends on Cuda. All other headers are independent.
+        // So if you use OpenCV binaries you do noot need to install Cuda Toolkit.
+        // But of you wanna use GPU by yourself, may get cuda stream instance using the class below.
+        // In this case you have to install Cuda Toolkit.
+        struct StreamAccessor
+        {
+            CV_EXPORTS static cudaStream_t getStream(const Stream& stream);
+        };
+    }
+}
+
+#endif /* __OPENCV_GPU_STREAM_ACCESSOR_HPP__ */
\ No newline at end of file
diff --git a/modules/gpu/src/cudastream.cpp b/modules/core/src/cudastream.cpp
similarity index 92%
rename from modules/gpu/src/cudastream.cpp
rename to modules/core/src/cudastream.cpp
index e302fd1fc..a10807cf2 100644
--- a/modules/gpu/src/cudastream.cpp
+++ b/modules/core/src/cudastream.cpp
@@ -72,6 +72,33 @@ void cv::gpu::Stream::release() { throw_nogpu(); }
 
 #include "opencv2/gpu/stream_accessor.hpp"
 
+namespace
+{
+#if defined(__GNUC__)
+    #define cudaSafeCall(expr)  ___cudaSafeCall(expr, __FILE__, __LINE__, __func__)
+    #define nppSafeCall(expr)  ___nppSafeCall(expr, __FILE__, __LINE__, __func__)
+#else /* defined(__CUDACC__) || defined(__MSVC__) */
+    #define cudaSafeCall(expr)  ___cudaSafeCall(expr, __FILE__, __LINE__)
+    #define nppSafeCall(expr)  ___nppSafeCall(expr, __FILE__, __LINE__)
+#endif
+
+    inline void ___cudaSafeCall(cudaError_t err, const char *file, const int line, const char *func = "")
+    {
+        if (cudaSuccess != err)
+            cv::gpu::error(cudaGetErrorString(err), file, line, func);
+    }
+
+    inline void ___nppSafeCall(int err, const char *file, const int line, const char *func = "")
+    {
+        if (err < 0)
+        {
+            std::ostringstream msg;
+            msg << "NPP API Call Error: " << err;
+            cv::gpu::error(msg.str().c_str(), file, line, func);
+        }
+    }
+}
+
 namespace cv { namespace gpu
 {
     void copyWithMask(const GpuMat& src, GpuMat& dst, const GpuMat& mask, cudaStream_t stream);
diff --git a/modules/gpu/src/matrix_operations.cpp b/modules/core/src/matrix_operations.cpp
similarity index 89%
rename from modules/gpu/src/matrix_operations.cpp
rename to modules/core/src/matrix_operations.cpp
index b19524842..3b82df572 100644
--- a/modules/gpu/src/matrix_operations.cpp
+++ b/modules/core/src/matrix_operations.cpp
@@ -41,6 +41,7 @@
 //M*/
 
 #include "precomp.hpp"
+#include "opencv2/core/gpumat.hpp"
 
 using namespace cv;
 using namespace cv::gpu;
@@ -178,7 +179,7 @@ bool cv::gpu::CudaMem::empty() const
     return data == 0;
 }
 
-#if !defined (HAVE_CUDA) || defined (CUDA_DISABLER)
+#if !defined (HAVE_CUDA)
 
 void cv::gpu::registerPageLocked(Mat&) { throw_nogpu(); }
 void cv::gpu::unregisterPageLocked(Mat&) { throw_nogpu(); }
@@ -188,6 +189,33 @@ void cv::gpu::CudaMem::release() { throw_nogpu(); }
 GpuMat cv::gpu::CudaMem::createGpuMatHeader () const { throw_nogpu(); return GpuMat(); }
 
 #else /* !defined (HAVE_CUDA) */
+#include <cuda_runtime_api.h>
+namespace
+{
+#if defined(__GNUC__)
+    #define cudaSafeCall(expr)  ___cudaSafeCall(expr, __FILE__, __LINE__, __func__)
+    #define nppSafeCall(expr)  ___nppSafeCall(expr, __FILE__, __LINE__, __func__)
+#else /* defined(__CUDACC__) || defined(__MSVC__) */
+    #define cudaSafeCall(expr)  ___cudaSafeCall(expr, __FILE__, __LINE__)
+    #define nppSafeCall(expr)  ___nppSafeCall(expr, __FILE__, __LINE__)
+#endif
+
+    inline void ___cudaSafeCall(cudaError_t err, const char *file, const int line, const char *func = "")
+    {
+        if (cudaSuccess != err)
+            cv::gpu::error(cudaGetErrorString(err), file, line, func);
+    }
+
+    inline void ___nppSafeCall(int err, const char *file, const int line, const char *func = "")
+    {
+        if (err < 0)
+        {
+            std::ostringstream msg;
+            msg << "NPP API Call Error: " << err;
+            cv::gpu::error(msg.str().c_str(), file, line, func);
+        }
+    }
+}
 
 void cv::gpu::registerPageLocked(Mat& m)
 {
diff --git a/modules/gpu/include/opencv2/gpu/stream_accessor.hpp b/modules/gpu/include/opencv2/gpu/stream_accessor.hpp
index abafc613d..1797749b6 100644
--- a/modules/gpu/include/opencv2/gpu/stream_accessor.hpp
+++ b/modules/gpu/include/opencv2/gpu/stream_accessor.hpp
@@ -56,7 +56,7 @@ namespace cv
         // In this case you have to install Cuda Toolkit.
         struct StreamAccessor
         {
-            CV_EXPORTS static cudaStream_t getStream(const Stream& stream);
+            CV_EXPORTS static cudaStream_t getStream(const cv::gpu::Stream& stream);
         };
     }
 }
diff --git a/modules/softcascade/CMakeLists.txt b/modules/softcascade/CMakeLists.txt
index f19241a4a..ee6e89299 100644
--- a/modules/softcascade/CMakeLists.txt
+++ b/modules/softcascade/CMakeLists.txt
@@ -9,7 +9,7 @@ macro(ocv_glob_cuda_powered_module_sources)
   set(lib_device_hdrs "")
 
   if (HAVE_CUDA AND lib_device_srcs)
-  ocv_include_directories(${CUDA_INCLUDE_DIRS})
+  ocv_include_directories(${CUDA_INCLUDE_DIRS} "${OpenCV_SOURCE_DIR}/modules/gpu/include")
   file(GLOB_RECURSE lib_device_hdrs "src/cuda/*.hpp")
 
   ocv_cuda_compile(device_objs ${lib_device_srcs})
@@ -35,7 +35,7 @@ set(cuda_include "")
 if (NAVE_CUDA)
   set(cuda_deps ${CUDA_LIBRARIES} ${CUDA_npp_LIBRARY})
 endif()
-ocv_add_module(softcascade opencv_core opencv_imgproc opencv_ml OPTIONAL opencv_gpu ${cuda_deps})
+ocv_add_module(softcascade opencv_core opencv_imgproc opencv_ml OPTIONAL ${cuda_deps})
 
 if(HAVE_CUDA)
   ocv_module_include_directories(${CUDA_INCLUDE_DIRS})
diff --git a/modules/softcascade/src/cuda/icf-sc.cu b/modules/softcascade/src/cuda/icf-sc.cu
index d339ef0d3..19b20db9c 100644
--- a/modules/softcascade/src/cuda/icf-sc.cu
+++ b/modules/softcascade/src/cuda/icf-sc.cu
@@ -49,6 +49,8 @@
 
 namespace cv { namespace softcascade { namespace device {
 
+typedef unsigned char uchar;
+
     template <int FACTOR>
     __device__ __forceinline__ uchar shrink(const uchar* ptr, const int pitch, const int y, const int x)
     {
diff --git a/modules/softcascade/src/detector_cuda.cpp b/modules/softcascade/src/detector_cuda.cpp
index 6c920332f..a013a16fe 100644
--- a/modules/softcascade/src/detector_cuda.cpp
+++ b/modules/softcascade/src/detector_cuda.cpp
@@ -480,8 +480,8 @@ void cv::softcascade::SCascade::detect(InputArray _image, InputArray _rois, Outp
 
     flds.mask.create( rois.cols / shr, rois.rows / shr, rois.type());
 
-    cv::gpu::resize(rois, flds.genRoiTmp, cv::Size(), 1.f / shr, 1.f / shr, CV_INTER_AREA, s);
-    cv::gpu::transpose(flds.genRoiTmp, flds.mask, s);
+    //cv::gpu::resize(rois, flds.genRoiTmp, cv::Size(), 1.f / shr, 1.f / shr, CV_INTER_AREA, s);
+    //cv::gpu::transpose(flds.genRoiTmp, flds.mask, s);
 
     if (type == CV_8UC3)
     {
@@ -491,7 +491,7 @@ void cv::softcascade::SCascade::detect(InputArray _image, InputArray _rois, Outp
             flds.createLevels(image.rows, image.cols);
 
         flds.preprocessor->apply(image, flds.shrunk);
-        cv::gpu::integralBuffered(flds.shrunk, flds.hogluv, flds.integralBuffer, s);
+        //cv::gpu::integralBuffered(flds.shrunk, flds.hogluv, flds.integralBuffer, s);
     }
     else
     {
@@ -546,12 +546,12 @@ struct GenricPreprocessor : public cv::softcascade::ChannelsProcessor
         channels.create(frame.rows * (4 + bins), frame.cols, CV_8UC1);
         setZero(channels, s);
 
-        cv::gpu::cvtColor(frame, gray, CV_BGR2GRAY, s);
+        //cv::gpu::cvtColor(frame, gray, CV_BGR2GRAY, s);
         createHogBins(s);
 
         createLuvBins(frame, s);
 
-        cv::gpu::resize(channels, shrunk, cv::Size(), 1.f / shrinkage, 1.f / shrinkage, CV_INTER_AREA, s);
+        //cv::gpu::resize(channels, shrunk, cv::Size(), 1.f / shrinkage, 1.f / shrinkage, CV_INTER_AREA, s);
     }
 
 private:
@@ -566,20 +566,20 @@ private:
         cv::gpu::GpuMat dfdx(fplane, cv::Rect(0,  0, fw, fh));
         cv::gpu::GpuMat dfdy(fplane, cv::Rect(0, fh, fw, fh));
 
-        cv::gpu::Sobel(gray, dfdx, CV_32F, 1, 0, sobelBuf, 3, 1, cv::BORDER_DEFAULT, -1, s);
-        cv::gpu::Sobel(gray, dfdy, CV_32F, 0, 1, sobelBuf, 3, 1, cv::BORDER_DEFAULT, -1, s);
+        //cv::gpu::Sobel(gray, dfdx, CV_32F, 1, 0, sobelBuf, 3, 1, cv::BORDER_DEFAULT, -1, s);
+        //cv::gpu::Sobel(gray, dfdy, CV_32F, 0, 1, sobelBuf, 3, 1, cv::BORDER_DEFAULT, -1, s);
 
         cv::gpu::GpuMat mag(fplane, cv::Rect(0, 2 * fh, fw, fh));
         cv::gpu::GpuMat ang(fplane, cv::Rect(0, 3 * fh, fw, fh));
 
-        cv::gpu::cartToPolar(dfdx, dfdy, mag, ang, true, s);
+        //cv::gpu::cartToPolar(dfdx, dfdy, mag, ang, true, s);
 
         // normalize magnitude to uchar interval and angles to 6 bins
         cv::gpu::GpuMat nmag(fplane, cv::Rect(0, 4 * fh, fw, fh));
         cv::gpu::GpuMat nang(fplane, cv::Rect(0, 5 * fh, fw, fh));
 
-        cv::gpu::multiply(mag, cv::Scalar::all(1.f / (8 *::log(2.0f))), nmag, 1, -1, s);
-        cv::gpu::multiply(ang, cv::Scalar::all(1.f / 60.f),     nang, 1, -1, s);
+        //cv::gpu::multiply(mag, cv::Scalar::all(1.f / (8 *::log(2.0f))), nmag, 1, -1, s);
+        //cv::gpu::multiply(ang, cv::Scalar::all(1.f / 60.f),     nang, 1, -1, s);
 
         //create uchar magnitude
         cv::gpu::GpuMat cmag(channels, cv::Rect(0, fh * HOG_BINS, fw, fh));
@@ -597,7 +597,7 @@ private:
         static const int fw = colored.cols;
         static const int fh = colored.rows;
 
-        cv::gpu::cvtColor(colored, luv, CV_BGR2Luv, s);
+        //cv::gpu::cvtColor(colored, luv, CV_BGR2Luv, s);
 
         std::vector<cv::gpu::GpuMat> splited;
         for(int i = 0; i < LUV_BINS; ++i)
@@ -605,7 +605,7 @@ private:
             splited.push_back(cv::gpu::GpuMat(channels, cv::Rect(0, fh * (7 + i), fw, fh)));
         }
 
-        cv::gpu::split(luv, splited, s);
+        //cv::gpu::split(luv, splited, s);
     }
 
     enum {HOG_BINS = 6, LUV_BINS = 3};
@@ -631,7 +631,7 @@ struct SeparablePreprocessor : public cv::softcascade::ChannelsProcessor
     virtual void apply(InputArray _frame, OutputArray _shrunk, cv::gpu::Stream& s = cv::gpu::Stream::Null())
     {
         const cv::gpu::GpuMat frame = _frame.getGpuMat();
-        cv::gpu::GaussianBlur(frame, bgr, cv::Size(3, 3), -1.0);
+        //cv::gpu::GaussianBlur(frame, bgr, cv::Size(3, 3), -1.0);
 
         _shrunk.create(frame.rows * (4 + bins) / shrinkage, frame.cols / shrinkage, CV_8UC1);
         cv::gpu::GpuMat shrunk = _shrunk.getGpuMat();
@@ -639,7 +639,7 @@ struct SeparablePreprocessor : public cv::softcascade::ChannelsProcessor
         channels.create(frame.rows * (4 + bins), frame.cols, CV_8UC1);
         setZero(channels, s);
 
-        cv::gpu::cvtColor(bgr, gray, CV_BGR2GRAY);
+        //cv::gpu::cvtColor(bgr, gray, CV_BGR2GRAY);
         cv::softcascade::device::gray2hog(gray, channels(cv::Rect(0, 0, bgr.cols, bgr.rows * (bins + 1))), bins);
 
         cv::gpu::GpuMat luv(channels, cv::Rect(0, bgr.rows * (bins + 1), bgr.cols, bgr.rows * 3));