added registerPageLocked/unregisterPageLocked functions

added convert functions to BruteForceMatcher_GPU other minor fixes
2011-08-10 11:32:48 +00:00
parent 24279c2c11
commit 8bb9e4302e
18 changed files with 276 additions and 228 deletions
--- a/modules/gpu/src/cuda/brute_force_matcher.cu
+++ b/modules/gpu/src/cuda/brute_force_matcher.cu
@@ -56,9 +56,8 @@ namespace cv { namespace gpu { namespace bfmatcher
    ///////////////////////////////////////////////////////////////////////////////
    // Mask strategy

-    class SingleMask
+    struct SingleMask
    {
-    public:
        explicit SingleMask(const PtrStep& mask_) : mask(mask_) {}
        
        __device__ __forceinline__ bool operator()(int queryIdx, int trainIdx) const
@@ -66,13 +65,11 @@ namespace cv { namespace gpu { namespace bfmatcher
            return mask.ptr(queryIdx)[trainIdx] != 0;
        }

-    private:
-        PtrStep mask;
+        const PtrStep mask;
    };

-    class MaskCollection
+    struct MaskCollection
    {
-    public:
        explicit MaskCollection(PtrStep* maskCollection_) : maskCollection(maskCollection_) {}

        __device__ __forceinline__ void nextMask()
@@ -86,15 +83,14 @@ namespace cv { namespace gpu { namespace bfmatcher
            return curMask.data == 0 || (ForceGlob<uchar>::Load(curMask.ptr(queryIdx), trainIdx, val), (val != 0));
        }

-    private:
-        PtrStep* maskCollection;
+        const PtrStep* maskCollection;
        PtrStep curMask;
    };

    class WithOutMask
    {
    public:
-        __device__ __forceinline__ void nextMask()
+        __device__ __forceinline__ void nextMask() const
        {
        }
        __device__ __forceinline__ bool operator()(int queryIdx, int trainIdx) const
@@ -128,9 +124,8 @@ namespace cv { namespace gpu { namespace bfmatcher
    ///////////////////////////////////////////////////////////////////////////////
    // Distance

-    template <typename T> class L1Dist
+    template <typename T> struct L1Dist
    {
-    public:
        typedef int ResultType;
        typedef int ValueType;

@@ -151,12 +146,10 @@ namespace cv { namespace gpu { namespace bfmatcher
            return mySum;
        }

-    private:
        int mySum;
    };
-    template <> class L1Dist<float>
+    template <> struct L1Dist<float>
    {
-    public:
        typedef float ResultType;
        typedef float ValueType;

@@ -177,13 +170,11 @@ namespace cv { namespace gpu { namespace bfmatcher
            return mySum;
        }

-    private:
        float mySum;
    };

-    class L2Dist
+    struct L2Dist
    {
-    public:
        typedef float ResultType;
        typedef float ValueType;

@@ -205,13 +196,11 @@ namespace cv { namespace gpu { namespace bfmatcher
            return sqrtf(mySum);
        }

-    private:
        float mySum;
    };

-    class HammingDist
+    struct HammingDist
    {
-    public:
        typedef int ResultType;
        typedef int ValueType;

@@ -232,7 +221,6 @@ namespace cv { namespace gpu { namespace bfmatcher
            return mySum;
        }

-    private:
        int mySum;
    };
    
@@ -425,10 +413,8 @@ namespace cv { namespace gpu { namespace bfmatcher
    ///////////////////////////////////////////////////////////////////////////////
    // ReduceDescCalculator

-    template <int BLOCK_DIM_X, typename T>
-    class ReduceDescCalculatorSimple
+    template <int BLOCK_DIM_X, typename T> struct ReduceDescCalculatorSimple
    {
-    public:
        __device__ __forceinline__ void prepare(const T* queryDescs_, int, void*)
        {
            queryDescs = queryDescs_;
@@ -440,14 +426,12 @@ namespace cv { namespace gpu { namespace bfmatcher
            reduceDescDiff<BLOCK_DIM_X>(queryDescs, trainDescs, desc_len, dist, sdiff_row);
        }

-    private:
        const T* queryDescs;
    };

    template <int BLOCK_DIM_X, int MAX_DESCRIPTORS_LEN, bool DESC_LEN_EQ_MAX_LEN, typename T, typename U>
-    class ReduceDescCalculatorCached
+    struct ReduceDescCalculatorCached
    {
-    public:
        __device__ __forceinline__ void prepare(const T* queryDescs, int desc_len, U* smem)
        {
            loadDescsVals<BLOCK_DIM_X, MAX_DESCRIPTORS_LEN>(queryDescs, desc_len, queryVals, smem);
@@ -459,7 +443,6 @@ namespace cv { namespace gpu { namespace bfmatcher
            reduceDescDiffCached<BLOCK_DIM_X, MAX_DESCRIPTORS_LEN, DESC_LEN_EQ_MAX_LEN>(queryVals, trainDescs, desc_len, dist, sdiff_row);
        }

-    private:
        U queryVals[MAX_DESCRIPTORS_LEN / BLOCK_DIM_X];
    };
    
@@ -497,10 +480,8 @@ namespace cv { namespace gpu { namespace bfmatcher
    ///////////////////////////////////////////////////////////////////////////////
    // Train collection loop strategy

-    template <typename T>
-    class SingleTrain
+    template <typename T> struct SingleTrain
    {
-    public:
        explicit SingleTrain(const DevMem2D_<T>& trainDescs_) : trainDescs(trainDescs_)
        {
        }
@@ -517,14 +498,11 @@ namespace cv { namespace gpu { namespace bfmatcher
            return trainDescs.cols;
        }

-    private:
-        DevMem2D_<T> trainDescs;
+        const DevMem2D_<T> trainDescs;
    };

-    template <typename T>
-    class TrainCollection
+    template <typename T> struct TrainCollection
    {
-    public:
        TrainCollection(const DevMem2D_<T>* trainCollection_, int nImg_, int desclen_) : 
            trainCollection(trainCollection_), nImg(nImg_), desclen(desclen_)
        {
@@ -536,7 +514,7 @@ namespace cv { namespace gpu { namespace bfmatcher
        {
            for (int imgIdx = 0; imgIdx < nImg; ++imgIdx)
            {
-                DevMem2D_<T> trainDescs = trainCollection[imgIdx];
+                const DevMem2D_<T> trainDescs = trainCollection[imgIdx];
                m.nextMask();
                matchDescs<Dist>(queryIdx, imgIdx, trainDescs, m, reduceDescCalc, myMin, myBestTrainIdx, myBestImgIdx, sdiff_row);
            }
@@ -547,7 +525,6 @@ namespace cv { namespace gpu { namespace bfmatcher
            return desclen;
        }

-    private:
        const DevMem2D_<T>* trainCollection;
        int nImg;
        int desclen;
@@ -806,7 +783,7 @@ namespace cv { namespace gpu { namespace bfmatcher
    // Calc distance kernel

    template <int BLOCK_DIM_X, int BLOCK_DIM_Y, typename Dist, typename T, typename Mask>
-    __global__ void calcDistance(PtrStep_<T> queryDescs_, DevMem2D_<T> trainDescs_, Mask mask, PtrStepf distance)
+    __global__ void calcDistance(const PtrStep_<T> queryDescs_, const DevMem2D_<T> trainDescs_, const Mask mask, PtrStepf distance)
    {
        __shared__ typename Dist::ResultType sdiff[BLOCK_DIM_X * BLOCK_DIM_Y];

@@ -989,8 +966,7 @@ namespace cv { namespace gpu { namespace bfmatcher
    ///////////////////////////////////////////////////////////////////////////////
    // find knn match kernel

-    template <int BLOCK_SIZE>
-    __global__ void findBestMatch(DevMem2Df allDist_, int i, PtrStepi trainIdx_, PtrStepf distance_)
+    template <int BLOCK_SIZE> __global__ void findBestMatch(DevMem2Df allDist_, int i, PtrStepi trainIdx_, PtrStepf distance_)
    {
        const int SMEM_SIZE = BLOCK_SIZE > 64 ? BLOCK_SIZE : 64;
        __shared__ float sdist[SMEM_SIZE];
@@ -1130,8 +1106,8 @@ namespace cv { namespace gpu { namespace bfmatcher
    // Radius Match kernel

    template <int BLOCK_DIM_X, int BLOCK_DIM_Y, typename Dist, typename T, typename Mask>
-    __global__ void radiusMatch(PtrStep_<T> queryDescs_, DevMem2D_<T> trainDescs_, 
-        float maxDistance, Mask mask, DevMem2Di trainIdx_, unsigned int* nMatches, PtrStepf distance)
+    __global__ void radiusMatch(const PtrStep_<T> queryDescs_, const DevMem2D_<T> trainDescs_, 
+        float maxDistance, const Mask mask, DevMem2Di trainIdx_, unsigned int* nMatches, PtrStepf distance)
    {
        #if defined (__CUDA_ARCH__) && __CUDA_ARCH__ >= 110

--- a/modules/gpu/src/cuda/calib3d.cu
+++ b/modules/gpu/src/cuda/calib3d.cu
@@ -42,6 +42,7 @@

 #include "internal_shared.hpp"
 #include "opencv2/gpu/device/transform.hpp"
+#include "opencv2/gpu/device/functional.hpp"

 #define SOLVE_PNP_RANSAC_MAX_NUM_ITERS 200

@@ -56,9 +57,9 @@ namespace cv { namespace gpu
        __constant__ float3 crot2;
        __constant__ float3 ctransl;

-        struct TransformOp
+        struct TransformOp : unary_function<float3, float3>
        {
-            __device__ __forceinline__ float3 operator()(float3 p) const
+            __device__ __forceinline__ float3 operator()(const float3& p) const
            {
                return make_float3(
                        crot0.x * p.x + crot0.y * p.y + crot0.z * p.z + ctransl.x,
@@ -89,9 +90,9 @@ namespace cv { namespace gpu
        __constant__ float3 cproj0;
        __constant__ float3 cproj1;

-        struct ProjectOp
+        struct ProjectOp : unary_function<float3, float3>
        {
-            __device__ __forceinline__ float2 operator()(float3 p) const
+            __device__ __forceinline__ float2 operator()(const float3& p) const
            {
                // Rotate and translate in 3D
                float3 t = make_float3(
--- a/modules/gpu/src/cuda/canny.cu
+++ b/modules/gpu/src/cuda/canny.cu
@@ -49,7 +49,7 @@ using namespace cv::gpu::device;

 namespace cv { namespace gpu { namespace canny
 {
-    __global__ void calcSobelRowPass(PtrStep src, PtrStepi dx_buf, PtrStepi dy_buf, int rows, int cols)
+    __global__ void calcSobelRowPass(const PtrStep src, PtrStepi dx_buf, PtrStepi dy_buf, int rows, int cols)
    {
        __shared__ int smem[16][18];

@@ -100,7 +100,8 @@ namespace cv { namespace gpu { namespace canny
        }
    };

-    template <typename Norm> __global__ void calcMagnitude(PtrStepi dx_buf, PtrStepi dy_buf, PtrStepi dx, PtrStepi dy, PtrStepf mag, int rows, int cols)
+    template <typename Norm> __global__ void calcMagnitude(const PtrStepi dx_buf, const PtrStepi dy_buf, 
+        PtrStepi dx, PtrStepi dy, PtrStepf mag, int rows, int cols)
    {
        __shared__ int sdx[18][16];
        __shared__ int sdy[18][16];
@@ -179,7 +180,7 @@ namespace cv { namespace gpu { namespace canny
 #define CANNY_SHIFT 15
 #define TG22        (int)(0.4142135623730950488016887242097*(1<<CANNY_SHIFT) + 0.5)

-    __global__ void calcMap(PtrStepi dx, PtrStepi dy, PtrStepf mag, PtrStepi map, int rows, int cols, float low_thresh, float high_thresh)
+    __global__ void calcMap(const PtrStepi dx, const PtrStepi dy, const PtrStepf mag, PtrStepi map, int rows, int cols, float low_thresh, float high_thresh)
    {
        __shared__ float smem[18][18];

--- a/modules/gpu/src/cuda/element_operations.cu
+++ b/modules/gpu/src/cuda/element_operations.cu
@@ -56,10 +56,9 @@ namespace cv { namespace gpu { namespace mathfunc
    //////////////////////////////////////////////////////////////////////////////////////
    // Compare

-    template <typename T1, typename T2>
-    struct NotEqual
+    template <typename T1, typename T2> struct NotEqual : binary_function<T1, T2, uchar>
    {
-        __device__ __forceinline__ uchar operator()(const T1& src1, const T2& src2)
+        __device__ __forceinline__ uchar operator()(const T1& src1, const T2& src2) const
        {
            return static_cast<uchar>(static_cast<int>(src1 != src2) * 255);
        }
@@ -467,8 +466,7 @@ namespace cv { namespace gpu { namespace mathfunc
    //////////////////////////////////////////////////////////////////////////
    // pow
    
-    template<typename T, bool Signed = device::numeric_limits<T>::is_signed>
-    struct PowOp
+    template<typename T, bool Signed = device::numeric_limits<T>::is_signed> struct PowOp : unary_function<T, T>
    {    
        float power;
        PowOp(float power_) : power(power_) {}
@@ -479,13 +477,12 @@ namespace cv { namespace gpu { namespace mathfunc
        }      
    };

-    template<typename T>
-    struct PowOp<T, true>
+    template<typename T> struct PowOp<T, true> : unary_function<T, T>
    {
        float power;
        PowOp(float power_) : power(power_) {}

-        __device__ __forceinline__ float operator()(const T& e)
+        __device__ __forceinline__ float operator()(const T& e) const
        {
            T res = saturate_cast<T>(__powf((float)e, power));            
            
@@ -495,13 +492,12 @@ namespace cv { namespace gpu { namespace mathfunc
        }
    };

-    template<>
-    struct PowOp<float>
+    template<> struct PowOp<float> : unary_function<float, float>
    {
        float power;
        PowOp(float power_) : power(power_) {}

-        __device__ __forceinline__ float operator()(const float& e)
+        __device__ __forceinline__ float operator()(const float& e) const
        {
            return __powf(fabs(e), power);
        }
--- a/modules/gpu/src/cuda/hist.cu
+++ b/modules/gpu/src/cuda/hist.cu
@@ -105,7 +105,7 @@ namespace cv { namespace gpu { namespace histograms
        if (x + 3 < cols) addByte(s_WarpHist, (data >> 24) & 0xFFU, tag);
    }

-    __global__ void histogram256(PtrStep_<uint> d_Data, uint* d_PartialHistograms, uint dataCount, uint cols)
+    __global__ void histogram256(const PtrStep_<uint> d_Data, uint* d_PartialHistograms, uint dataCount, uint cols)
    {
        //Per-warp subhistogram storage
        __shared__ uint s_Hist[HISTOGRAM256_THREADBLOCK_MEMORY];
@@ -189,21 +189,18 @@ namespace cv { namespace gpu { namespace histograms
            cudaSafeCall( cudaDeviceSynchronize() );
    }

-    __global__ void equalizeHist(DevMem2D src, PtrStep dst, const int* lut)
+    __constant__ int c_lut[256];
+
+    __global__ void equalizeHist(const DevMem2D src, PtrStep dst)
    {
-        __shared__ int s_lut[256];
-
-        const int tid = threadIdx.y * blockDim.x + threadIdx.x;
-
-        s_lut[tid] = lut[tid];
-        __syncthreads();
-
        const int x = blockIdx.x * blockDim.x + threadIdx.x;
        const int y = blockIdx.y * blockDim.y + threadIdx.y;

        if (x < src.cols && y < src.rows)
        {
-            dst.ptr(y)[x] = __float2int_rn(255.0f * s_lut[src.ptr(y)[x]] / (src.cols * src.rows));
+            const uchar val = src.ptr(y)[x];
+            const int lut = c_lut[val];
+            dst.ptr(y)[x] = __float2int_rn(255.0f / (src.cols * src.rows) * lut);
        }
    }

@@ -212,7 +209,9 @@ namespace cv { namespace gpu { namespace histograms
        dim3 block(16, 16);
        dim3 grid(divUp(src.cols, block.x), divUp(src.rows, block.y));

-        equalizeHist<<<grid, block, 0, stream>>>(src, dst, lut);
+        cudaSafeCall( cudaMemcpyToSymbol(cv::gpu::histograms::c_lut, lut, 256 * sizeof(int), 0, cudaMemcpyDeviceToDevice) );
+
+        equalizeHist<<<grid, block, 0, stream>>>(src, dst);
        cudaSafeCall( cudaGetLastError() );

        if (stream == 0)
--- a/modules/gpu/src/cuda/imgproc.cu
+++ b/modules/gpu/src/cuda/imgproc.cu
@@ -49,7 +49,7 @@ using namespace cv::gpu::device;
 /////////////////////////////////// Remap ///////////////////////////////////////////////
 namespace cv { namespace gpu { namespace imgproc
 {
-    texture<unsigned char, 2, cudaReadModeNormalizedFloat> tex_remap;
+    texture<unsigned char, 2, cudaReadModeNormalizedFloat> tex_remap(0, cudaFilterModeLinear, cudaAddressModeWrap);

    __global__ void remap_1c(const float* mapx, const float* mapy, size_t map_step, uchar* out, size_t out_step, int width, int height)
    {    
@@ -131,16 +131,12 @@ namespace cv { namespace gpu { namespace imgproc
        grid.x = divUp(dst.cols, threads.x);
        grid.y = divUp(dst.rows, threads.y);

-        tex_remap.filterMode = cudaFilterModeLinear;
-        tex_remap.addressMode[0] = tex_remap.addressMode[1] = cudaAddressModeWrap;
-        cudaChannelFormatDesc desc = cudaCreateChannelDesc<unsigned char>();
-        cudaSafeCall( cudaBindTexture2D(0, tex_remap, src.data, desc, src.cols, src.rows, src.step) );
+        TextureBinder tex_remap(&tex_remap, src);

        remap_1c<<<grid, threads>>>(xmap.data, ymap.data, xmap.step, dst.data, dst.step, dst.cols, dst.rows);
        cudaSafeCall( cudaGetLastError() );

        cudaSafeCall( cudaDeviceSynchronize() );
-        cudaSafeCall( cudaUnbindTexture(tex_remap) );
    }
    
    void remap_gpu_3c(const DevMem2D& src, const DevMem2Df& xmap, const DevMem2Df& ymap, DevMem2D dst)
@@ -151,8 +147,8 @@ namespace cv { namespace gpu { namespace imgproc
        grid.y = divUp(dst.rows, threads.y);

        remap_3c<<<grid, threads>>>(src.data, src.step, xmap.data, ymap.data, xmap.step, dst.data, dst.step, dst.cols, dst.rows);
-
        cudaSafeCall( cudaGetLastError() );
+
        cudaSafeCall( cudaDeviceSynchronize() );
    }

--- a/modules/gpu/src/cuda/internal_shared.hpp
+++ b/modules/gpu/src/cuda/internal_shared.hpp
@@ -77,7 +77,6 @@ namespace cv
        // Returns true if the GPU analogue exists, false otherwise.
        bool tryConvertToGpuBorderType(int cpuBorderType, int& gpuBorderType);

-
        static inline int divUp(int total, int grain) { return (total + grain - 1) / grain; }

        template<class T> static inline void uploadConstant(const char* name, const T& value) 
@@ -117,6 +116,49 @@ namespace cv
            cudaSafeCall( cudaUnbindTexture(tex) );
        }

+        class TextureBinder
+        {
+        public:
+            TextureBinder() : tex_(0) {}
+            template <typename T> TextureBinder(const textureReference* tex, const DevMem2D_<T>& img) : tex_(0)
+            {
+                bind(tex, img);
+            }
+            template <typename T> TextureBinder(const char* tex_name, const DevMem2D_<T>& img) : tex_(0)
+            {
+                bind(tex_name, img);
+            }
+            ~TextureBinder() { unbind(); }
+
+            template <typename T> void bind(const textureReference* tex, const DevMem2D_<T>& img)
+            {
+                unbind();
+
+                cudaChannelFormatDesc desc = cudaCreateChannelDesc<T>();
+                cudaSafeCall( cudaBindTexture2D(0, tex, img.ptr(), &desc, img.cols, img.rows, img.step) );
+
+                tex_ = tex;
+            }
+            template <typename T> void bind(const char* tex_name, const DevMem2D_<T>& img)
+            {
+                const textureReference* tex; 
+                cudaSafeCall( cudaGetTextureReference(&tex, tex_name) ); 
+                bind(tex, img);
+            }
+
+            void unbind()
+            {
+                if (tex_)
+                {
+                    cudaUnbindTexture(tex_);
+                    tex_ = 0;
+                }
+            }
+
+        private:
+            const textureReference* tex_;
+        };
+
        class NppStreamHandler
        {
        public:
--- a/modules/gpu/src/cuda/matrix_operations.cu
+++ b/modules/gpu/src/cuda/matrix_operations.cu
@@ -43,6 +43,7 @@
 #include "internal_shared.hpp"
 #include "opencv2/gpu/device/saturate_cast.hpp"
 #include "opencv2/gpu/device/transform.hpp"
+#include "opencv2/gpu/device/functional.hpp"

 using namespace cv::gpu::device;

@@ -62,7 +63,7 @@ namespace cv { namespace gpu { namespace matrix_operations {
 ///////////////////////////////////////////////////////////////////////////

    template<typename T>
-    __global__ void copy_to_with_mask(T * mat_src, T * mat_dst, const unsigned char * mask, int cols, int rows, size_t step_mat, size_t step_mask, int channels)
+    __global__ void copy_to_with_mask(const T* mat_src, T* mat_dst, const uchar* mask, int cols, int rows, size_t step_mat, size_t step_mask, int channels)
    {
        size_t x = blockIdx.x * blockDim.x + threadIdx.x;
        size_t y = blockIdx.y * blockDim.y + threadIdx.y;
@@ -162,7 +163,7 @@ namespace cv { namespace gpu { namespace matrix_operations {
    }

    template<typename T>
-    __global__ void set_to_without_mask(T * mat, int cols, int rows, size_t step, int channels)
+    __global__ void set_to_without_mask(T* mat, int cols, int rows, size_t step, int channels)
    {
        size_t x = blockIdx.x * blockDim.x + threadIdx.x;
        size_t y = blockIdx.y * blockDim.y + threadIdx.y;
@@ -175,7 +176,7 @@ namespace cv { namespace gpu { namespace matrix_operations {
    }

    template<typename T>
-    __global__ void set_to_with_mask(T * mat, const unsigned char * mask, int cols, int rows, size_t step, int channels, size_t step_mask)
+    __global__ void set_to_with_mask(T* mat, const uchar* mask, int cols, int rows, size_t step, int channels, size_t step_mask)
    {
        size_t x = blockIdx.x * blockDim.x + threadIdx.x;
        size_t y = blockIdx.y * blockDim.y + threadIdx.y;
@@ -237,19 +238,16 @@ namespace cv { namespace gpu { namespace matrix_operations {
 //////////////////////////////// ConvertTo ////////////////////////////////
 ///////////////////////////////////////////////////////////////////////////

-    template <typename T, typename D>
-    class Convertor
+    template <typename T, typename D> struct Convertor : unary_function<T, D>
    {
-    public:
        Convertor(double alpha_, double beta_) : alpha(alpha_), beta(beta_) {}

-        __device__ __forceinline__ D operator()(const T& src)
+        __device__ __forceinline__ D operator()(const T& src) const
        {
            return saturate_cast<D>(alpha * src + beta);
        }

-    private:
-        double alpha, beta;
+        const double alpha, beta;
    };
    
    template<typename T, typename D>
--- a/modules/gpu/src/cuda/surf.cu
+++ b/modules/gpu/src/cuda/surf.cu
@@ -225,7 +225,7 @@ namespace cv { namespace gpu { namespace surf
    };

    template <typename Mask>
-    __global__ void icvFindMaximaInLayer(PtrStepf det, PtrStepf trace, int4* maxPosBuffer, unsigned int* maxCounter)
+    __global__ void icvFindMaximaInLayer(const PtrStepf det, const PtrStepf trace, int4* maxPosBuffer, unsigned int* maxCounter)
    {
        #if defined (__CUDA_ARCH__) && __CUDA_ARCH__ >= 110

@@ -346,7 +346,7 @@ namespace cv { namespace gpu { namespace surf
    ////////////////////////////////////////////////////////////////////////
    // INTERPOLATION
    
-    __global__ void icvInterpolateKeypoint(PtrStepf det, const int4* maxPosBuffer,
+    __global__ void icvInterpolateKeypoint(const PtrStepf det, const int4* maxPosBuffer,
        float* featureX, float* featureY, int* featureLaplacian, float* featureSize, float* featureHessian,
        unsigned int* featureCounter)
    {