Added GPU transpose and integral based on NPP Staging.

Added mask support to SURF_GPU.

Author: Vladislav Vinogradov
Date:   2010-12-21 14:02:09 +00:00
parent 457c6a8dfe    commit 0cd587ee34
7 changed files with 147 additions and 118 deletions
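
For context, a rough usage sketch of the transpose and integral entry points this commit adds to the gpu module. It is not part of the diff, and the element types shown (CV_32SC1 for transpose, CV_8UC1 for integral) are assumptions:

    #include <opencv2/gpu/gpu.hpp>

    int main()
    {
        // Upload small test matrices; the depths chosen here are assumptions.
        cv::gpu::GpuMat src32s(cv::Mat(480, 640, CV_32SC1, cv::Scalar(1)));
        cv::gpu::GpuMat src8u (cv::Mat(480, 640, CV_8UC1,  cv::Scalar(1)));
        cv::gpu::GpuMat dst, sum;

        cv::gpu::transpose(src32s, dst); // backed by NPP Staging after this commit
        cv::gpu::integral(src8u, sum);   // integral image, also via NPP Staging

        return 0;
    }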


@@ -214,44 +214,6 @@ namespace cv { namespace gpu { namespace mathfunc
         callers[mag.data == 0](mag, angle, x, y, angleInDegrees, stream);
     }
 
-    //////////////////////////////////////////////////////////////////////////////////////////////////////////
-    // transpose
-
-    __global__ void transpose(const DevMem2Di src, PtrStepi dst)
-    {
-        __shared__ int s_mem[16 * 17];
-
-        int x = blockIdx.x * blockDim.x + threadIdx.x;
-        int y = blockIdx.y * blockDim.y + threadIdx.y;
-        int smem_idx = threadIdx.y * blockDim.x + threadIdx.x + threadIdx.y;
-
-        if (y < src.rows && x < src.cols)
-        {
-            s_mem[smem_idx] = src.ptr(y)[x];
-        }
-        __syncthreads();
-
-        smem_idx = threadIdx.x * blockDim.x + threadIdx.y + threadIdx.x;
-
-        x = blockIdx.y * blockDim.x + threadIdx.x;
-        y = blockIdx.x * blockDim.y + threadIdx.y;
-
-        if (y < src.cols && x < src.rows)
-        {
-            dst.ptr(y)[x] = s_mem[smem_idx];
-        }
-    }
-
-    void transpose_gpu(const DevMem2Di& src, const DevMem2Di& dst)
-    {
-        dim3 threads(16, 16, 1);
-        dim3 grid(divUp(src.cols, 16), divUp(src.rows, 16), 1);
-
-        transpose<<<grid, threads>>>(src, dst);
-        cudaSafeCall( cudaThreadSynchronize() );
-    }
 }}}
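
For reference, the kernel removed above stages a 16x16 tile in shared memory padded to 16 * 17 ints: the extra column keeps the column-wise reads after the transpose spread across different shared-memory banks, and swapping the block indices on the way out writes the tile to its transposed position. A minimal standalone sketch of the same pattern, with illustrative names not taken from the OpenCV sources:

    #define TILE_DIM 16

    // src is rows x cols, dst is cols x rows; pitches are in elements.
    __global__ void transposeTile(const int* src, int src_pitch, int* dst, int dst_pitch,
                                  int rows, int cols)
    {
        // One padding column so a warp reading a tile column hits 16 distinct banks.
        __shared__ int tile[TILE_DIM][TILE_DIM + 1];

        int x = blockIdx.x * TILE_DIM + threadIdx.x;
        int y = blockIdx.y * TILE_DIM + threadIdx.y;

        if (y < rows && x < cols)
            tile[threadIdx.y][threadIdx.x] = src[y * src_pitch + x];

        __syncthreads();

        // Swap the block indices: the tile lands at its transposed location in dst.
        x = blockIdx.y * TILE_DIM + threadIdx.x;
        y = blockIdx.x * TILE_DIM + threadIdx.y;

        if (y < cols && x < rows)
            dst[y * dst_pitch + x] = tile[threadIdx.x][threadIdx.y];
    }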


@@ -259,7 +259,36 @@ namespace cv { namespace gpu { namespace surf
     ////////////////////////////////////////////////////////////////////////
     // NONMAX
 
+    texture<int, 2, cudaReadModeElementType> maskSumTex(0, cudaFilterModePoint, cudaAddressModeClamp);
+
+    struct WithOutMask
+    {
+        static __device__ bool check(float, float, float)
+        {
+            return true;
+        }
+    };
+
+    struct WithMask
+    {
+        static __device__ bool check(float x, float y, float fscale)
+        {
+            float half_width = fscale / 2;
+
+            float result = 0.f;
+
+            result += tex2D(maskSumTex, x - half_width, y - half_width);
+            result -= tex2D(maskSumTex, x + half_width, y - half_width);
+            result -= tex2D(maskSumTex, x - half_width, y + half_width);
+            result += tex2D(maskSumTex, x + half_width, y + half_width);
+
+            result /= (fscale * fscale);
+            return (result >= 0.5f);
+        }
+    };
+
+    template <typename Mask>
     __global__ void nonmaxonly(PtrStepf hessianBuffer, int4* maxPosBuffer, unsigned int* maxCounter)
     {
 #if defined (__CUDA_ARCH__) && __CUDA_ARCH__ >= 110
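
The four tex2D fetches in WithMask::check are the usual integral-image (summed-area table) box sum: with maskSumTex bound to the integral of the 0/1 mask, TL - TR - BL + BR counts the mask pixels inside the fscale x fscale window around the candidate, and dividing by fscale * fscale turns that count into a coverage ratio that must reach 0.5. A rough CPU analogue, assuming maskSum holds the CV_32SC1 output of cv::integral on the mask; the names sumAt and maskAccepts are illustrative only:

    #include <opencv2/core/core.hpp>
    #include <algorithm>

    // Clamped lookup, mirroring cudaAddressModeClamp on the texture.
    static int sumAt(const cv::Mat& integralImg, int y, int x)
    {
        x = std::max(0, std::min(x, integralImg.cols - 1));
        y = std::max(0, std::min(y, integralImg.rows - 1));
        return integralImg.at<int>(y, x);
    }

    // Accept the candidate only if at least half of the fscale x fscale
    // window around (x, y) is covered by the mask.
    static bool maskAccepts(const cv::Mat& maskSum, float x, float y, float fscale)
    {
        float half_width = fscale / 2;

        float covered = 0.f;
        covered += sumAt(maskSum, int(y - half_width), int(x - half_width));
        covered -= sumAt(maskSum, int(y - half_width), int(x + half_width));
        covered -= sumAt(maskSum, int(y + half_width), int(x - half_width));
        covered += sumAt(maskSum, int(y + half_width), int(x + half_width));

        return covered / (fscale * fscale) >= 0.5f;
    }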
@@ -287,7 +316,12 @@ namespace cv { namespace gpu { namespace surf
             float val = fh_vals[localLin];
 
-            if (inBounds2 && val >= c_threshold)
+            // Compute the lookup location of the mask center
+            float x = hidx_x * c_step + c_border;
+            float y = hidx_y * c_step + c_border;
+            float fscale = calcScale(hidx_z);
+
+            if (inBounds2 && val >= c_threshold && Mask::check(x, y, fscale))
             {
                 // Check to see if we have a max (in its 26 neighbours)
                 int zoff = blockDim.x * blockDim.y;
@@ -337,7 +371,7 @@ namespace cv { namespace gpu { namespace surf
     }
 
     void nonmaxonly_gpu(PtrStepf hessianBuffer, int4* maxPosBuffer, unsigned int& maxCounter,
-        int nIntervals, int x_size, int y_size)
+        int nIntervals, int x_size, int y_size, bool use_mask)
     {
         dim3 threads;
         threads.x = 16;
@@ -353,7 +387,10 @@ namespace cv { namespace gpu { namespace surf
         DeviceReference<unsigned int> maxCounterWrapper(maxCounter);
 
-        nonmaxonly<<<grid, threads, smem_size>>>(hessianBuffer, maxPosBuffer, maxCounterWrapper);
+        if (use_mask)
+            nonmaxonly<WithMask><<<grid, threads, smem_size>>>(hessianBuffer, maxPosBuffer, maxCounterWrapper);
+        else
+            nonmaxonly<WithOutMask><<<grid, threads, smem_size>>>(hessianBuffer, maxPosBuffer, maxCounterWrapper);
 
         cudaSafeCall( cudaThreadSynchronize() );
     }
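
The WithMask path assumes maskSumTex has already been bound to the integral image of the mask before the launch; that binding is done elsewhere in the file and is not part of this diff. A hypothetical helper sketching the idea (the name bindMaskSumTex is an assumption):

    // Hypothetical: bind the mask's integral image (32-bit ints, pitched device
    // memory) to maskSumTex so that WithMask::check can sample it.
    void bindMaskSumTex(const DevMem2Di& maskSum)
    {
        cudaChannelFormatDesc desc = cudaCreateChannelDesc<int>();
        cudaSafeCall( cudaBindTexture2D(0, maskSumTex, maskSum.data, desc,
                                        maskSum.cols, maskSum.rows, maskSum.step) );
    }

With the texture bound, calling nonmaxonly_gpu with use_mask set to true selects the WithMask instantiation at compile time, so the unmasked path pays no per-candidate cost for the check.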