From 924670d32c5bb3cebc83dc582067b95a597e8fcf Mon Sep 17 00:00:00 2001
From: Vladislav Vinogradov <no@email>
Date: Wed, 9 Feb 2011 09:11:11 +0000
Subject: [PATCH] fixed block size calculation in SURF_GPU (fasthessian_gpu and
 nonmaxonly_gpu kernels)

---
 modules/gpu/src/cuda/surf.cu          | 40 ++++++++++++++-------------
 modules/gpu/src/surf.cpp              | 15 ++++++----
 samples/gpu/surf_keypoint_matcher.cpp |  5 ++++
 tests/gpu/src/features2d.cpp          |  2 ++
 4 files changed, 37 insertions(+), 25 deletions(-)
diff --git a/modules/gpu/src/cuda/surf.cu b/modules/gpu/src/cuda/surf.cu
index 00f62d475..cb47f2822 100644
--- a/modules/gpu/src/cuda/surf.cu
+++ b/modules/gpu/src/cuda/surf.cu
@@ -237,20 +237,31 @@ namespace cv { namespace gpu { namespace surf
 
 	        hessianBuffer.ptr(c_y_size * hidx_z + hidx_y)[hidx_x] = result;
         }
-    }   
-
-    void fasthessian_gpu(PtrStepf hessianBuffer, int nIntervals, int x_size, int y_size)
+    }
+    
+    dim3 calcBlockSize(int nIntervals)
     {
-        dim3 threads;
-        threads.x = 16;
-        threads.y = 8;
+        int threadsPerBlock = 512;
+        
+        dim3 threads;        
         threads.z = nIntervals;
+        threadsPerBlock /= nIntervals;
+        if (threadsPerBlock >= 48)
+            threads.x = 16;
+        else
+            threads.x = 8;
+        threadsPerBlock /= threads.x;
+        threads.y = threadsPerBlock;
+        
+        return threads;
+    }
 
+    void fasthessian_gpu(PtrStepf hessianBuffer, int x_size, int y_size, const dim3& threads)
+    {
         dim3 grid;
         grid.x = divUp(x_size, threads.x);
         grid.y = divUp(y_size, threads.y);
-        grid.z = 1;
-
+        
   	    fasthessian<<<grid, threads>>>(hessianBuffer);
 
         cudaSafeCall( cudaThreadSynchronize() );
@@ -370,17 +381,11 @@ namespace cv { namespace gpu { namespace surf
     }
 
     void nonmaxonly_gpu(PtrStepf hessianBuffer, int4* maxPosBuffer, unsigned int& maxCounter, 
-        int nIntervals, int x_size, int y_size, bool use_mask)
+        int x_size, int y_size, bool use_mask, const dim3& threads)
     {
-        dim3 threads;
-        threads.x = 16;
-        threads.y = 8;
-        threads.z = nIntervals;
-
         dim3 grid;
         grid.x = divUp(x_size, threads.x - 2);
         grid.y = divUp(y_size, threads.y - 2);
-        grid.z = 1;
 
         const size_t smem_size = threads.x * threads.y * threads.z * sizeof(float);
 
@@ -565,8 +570,6 @@ namespace cv { namespace gpu { namespace surf
     
         dim3 grid;
         grid.x = maxCounter;
-        grid.y = 1; 
-        grid.z = 1;
 
         DeviceReference<unsigned int> featureCounterWrapper(featureCounter);
     
@@ -624,6 +627,7 @@ namespace cv { namespace gpu { namespace surf
 	    // - SURF says to only use a circle, but the branching logic would slow it down
 	    // - Gaussian weighting should reduce the effects of the outer points anyway
         if (tid2 < 169)
+
         {
 	        dx -=     texLookups[threadIdx.x    ][threadIdx.y    ];
 	        dx += 2.f*texLookups[threadIdx.x + 2][threadIdx.y    ];
@@ -709,8 +713,6 @@ namespace cv { namespace gpu { namespace surf
 
         dim3 grid;
         grid.x = nFeatures;
-        grid.y = 1;
-        grid.z = 1;
 
         find_orientation<<<grid, threads>>>(features);
         cudaSafeCall( cudaThreadSynchronize() );
diff --git a/modules/gpu/src/surf.cpp b/modules/gpu/src/surf.cpp
index b5ab0d902..145ca4992 100644
--- a/modules/gpu/src/surf.cpp
+++ b/modules/gpu/src/surf.cpp
@@ -61,11 +61,13 @@ void cv::gpu::SURF_GPU::operator()(const GpuMat&, const GpuMat&, vector<KeyPoint
 #else /* !defined (HAVE_CUDA) */
 
 namespace cv { namespace gpu { namespace surf
-{    
-    void fasthessian_gpu(PtrStepf hessianBuffer, int nIntervals, int x_size, int y_size);
+{
+    dim3 calcBlockSize(int nIntervals);
+    
+    void fasthessian_gpu(PtrStepf hessianBuffer, int x_size, int y_size, const dim3& threads);
     
     void nonmaxonly_gpu(PtrStepf hessianBuffer, int4* maxPosBuffer, unsigned int& maxCounter, 
-        int nIntervals, int x_size, int y_size, bool use_mask);
+        int x_size, int y_size, bool use_mask, const dim3& threads);
     
     void fh_interp_extremum_gpu(PtrStepf hessianBuffer, const int4* maxPosBuffer, unsigned int maxCounter, 
         KeyPoint_GPU* featuresBuffer, unsigned int& featureCounter);
@@ -103,7 +105,7 @@ namespace
         {
             CV_Assert(!img.empty() && img.type() == CV_8UC1);
             CV_Assert(mask.empty() || (mask.size() == img.size() && mask.type() == CV_8UC1));
-            CV_Assert(nOctaves > 0 && nIntervals > 2);
+            CV_Assert(nOctaves > 0 && nIntervals > 2 && nIntervals < 22);
             CV_Assert(DeviceInfo().has(ATOMICS));
 
             max_features = static_cast<int>(img.size().area() * featuresRatio);
@@ -168,6 +170,7 @@ namespace
 
         void detectKeypoints(GpuMat& keypoints)
         {
+            dim3 threads = calcBlockSize(nIntervals);
             for(int octave = 0; octave < nOctaves; ++octave)
             {
                 int step = initialStep * (1 << octave);
@@ -189,12 +192,12 @@ namespace
                 uploadConstant("cv::gpu::surf::c_border", border);
                 uploadConstant("cv::gpu::surf::c_step",   step);
 
-                fasthessian_gpu(hessianBuffer, nIntervals, x_size, y_size);
+                fasthessian_gpu(hessianBuffer, x_size, y_size, threads);
 
                 // Reset the candidate count.
                 maxCounter = 0;
 
-                nonmaxonly_gpu(hessianBuffer, maxPosBuffer.ptr<int4>(), maxCounter, nIntervals, x_size, y_size, use_mask); 
+                nonmaxonly_gpu(hessianBuffer, maxPosBuffer.ptr<int4>(), maxCounter, x_size, y_size, use_mask, threads); 
                 
                 maxCounter = std::min(maxCounter, static_cast<unsigned int>(max_candidates));
 
diff --git a/samples/gpu/surf_keypoint_matcher.cpp b/samples/gpu/surf_keypoint_matcher.cpp
index c109ee70f..b2c93852e 100644
--- a/samples/gpu/surf_keypoint_matcher.cpp
+++ b/samples/gpu/surf_keypoint_matcher.cpp
@@ -38,6 +38,9 @@ int main(int argc, char* argv[])
     GpuMat descriptors1GPU, descriptors2GPU;
     surf(img1, GpuMat(), keypoints1GPU, descriptors1GPU);
     surf(img2, GpuMat(), keypoints2GPU, descriptors2GPU);
+    
+    cout << "FOUND " << keypoints1GPU.cols << " keypoints on first image" << endl;
+    cout << "FOUND " << keypoints2GPU.cols << " keypoints on second image" << endl;
 
     // matching descriptors
     BruteForceMatcher_GPU< L2<float> > matcher;
@@ -57,6 +60,8 @@ int main(int argc, char* argv[])
     // drawing the results
     Mat img_matches;
     drawMatches(img1, keypoints1, img2, keypoints2, matches, img_matches);
+    
+    namedWindow("matches", 0);
     imshow("matches", img_matches);
     waitKey(0);
 
diff --git a/tests/gpu/src/features2d.cpp b/tests/gpu/src/features2d.cpp
index 8a754478e..99fb28d4c 100644
--- a/tests/gpu/src/features2d.cpp
+++ b/tests/gpu/src/features2d.cpp
@@ -149,12 +149,14 @@ void CV_GPU_SURFTest::compareKeypointSets(const vector<KeyPoint>& validKeypoints
         assert(minDist >= 0);
         if (!isSimilarKeypoints(validKeypoints[v], calcKeypoints[nearestIdx]))
         {
+            ts->printf(CvTS::LOG, "Bad keypoints accuracy.\n");
             ts->set_failed_test_info( CvTS::FAIL_BAD_ACCURACY );
             return;
         }
 
         if (norm(validDescriptors.row(v), calcDescriptors.row(nearestIdx), NORM_L2) > 1.0f)
         {
+            ts->printf(CvTS::LOG, "Bad descriptors accuracy.\n");
             ts->set_failed_test_info( CvTS::FAIL_BAD_ACCURACY );
             return;
         }