From 924670d32c5bb3cebc83dc582067b95a597e8fcf Mon Sep 17 00:00:00 2001 From: Vladislav Vinogradov Date: Wed, 9 Feb 2011 09:11:11 +0000 Subject: [PATCH] fixed block size calculation in SURF_GPU (fasthessian_gpu and nonmaxonly_gpu kernels) --- modules/gpu/src/cuda/surf.cu | 40 ++++++++++++++------------- modules/gpu/src/surf.cpp | 15 ++++++---- samples/gpu/surf_keypoint_matcher.cpp | 5 ++++ tests/gpu/src/features2d.cpp | 2 ++ 4 files changed, 37 insertions(+), 25 deletions(-) diff --git a/modules/gpu/src/cuda/surf.cu b/modules/gpu/src/cuda/surf.cu index 00f62d475..cb47f2822 100644 --- a/modules/gpu/src/cuda/surf.cu +++ b/modules/gpu/src/cuda/surf.cu @@ -237,20 +237,31 @@ namespace cv { namespace gpu { namespace surf hessianBuffer.ptr(c_y_size * hidx_z + hidx_y)[hidx_x] = result; } - } - - void fasthessian_gpu(PtrStepf hessianBuffer, int nIntervals, int x_size, int y_size) + } + + dim3 calcBlockSize(int nIntervals) { - dim3 threads; - threads.x = 16; - threads.y = 8; + int threadsPerBlock = 512; + + dim3 threads; threads.z = nIntervals; + threadsPerBlock /= nIntervals; + if (threadsPerBlock >= 48) + threads.x = 16; + else + threads.x = 8; + threadsPerBlock /= threads.x; + threads.y = threadsPerBlock; + + return threads; + } + void fasthessian_gpu(PtrStepf hessianBuffer, int x_size, int y_size, const dim3& threads) + { dim3 grid; grid.x = divUp(x_size, threads.x); grid.y = divUp(y_size, threads.y); - grid.z = 1; - + fasthessian<<>>(hessianBuffer); cudaSafeCall( cudaThreadSynchronize() ); @@ -370,17 +381,11 @@ namespace cv { namespace gpu { namespace surf } void nonmaxonly_gpu(PtrStepf hessianBuffer, int4* maxPosBuffer, unsigned int& maxCounter, - int nIntervals, int x_size, int y_size, bool use_mask) + int x_size, int y_size, bool use_mask, const dim3& threads) { - dim3 threads; - threads.x = 16; - threads.y = 8; - threads.z = nIntervals; - dim3 grid; grid.x = divUp(x_size, threads.x - 2); grid.y = divUp(y_size, threads.y - 2); - grid.z = 1; const size_t smem_size = threads.x * threads.y * threads.z * sizeof(float); @@ -565,8 +570,6 @@ namespace cv { namespace gpu { namespace surf dim3 grid; grid.x = maxCounter; - grid.y = 1; - grid.z = 1; DeviceReference featureCounterWrapper(featureCounter); @@ -624,6 +627,7 @@ namespace cv { namespace gpu { namespace surf // - SURF says to only use a circle, but the branching logic would slow it down // - Gaussian weighting should reduce the effects of the outer points anyway if (tid2 < 169) + { dx -= texLookups[threadIdx.x ][threadIdx.y ]; dx += 2.f*texLookups[threadIdx.x + 2][threadIdx.y ]; @@ -709,8 +713,6 @@ namespace cv { namespace gpu { namespace surf dim3 grid; grid.x = nFeatures; - grid.y = 1; - grid.z = 1; find_orientation<<>>(features); cudaSafeCall( cudaThreadSynchronize() ); diff --git a/modules/gpu/src/surf.cpp b/modules/gpu/src/surf.cpp index b5ab0d902..145ca4992 100644 --- a/modules/gpu/src/surf.cpp +++ b/modules/gpu/src/surf.cpp @@ -61,11 +61,13 @@ void cv::gpu::SURF_GPU::operator()(const GpuMat&, const GpuMat&, vector 0 && nIntervals > 2); + CV_Assert(nOctaves > 0 && nIntervals > 2 && nIntervals < 22); CV_Assert(DeviceInfo().has(ATOMICS)); max_features = static_cast(img.size().area() * featuresRatio); @@ -168,6 +170,7 @@ namespace void detectKeypoints(GpuMat& keypoints) { + dim3 threads = calcBlockSize(nIntervals); for(int octave = 0; octave < nOctaves; ++octave) { int step = initialStep * (1 << octave); @@ -189,12 +192,12 @@ namespace uploadConstant("cv::gpu::surf::c_border", border); uploadConstant("cv::gpu::surf::c_step", step); - fasthessian_gpu(hessianBuffer, nIntervals, x_size, y_size); + fasthessian_gpu(hessianBuffer, x_size, y_size, threads); // Reset the candidate count. maxCounter = 0; - nonmaxonly_gpu(hessianBuffer, maxPosBuffer.ptr(), maxCounter, nIntervals, x_size, y_size, use_mask); + nonmaxonly_gpu(hessianBuffer, maxPosBuffer.ptr(), maxCounter, x_size, y_size, use_mask, threads); maxCounter = std::min(maxCounter, static_cast(max_candidates)); diff --git a/samples/gpu/surf_keypoint_matcher.cpp b/samples/gpu/surf_keypoint_matcher.cpp index c109ee70f..b2c93852e 100644 --- a/samples/gpu/surf_keypoint_matcher.cpp +++ b/samples/gpu/surf_keypoint_matcher.cpp @@ -38,6 +38,9 @@ int main(int argc, char* argv[]) GpuMat descriptors1GPU, descriptors2GPU; surf(img1, GpuMat(), keypoints1GPU, descriptors1GPU); surf(img2, GpuMat(), keypoints2GPU, descriptors2GPU); + + cout << "FOUND " << keypoints1GPU.cols << " keypoints on first image" << endl; + cout << "FOUND " << keypoints2GPU.cols << " keypoints on second image" << endl; // matching descriptors BruteForceMatcher_GPU< L2 > matcher; @@ -57,6 +60,8 @@ int main(int argc, char* argv[]) // drawing the results Mat img_matches; drawMatches(img1, keypoints1, img2, keypoints2, matches, img_matches); + + namedWindow("matches", 0); imshow("matches", img_matches); waitKey(0); diff --git a/tests/gpu/src/features2d.cpp b/tests/gpu/src/features2d.cpp index 8a754478e..99fb28d4c 100644 --- a/tests/gpu/src/features2d.cpp +++ b/tests/gpu/src/features2d.cpp @@ -149,12 +149,14 @@ void CV_GPU_SURFTest::compareKeypointSets(const vector& validKeypoints assert(minDist >= 0); if (!isSimilarKeypoints(validKeypoints[v], calcKeypoints[nearestIdx])) { + ts->printf(CvTS::LOG, "Bad keypoints accuracy.\n"); ts->set_failed_test_info( CvTS::FAIL_BAD_ACCURACY ); return; } if (norm(validDescriptors.row(v), calcDescriptors.row(nearestIdx), NORM_L2) > 1.0f) { + ts->printf(CvTS::LOG, "Bad descriptors accuracy.\n"); ts->set_failed_test_info( CvTS::FAIL_BAD_ACCURACY ); return; }