refactor CUDA FAST feature detector algorithm:
use new FastFeatureDetector abstract interface and hidden implementation
This commit is contained in:
@@ -279,7 +279,7 @@ namespace cv { namespace cuda { namespace device
|
||||
#endif
|
||||
}
|
||||
|
||||
int calcKeypoints_gpu(PtrStepSzb img, PtrStepSzb mask, short2* kpLoc, int maxKeypoints, PtrStepSzi score, int threshold)
|
||||
int calcKeypoints_gpu(PtrStepSzb img, PtrStepSzb mask, short2* kpLoc, int maxKeypoints, PtrStepSzi score, int threshold, cudaStream_t stream)
|
||||
{
|
||||
void* counter_ptr;
|
||||
cudaSafeCall( cudaGetSymbolAddress(&counter_ptr, g_counter) );
|
||||
@@ -290,29 +290,29 @@ namespace cv { namespace cuda { namespace device
|
||||
grid.x = divUp(img.cols - 6, block.x);
|
||||
grid.y = divUp(img.rows - 6, block.y);
|
||||
|
||||
cudaSafeCall( cudaMemset(counter_ptr, 0, sizeof(unsigned int)) );
|
||||
cudaSafeCall( cudaMemsetAsync(counter_ptr, 0, sizeof(unsigned int), stream) );
|
||||
|
||||
if (score.data)
|
||||
{
|
||||
if (mask.data)
|
||||
calcKeypoints<true><<<grid, block>>>(img, SingleMask(mask), kpLoc, maxKeypoints, score, threshold);
|
||||
calcKeypoints<true><<<grid, block, 0, stream>>>(img, SingleMask(mask), kpLoc, maxKeypoints, score, threshold);
|
||||
else
|
||||
calcKeypoints<true><<<grid, block>>>(img, WithOutMask(), kpLoc, maxKeypoints, score, threshold);
|
||||
calcKeypoints<true><<<grid, block, 0, stream>>>(img, WithOutMask(), kpLoc, maxKeypoints, score, threshold);
|
||||
}
|
||||
else
|
||||
{
|
||||
if (mask.data)
|
||||
calcKeypoints<false><<<grid, block>>>(img, SingleMask(mask), kpLoc, maxKeypoints, score, threshold);
|
||||
calcKeypoints<false><<<grid, block, 0, stream>>>(img, SingleMask(mask), kpLoc, maxKeypoints, score, threshold);
|
||||
else
|
||||
calcKeypoints<false><<<grid, block>>>(img, WithOutMask(), kpLoc, maxKeypoints, score, threshold);
|
||||
calcKeypoints<false><<<grid, block, 0, stream>>>(img, WithOutMask(), kpLoc, maxKeypoints, score, threshold);
|
||||
}
|
||||
|
||||
cudaSafeCall( cudaGetLastError() );
|
||||
|
||||
cudaSafeCall( cudaDeviceSynchronize() );
|
||||
|
||||
unsigned int count;
|
||||
cudaSafeCall( cudaMemcpy(&count, counter_ptr, sizeof(unsigned int), cudaMemcpyDeviceToHost) );
|
||||
cudaSafeCall( cudaMemcpyAsync(&count, counter_ptr, sizeof(unsigned int), cudaMemcpyDeviceToHost, stream) );
|
||||
|
||||
cudaSafeCall( cudaStreamSynchronize(stream) );
|
||||
|
||||
return count;
|
||||
}
|
||||
@@ -356,7 +356,7 @@ namespace cv { namespace cuda { namespace device
|
||||
#endif
|
||||
}
|
||||
|
||||
int nonmaxSuppression_gpu(const short2* kpLoc, int count, PtrStepSzi score, short2* loc, float* response)
|
||||
int nonmaxSuppression_gpu(const short2* kpLoc, int count, PtrStepSzi score, short2* loc, float* response, cudaStream_t stream)
|
||||
{
|
||||
void* counter_ptr;
|
||||
cudaSafeCall( cudaGetSymbolAddress(&counter_ptr, g_counter) );
|
||||
@@ -366,15 +366,15 @@ namespace cv { namespace cuda { namespace device
|
||||
dim3 grid;
|
||||
grid.x = divUp(count, block.x);
|
||||
|
||||
cudaSafeCall( cudaMemset(counter_ptr, 0, sizeof(unsigned int)) );
|
||||
cudaSafeCall( cudaMemsetAsync(counter_ptr, 0, sizeof(unsigned int), stream) );
|
||||
|
||||
nonmaxSuppression<<<grid, block>>>(kpLoc, count, score, loc, response);
|
||||
nonmaxSuppression<<<grid, block, 0, stream>>>(kpLoc, count, score, loc, response);
|
||||
cudaSafeCall( cudaGetLastError() );
|
||||
|
||||
cudaSafeCall( cudaDeviceSynchronize() );
|
||||
|
||||
unsigned int new_count;
|
||||
cudaSafeCall( cudaMemcpy(&new_count, counter_ptr, sizeof(unsigned int), cudaMemcpyDeviceToHost) );
|
||||
cudaSafeCall( cudaMemcpyAsync(&new_count, counter_ptr, sizeof(unsigned int), cudaMemcpyDeviceToHost, stream) );
|
||||
|
||||
cudaSafeCall( cudaStreamSynchronize(stream) );
|
||||
|
||||
return new_count;
|
||||
}
|
||||
|
||||
@@ -47,124 +47,162 @@ using namespace cv::cuda;
|
||||
|
||||
#if !defined (HAVE_CUDA) || defined (CUDA_DISABLER)
|
||||
|
||||
cv::cuda::FAST_CUDA::FAST_CUDA(int, bool, double) { throw_no_cuda(); }
|
||||
void cv::cuda::FAST_CUDA::operator ()(const GpuMat&, const GpuMat&, GpuMat&) { throw_no_cuda(); }
|
||||
void cv::cuda::FAST_CUDA::operator ()(const GpuMat&, const GpuMat&, std::vector<KeyPoint>&) { throw_no_cuda(); }
|
||||
void cv::cuda::FAST_CUDA::downloadKeypoints(const GpuMat&, std::vector<KeyPoint>&) { throw_no_cuda(); }
|
||||
void cv::cuda::FAST_CUDA::convertKeypoints(const Mat&, std::vector<KeyPoint>&) { throw_no_cuda(); }
|
||||
void cv::cuda::FAST_CUDA::release() { throw_no_cuda(); }
|
||||
int cv::cuda::FAST_CUDA::calcKeyPointsLocation(const GpuMat&, const GpuMat&) { throw_no_cuda(); return 0; }
|
||||
int cv::cuda::FAST_CUDA::getKeyPoints(GpuMat&) { throw_no_cuda(); return 0; }
|
||||
Ptr<FastFeatureDetector> cv::cuda::FastFeatureDetector::create(int, bool, int, int) { throw_no_cuda(); return Ptr<FastFeatureDetector>(); }
|
||||
|
||||
#else /* !defined (HAVE_CUDA) */
|
||||
|
||||
cv::cuda::FAST_CUDA::FAST_CUDA(int _threshold, bool _nonmaxSuppression, double _keypointsRatio) :
|
||||
nonmaxSuppression(_nonmaxSuppression), threshold(_threshold), keypointsRatio(_keypointsRatio), count_(0)
|
||||
{
|
||||
}
|
||||
|
||||
void cv::cuda::FAST_CUDA::operator ()(const GpuMat& image, const GpuMat& mask, std::vector<KeyPoint>& keypoints)
|
||||
{
|
||||
if (image.empty())
|
||||
return;
|
||||
|
||||
(*this)(image, mask, d_keypoints_);
|
||||
downloadKeypoints(d_keypoints_, keypoints);
|
||||
}
|
||||
|
||||
void cv::cuda::FAST_CUDA::downloadKeypoints(const GpuMat& d_keypoints, std::vector<KeyPoint>& keypoints)
|
||||
{
|
||||
if (d_keypoints.empty())
|
||||
return;
|
||||
|
||||
Mat h_keypoints(d_keypoints);
|
||||
convertKeypoints(h_keypoints, keypoints);
|
||||
}
|
||||
|
||||
void cv::cuda::FAST_CUDA::convertKeypoints(const Mat& h_keypoints, std::vector<KeyPoint>& keypoints)
|
||||
{
|
||||
if (h_keypoints.empty())
|
||||
return;
|
||||
|
||||
CV_Assert(h_keypoints.rows == ROWS_COUNT && h_keypoints.elemSize() == 4);
|
||||
|
||||
int npoints = h_keypoints.cols;
|
||||
|
||||
keypoints.resize(npoints);
|
||||
|
||||
const short2* loc_row = h_keypoints.ptr<short2>(LOCATION_ROW);
|
||||
const float* response_row = h_keypoints.ptr<float>(RESPONSE_ROW);
|
||||
|
||||
for (int i = 0; i < npoints; ++i)
|
||||
{
|
||||
KeyPoint kp(loc_row[i].x, loc_row[i].y, static_cast<float>(FEATURE_SIZE), -1, response_row[i]);
|
||||
keypoints[i] = kp;
|
||||
}
|
||||
}
|
||||
|
||||
void cv::cuda::FAST_CUDA::operator ()(const GpuMat& img, const GpuMat& mask, GpuMat& keypoints)
|
||||
{
|
||||
calcKeyPointsLocation(img, mask);
|
||||
keypoints.cols = getKeyPoints(keypoints);
|
||||
}
|
||||
|
||||
namespace cv { namespace cuda { namespace device
|
||||
{
|
||||
namespace fast
|
||||
{
|
||||
int calcKeypoints_gpu(PtrStepSzb img, PtrStepSzb mask, short2* kpLoc, int maxKeypoints, PtrStepSzi score, int threshold);
|
||||
int nonmaxSuppression_gpu(const short2* kpLoc, int count, PtrStepSzi score, short2* loc, float* response);
|
||||
int calcKeypoints_gpu(PtrStepSzb img, PtrStepSzb mask, short2* kpLoc, int maxKeypoints, PtrStepSzi score, int threshold, cudaStream_t stream);
|
||||
int nonmaxSuppression_gpu(const short2* kpLoc, int count, PtrStepSzi score, short2* loc, float* response, cudaStream_t stream);
|
||||
}
|
||||
}}}
|
||||
|
||||
int cv::cuda::FAST_CUDA::calcKeyPointsLocation(const GpuMat& img, const GpuMat& mask)
|
||||
namespace
|
||||
{
|
||||
using namespace cv::cuda::device::fast;
|
||||
|
||||
CV_Assert(img.type() == CV_8UC1);
|
||||
CV_Assert(mask.empty() || (mask.type() == CV_8UC1 && mask.size() == img.size()));
|
||||
|
||||
int maxKeypoints = static_cast<int>(keypointsRatio * img.size().area());
|
||||
|
||||
ensureSizeIsEnough(1, maxKeypoints, CV_16SC2, kpLoc_);
|
||||
|
||||
if (nonmaxSuppression)
|
||||
class FAST_Impl : public cv::cuda::FastFeatureDetector
|
||||
{
|
||||
public:
|
||||
FAST_Impl(int threshold, bool nonmaxSuppression, int max_npoints);
|
||||
|
||||
virtual void detect(InputArray _image, std::vector<KeyPoint>& keypoints, InputArray _mask);
|
||||
virtual void detectAsync(InputArray _image, OutputArray _keypoints, InputArray _mask, Stream& stream);
|
||||
|
||||
virtual void convert(InputArray _gpu_keypoints, std::vector<KeyPoint>& keypoints);
|
||||
|
||||
virtual void setThreshold(int threshold) { threshold_ = threshold; }
|
||||
virtual int getThreshold() const { return threshold_; }
|
||||
|
||||
virtual void setNonmaxSuppression(bool f) { nonmaxSuppression_ = f; }
|
||||
virtual bool getNonmaxSuppression() const { return nonmaxSuppression_; }
|
||||
|
||||
virtual void setMaxNumPoints(int max_npoints) { max_npoints_ = max_npoints; }
|
||||
virtual int getMaxNumPoints() const { return max_npoints_; }
|
||||
|
||||
virtual void setType(int type) { CV_Assert( type == TYPE_9_16 ); }
|
||||
virtual int getType() const { return TYPE_9_16; }
|
||||
|
||||
private:
|
||||
int threshold_;
|
||||
bool nonmaxSuppression_;
|
||||
int max_npoints_;
|
||||
};
|
||||
|
||||
FAST_Impl::FAST_Impl(int threshold, bool nonmaxSuppression, int max_npoints) :
|
||||
threshold_(threshold), nonmaxSuppression_(nonmaxSuppression), max_npoints_(max_npoints)
|
||||
{
|
||||
ensureSizeIsEnough(img.size(), CV_32SC1, score_);
|
||||
score_.setTo(Scalar::all(0));
|
||||
}
|
||||
|
||||
count_ = calcKeypoints_gpu(img, mask, kpLoc_.ptr<short2>(), maxKeypoints, nonmaxSuppression ? score_ : PtrStepSzi(), threshold);
|
||||
count_ = std::min(count_, maxKeypoints);
|
||||
void FAST_Impl::detect(InputArray _image, std::vector<KeyPoint>& keypoints, InputArray _mask)
|
||||
{
|
||||
if (_image.empty())
|
||||
{
|
||||
keypoints.clear();
|
||||
return;
|
||||
}
|
||||
|
||||
return count_;
|
||||
BufferPool pool(Stream::Null());
|
||||
GpuMat d_keypoints = pool.getBuffer(ROWS_COUNT, max_npoints_, CV_16SC2);
|
||||
|
||||
detectAsync(_image, d_keypoints, _mask, Stream::Null());
|
||||
convert(d_keypoints, keypoints);
|
||||
}
|
||||
|
||||
void FAST_Impl::detectAsync(InputArray _image, OutputArray _keypoints, InputArray _mask, Stream& stream)
|
||||
{
|
||||
using namespace cv::cuda::device::fast;
|
||||
|
||||
const GpuMat img = _image.getGpuMat();
|
||||
const GpuMat mask = _mask.getGpuMat();
|
||||
|
||||
CV_Assert( img.type() == CV_8UC1 );
|
||||
CV_Assert( mask.empty() || (mask.type() == CV_8UC1 && mask.size() == img.size()) );
|
||||
|
||||
BufferPool pool(stream);
|
||||
|
||||
GpuMat kpLoc = pool.getBuffer(1, max_npoints_, CV_16SC2);
|
||||
|
||||
GpuMat score;
|
||||
if (nonmaxSuppression_)
|
||||
{
|
||||
score = pool.getBuffer(img.size(), CV_32SC1);
|
||||
score.setTo(Scalar::all(0), stream);
|
||||
}
|
||||
|
||||
int count = calcKeypoints_gpu(img, mask, kpLoc.ptr<short2>(), max_npoints_, score, threshold_, StreamAccessor::getStream(stream));
|
||||
count = std::min(count, max_npoints_);
|
||||
|
||||
if (count == 0)
|
||||
{
|
||||
_keypoints.release();
|
||||
return;
|
||||
}
|
||||
|
||||
ensureSizeIsEnough(ROWS_COUNT, count, CV_32FC1, _keypoints);
|
||||
GpuMat& keypoints = _keypoints.getGpuMatRef();
|
||||
|
||||
if (nonmaxSuppression_)
|
||||
{
|
||||
count = nonmaxSuppression_gpu(kpLoc.ptr<short2>(), count, score, keypoints.ptr<short2>(LOCATION_ROW), keypoints.ptr<float>(RESPONSE_ROW), StreamAccessor::getStream(stream));
|
||||
if (count == 0)
|
||||
{
|
||||
keypoints.release();
|
||||
}
|
||||
else
|
||||
{
|
||||
keypoints.cols = count;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
GpuMat locRow(1, count, kpLoc.type(), keypoints.ptr(0));
|
||||
kpLoc.colRange(0, count).copyTo(locRow, stream);
|
||||
keypoints.row(1).setTo(Scalar::all(0), stream);
|
||||
}
|
||||
}
|
||||
|
||||
void FAST_Impl::convert(InputArray _gpu_keypoints, std::vector<KeyPoint>& keypoints)
|
||||
{
|
||||
if (_gpu_keypoints.empty())
|
||||
{
|
||||
keypoints.clear();
|
||||
return;
|
||||
}
|
||||
|
||||
Mat h_keypoints;
|
||||
if (_gpu_keypoints.kind() == _InputArray::CUDA_GPU_MAT)
|
||||
{
|
||||
_gpu_keypoints.getGpuMat().download(h_keypoints);
|
||||
}
|
||||
else
|
||||
{
|
||||
h_keypoints = _gpu_keypoints.getMat();
|
||||
}
|
||||
|
||||
CV_Assert( h_keypoints.rows == ROWS_COUNT );
|
||||
CV_Assert( h_keypoints.elemSize() == 4 );
|
||||
|
||||
const int npoints = h_keypoints.cols;
|
||||
|
||||
keypoints.resize(npoints);
|
||||
|
||||
const short2* loc_row = h_keypoints.ptr<short2>(LOCATION_ROW);
|
||||
const float* response_row = h_keypoints.ptr<float>(RESPONSE_ROW);
|
||||
|
||||
for (int i = 0; i < npoints; ++i)
|
||||
{
|
||||
KeyPoint kp(loc_row[i].x, loc_row[i].y, static_cast<float>(FEATURE_SIZE), -1, response_row[i]);
|
||||
keypoints[i] = kp;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
int cv::cuda::FAST_CUDA::getKeyPoints(GpuMat& keypoints)
|
||||
Ptr<cv::cuda::FastFeatureDetector> cv::cuda::FastFeatureDetector::create(int threshold, bool nonmaxSuppression, int type, int max_npoints)
|
||||
{
|
||||
using namespace cv::cuda::device::fast;
|
||||
|
||||
if (count_ == 0)
|
||||
return 0;
|
||||
|
||||
ensureSizeIsEnough(ROWS_COUNT, count_, CV_32FC1, keypoints);
|
||||
|
||||
if (nonmaxSuppression)
|
||||
return nonmaxSuppression_gpu(kpLoc_.ptr<short2>(), count_, score_, keypoints.ptr<short2>(LOCATION_ROW), keypoints.ptr<float>(RESPONSE_ROW));
|
||||
|
||||
GpuMat locRow(1, count_, kpLoc_.type(), keypoints.ptr(0));
|
||||
kpLoc_.colRange(0, count_).copyTo(locRow);
|
||||
keypoints.row(1).setTo(Scalar::all(0));
|
||||
|
||||
return count_;
|
||||
}
|
||||
|
||||
void cv::cuda::FAST_CUDA::release()
|
||||
{
|
||||
kpLoc_.release();
|
||||
score_.release();
|
||||
|
||||
d_keypoints_.release();
|
||||
CV_Assert( type == TYPE_9_16 );
|
||||
return makePtr<FAST_Impl>(threshold, nonmaxSuppression, max_npoints);
|
||||
}
|
||||
|
||||
#endif /* !defined (HAVE_CUDA) */
|
||||
|
||||
@@ -398,7 +398,7 @@ namespace
|
||||
cv::cuda::ORB_CUDA::ORB_CUDA(int nFeatures, float scaleFactor, int nLevels, int edgeThreshold, int firstLevel, int WTA_K, int scoreType, int patchSize) :
|
||||
nFeatures_(nFeatures), scaleFactor_(scaleFactor), nLevels_(nLevels), edgeThreshold_(edgeThreshold), firstLevel_(firstLevel), WTA_K_(WTA_K),
|
||||
scoreType_(scoreType), patchSize_(patchSize),
|
||||
fastDetector_(DEFAULT_FAST_THRESHOLD)
|
||||
fastDetector_(cuda::FastFeatureDetector::create(DEFAULT_FAST_THRESHOLD))
|
||||
{
|
||||
CV_Assert(patchSize_ >= 2);
|
||||
|
||||
@@ -554,7 +554,7 @@ namespace
|
||||
return;
|
||||
}
|
||||
|
||||
count = cull_gpu(keypoints.ptr<int>(FAST_CUDA::LOCATION_ROW), keypoints.ptr<float>(FAST_CUDA::RESPONSE_ROW), count, n_points);
|
||||
count = cull_gpu(keypoints.ptr<int>(cuda::FastFeatureDetector::LOCATION_ROW), keypoints.ptr<float>(cuda::FastFeatureDetector::RESPONSE_ROW), count, n_points);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -570,20 +570,20 @@ void cv::cuda::ORB_CUDA::computeKeyPointsPyramid()
|
||||
|
||||
for (int level = 0; level < nLevels_; ++level)
|
||||
{
|
||||
keyPointsCount_[level] = fastDetector_.calcKeyPointsLocation(imagePyr_[level], maskPyr_[level]);
|
||||
fastDetector_->setMaxNumPoints(0.05 * imagePyr_[level].size().area());
|
||||
|
||||
GpuMat fastKpRange;
|
||||
fastDetector_->detectAsync(imagePyr_[level], fastKpRange, maskPyr_[level], Stream::Null());
|
||||
|
||||
keyPointsCount_[level] = fastKpRange.cols;
|
||||
|
||||
if (keyPointsCount_[level] == 0)
|
||||
continue;
|
||||
|
||||
ensureSizeIsEnough(3, keyPointsCount_[level], CV_32FC1, keyPointsPyr_[level]);
|
||||
ensureSizeIsEnough(3, keyPointsCount_[level], fastKpRange.type(), keyPointsPyr_[level]);
|
||||
fastKpRange.copyTo(keyPointsPyr_[level].rowRange(0, 2));
|
||||
|
||||
GpuMat fastKpRange = keyPointsPyr_[level].rowRange(0, 2);
|
||||
keyPointsCount_[level] = fastDetector_.getKeyPoints(fastKpRange);
|
||||
|
||||
if (keyPointsCount_[level] == 0)
|
||||
continue;
|
||||
|
||||
int n_features = static_cast<int>(n_features_per_level_[level]);
|
||||
const int n_features = static_cast<int>(n_features_per_level_[level]);
|
||||
|
||||
if (scoreType_ == ORB::HARRIS_SCORE)
|
||||
{
|
||||
@@ -767,8 +767,6 @@ void cv::cuda::ORB_CUDA::release()
|
||||
|
||||
keyPointsPyr_.clear();
|
||||
|
||||
fastDetector_.release();
|
||||
|
||||
d_keypoints_.release();
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user