Implemented async calls.
This commit is contained in:
parent
7783206934
commit
79ecefb51f
@ -44,6 +44,8 @@
|
||||
|
||||
#include <thrust/device_ptr.h>
|
||||
#include <thrust/sort.h>
|
||||
#include <thrust/system/cuda/execution_policy.h>
|
||||
|
||||
|
||||
#include "opencv2/core/cuda/common.hpp"
|
||||
#include "opencv2/core/cuda/reduce.hpp"
|
||||
@ -56,13 +58,17 @@ namespace cv { namespace cuda { namespace device
|
||||
////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
// cull
|
||||
|
||||
int cull_gpu(int* loc, float* response, int size, int n_points)
|
||||
int cull_gpu(int* loc, float* response, int size, int n_points, cudaStream_t stream)
|
||||
{
|
||||
thrust::device_ptr<int> loc_ptr(loc);
|
||||
thrust::device_ptr<float> response_ptr(response);
|
||||
|
||||
thrust::sort_by_key(response_ptr, response_ptr + size, loc_ptr, thrust::greater<float>());
|
||||
|
||||
if(stream)
|
||||
{
|
||||
thrust::sort_by_key(thrust::cuda::par.on(stream), response_ptr, response_ptr + size, loc_ptr, thrust::greater<float>());
|
||||
}else
|
||||
{
|
||||
thrust::sort_by_key(response_ptr, response_ptr + size, loc_ptr, thrust::greater<float>());
|
||||
}
|
||||
return n_points;
|
||||
}
|
||||
|
||||
|
@ -55,7 +55,7 @@ namespace cv { namespace cuda { namespace device
|
||||
{
|
||||
namespace orb
|
||||
{
|
||||
int cull_gpu(int* loc, float* response, int size, int n_points);
|
||||
int cull_gpu(int* loc, float* response, int size, int n_points, cudaStream_t stream);
|
||||
|
||||
void HarrisResponses_gpu(PtrStepSzb img, const short2* loc, float* response, const int npoints, int blockSize, float harris_k, cudaStream_t stream);
|
||||
|
||||
@ -401,10 +401,10 @@ namespace
|
||||
bool blurForDescriptor_;
|
||||
|
||||
private:
|
||||
void buildScalePyramids(InputArray _image, InputArray _mask);
|
||||
void computeKeyPointsPyramid();
|
||||
void computeDescriptors(OutputArray _descriptors);
|
||||
void mergeKeyPoints(OutputArray _keypoints);
|
||||
void buildScalePyramids(InputArray _image, InputArray _mask, Stream& stream);
|
||||
void computeKeyPointsPyramid(Stream& stream);
|
||||
void computeDescriptors(OutputArray _descriptors, Stream& stream);
|
||||
void mergeKeyPoints(OutputArray _keypoints, Stream& stream);
|
||||
|
||||
private:
|
||||
Ptr<cv::cuda::FastFeatureDetector> fastDetector_;
|
||||
@ -582,13 +582,13 @@ namespace
|
||||
{
|
||||
CV_Assert( useProvidedKeypoints == false );
|
||||
|
||||
buildScalePyramids(_image, _mask);
|
||||
computeKeyPointsPyramid();
|
||||
buildScalePyramids(_image, _mask, stream);
|
||||
computeKeyPointsPyramid(stream);
|
||||
if (_descriptors.needed())
|
||||
{
|
||||
computeDescriptors(_descriptors);
|
||||
computeDescriptors(_descriptors, stream);
|
||||
}
|
||||
mergeKeyPoints(_keypoints);
|
||||
mergeKeyPoints(_keypoints, stream);
|
||||
}
|
||||
|
||||
static float getScale(float scaleFactor, int firstLevel, int level)
|
||||
@ -596,7 +596,7 @@ namespace
|
||||
return pow(scaleFactor, level - firstLevel);
|
||||
}
|
||||
|
||||
void ORB_Impl::buildScalePyramids(InputArray _image, InputArray _mask)
|
||||
void ORB_Impl::buildScalePyramids(InputArray _image, InputArray _mask, Stream& stream)
|
||||
{
|
||||
const GpuMat image = _image.getGpuMat();
|
||||
const GpuMat mask = _mask.getGpuMat();
|
||||
@ -622,28 +622,28 @@ namespace
|
||||
{
|
||||
if (level < firstLevel_)
|
||||
{
|
||||
cuda::resize(image, imagePyr_[level], sz, 0, 0, INTER_LINEAR);
|
||||
cuda::resize(image, imagePyr_[level], sz, 0, 0, INTER_LINEAR, stream);
|
||||
|
||||
if (!mask.empty())
|
||||
cuda::resize(mask, maskPyr_[level], sz, 0, 0, INTER_LINEAR);
|
||||
cuda::resize(mask, maskPyr_[level], sz, 0, 0, INTER_LINEAR, stream);
|
||||
}
|
||||
else
|
||||
{
|
||||
cuda::resize(imagePyr_[level - 1], imagePyr_[level], sz, 0, 0, INTER_LINEAR);
|
||||
cuda::resize(imagePyr_[level - 1], imagePyr_[level], sz, 0, 0, INTER_LINEAR, stream);
|
||||
|
||||
if (!mask.empty())
|
||||
{
|
||||
cuda::resize(maskPyr_[level - 1], maskPyr_[level], sz, 0, 0, INTER_LINEAR);
|
||||
cuda::threshold(maskPyr_[level], maskPyr_[level], 254, 0, THRESH_TOZERO);
|
||||
cuda::resize(maskPyr_[level - 1], maskPyr_[level], sz, 0, 0, INTER_LINEAR, stream);
|
||||
cuda::threshold(maskPyr_[level], maskPyr_[level], 254, 0, THRESH_TOZERO, stream);
|
||||
}
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
image.copyTo(imagePyr_[level]);
|
||||
image.copyTo(imagePyr_[level], stream);
|
||||
|
||||
if (!mask.empty())
|
||||
mask.copyTo(maskPyr_[level]);
|
||||
mask.copyTo(maskPyr_[level], stream);
|
||||
}
|
||||
|
||||
// Filter keypoints by image border
|
||||
@ -652,12 +652,12 @@ namespace
|
||||
Rect inner(edgeThreshold_, edgeThreshold_, sz.width - 2 * edgeThreshold_, sz.height - 2 * edgeThreshold_);
|
||||
buf_(inner).setTo(Scalar::all(255));
|
||||
|
||||
cuda::bitwise_and(maskPyr_[level], buf_, maskPyr_[level]);
|
||||
cuda::bitwise_and(maskPyr_[level], buf_, maskPyr_[level], stream);
|
||||
}
|
||||
}
|
||||
|
||||
// takes keypoints and culls them by the response
|
||||
static void cull(GpuMat& keypoints, int& count, int n_points)
|
||||
static void cull(GpuMat& keypoints, int& count, int n_points, Stream& stream)
|
||||
{
|
||||
using namespace cv::cuda::device::orb;
|
||||
|
||||
@ -670,11 +670,11 @@ namespace
|
||||
return;
|
||||
}
|
||||
|
||||
count = cull_gpu(keypoints.ptr<int>(cuda::FastFeatureDetector::LOCATION_ROW), keypoints.ptr<float>(cuda::FastFeatureDetector::RESPONSE_ROW), count, n_points);
|
||||
count = cull_gpu(keypoints.ptr<int>(cuda::FastFeatureDetector::LOCATION_ROW), keypoints.ptr<float>(cuda::FastFeatureDetector::RESPONSE_ROW), count, n_points, StreamAccessor::getStream(stream));
|
||||
}
|
||||
}
|
||||
|
||||
void ORB_Impl::computeKeyPointsPyramid()
|
||||
void ORB_Impl::computeKeyPointsPyramid(Stream& stream)
|
||||
{
|
||||
using namespace cv::cuda::device::orb;
|
||||
|
||||
@ -690,7 +690,7 @@ namespace
|
||||
fastDetector_->setMaxNumPoints(0.05 * imagePyr_[level].size().area());
|
||||
|
||||
GpuMat fastKpRange;
|
||||
fastDetector_->detectAsync(imagePyr_[level], fastKpRange, maskPyr_[level], Stream::Null());
|
||||
fastDetector_->detectAsync(imagePyr_[level], fastKpRange, maskPyr_[level], stream);
|
||||
|
||||
keyPointsCount_[level] = fastKpRange.cols;
|
||||
|
||||
@ -698,28 +698,28 @@ namespace
|
||||
continue;
|
||||
|
||||
ensureSizeIsEnough(3, keyPointsCount_[level], fastKpRange.type(), keyPointsPyr_[level]);
|
||||
fastKpRange.copyTo(keyPointsPyr_[level].rowRange(0, 2));
|
||||
fastKpRange.copyTo(keyPointsPyr_[level].rowRange(0, 2), stream);
|
||||
|
||||
const int n_features = static_cast<int>(n_features_per_level_[level]);
|
||||
|
||||
if (scoreType_ == ORB::HARRIS_SCORE)
|
||||
{
|
||||
// Keep more points than necessary as FAST does not give amazing corners
|
||||
cull(keyPointsPyr_[level], keyPointsCount_[level], 2 * n_features);
|
||||
cull(keyPointsPyr_[level], keyPointsCount_[level], 2 * n_features, stream);
|
||||
|
||||
// Compute the Harris cornerness (better scoring than FAST)
|
||||
HarrisResponses_gpu(imagePyr_[level], keyPointsPyr_[level].ptr<short2>(0), keyPointsPyr_[level].ptr<float>(1), keyPointsCount_[level], 7, HARRIS_K, 0);
|
||||
HarrisResponses_gpu(imagePyr_[level], keyPointsPyr_[level].ptr<short2>(0), keyPointsPyr_[level].ptr<float>(1), keyPointsCount_[level], 7, HARRIS_K, StreamAccessor::getStream(stream));
|
||||
}
|
||||
|
||||
//cull to the final desired level, using the new Harris scores or the original FAST scores.
|
||||
cull(keyPointsPyr_[level], keyPointsCount_[level], n_features);
|
||||
cull(keyPointsPyr_[level], keyPointsCount_[level], n_features, stream);
|
||||
|
||||
// Compute orientation
|
||||
IC_Angle_gpu(imagePyr_[level], keyPointsPyr_[level].ptr<short2>(0), keyPointsPyr_[level].ptr<float>(2), keyPointsCount_[level], half_patch_size, 0);
|
||||
IC_Angle_gpu(imagePyr_[level], keyPointsPyr_[level].ptr<short2>(0), keyPointsPyr_[level].ptr<float>(2), keyPointsCount_[level], half_patch_size, StreamAccessor::getStream(stream));
|
||||
}
|
||||
}
|
||||
|
||||
void ORB_Impl::computeDescriptors(OutputArray _descriptors)
|
||||
void ORB_Impl::computeDescriptors(OutputArray _descriptors, Stream& stream)
|
||||
{
|
||||
using namespace cv::cuda::device::orb;
|
||||
|
||||
@ -750,17 +750,17 @@ namespace
|
||||
{
|
||||
// preprocess the resized image
|
||||
ensureSizeIsEnough(imagePyr_[level].size(), imagePyr_[level].type(), buf_);
|
||||
blurFilter_->apply(imagePyr_[level], buf_);
|
||||
blurFilter_->apply(imagePyr_[level], buf_, stream);
|
||||
}
|
||||
|
||||
computeOrbDescriptor_gpu(blurForDescriptor_ ? buf_ : imagePyr_[level], keyPointsPyr_[level].ptr<short2>(0), keyPointsPyr_[level].ptr<float>(2),
|
||||
keyPointsCount_[level], pattern_.ptr<int>(0), pattern_.ptr<int>(1), descRange, descriptorSize(), WTA_K_, 0);
|
||||
keyPointsCount_[level], pattern_.ptr<int>(0), pattern_.ptr<int>(1), descRange, descriptorSize(), WTA_K_, StreamAccessor::getStream(stream));
|
||||
|
||||
offset += keyPointsCount_[level];
|
||||
}
|
||||
}
|
||||
|
||||
void ORB_Impl::mergeKeyPoints(OutputArray _keypoints)
|
||||
void ORB_Impl::mergeKeyPoints(OutputArray _keypoints, Stream& stream)
|
||||
{
|
||||
using namespace cv::cuda::device::orb;
|
||||
|
||||
@ -791,10 +791,10 @@ namespace
|
||||
|
||||
float locScale = level != firstLevel_ ? sf : 1.0f;
|
||||
|
||||
mergeLocation_gpu(keyPointsPyr_[level].ptr<short2>(0), keyPointsRange.ptr<float>(0), keyPointsRange.ptr<float>(1), keyPointsCount_[level], locScale, 0);
|
||||
mergeLocation_gpu(keyPointsPyr_[level].ptr<short2>(0), keyPointsRange.ptr<float>(0), keyPointsRange.ptr<float>(1), keyPointsCount_[level], locScale, StreamAccessor::getStream(stream));
|
||||
|
||||
GpuMat range = keyPointsRange.rowRange(2, 4);
|
||||
keyPointsPyr_[level](Range(1, 3), Range(0, keyPointsCount_[level])).copyTo(range);
|
||||
keyPointsPyr_[level](Range(1, 3), Range(0, keyPointsCount_[level])).copyTo(range, stream);
|
||||
|
||||
keyPointsRange.row(4).setTo(Scalar::all(level));
|
||||
keyPointsRange.row(5).setTo(Scalar::all(patchSize_ * sf));
|
||||
|
Loading…
Reference in New Issue
Block a user