Thrust allocator usage.
This commit is contained in:
parent
7a934f9ee0
commit
a21ede9452
@ -51,7 +51,7 @@
|
||||
#include "opencv2/core/cuda/common.hpp"
|
||||
#include "opencv2/core/cuda/reduce.hpp"
|
||||
#include "opencv2/core/cuda/functional.hpp"
|
||||
|
||||
#include "opencv2/core/cuda/utility.hpp"
|
||||
namespace cv { namespace cuda { namespace device
|
||||
{
|
||||
namespace orb
|
||||
@ -64,6 +64,16 @@ namespace cv { namespace cuda { namespace device
|
||||
thrust::device_ptr<int> loc_ptr(loc);
|
||||
thrust::device_ptr<float> response_ptr(response);
|
||||
#if THRUST_VERSION >= 100800
|
||||
#if THRUST_VERSION >= 100802
|
||||
if (stream)
|
||||
{
|
||||
thrust::sort_by_key(thrust::cuda::par(ThrustAllocator::getAllocator()).on(stream), response_ptr, response_ptr + size, loc_ptr, thrust::greater<float>());
|
||||
}
|
||||
else
|
||||
{
|
||||
thrust::sort_by_key(thrust::cuda::par(ThrustAllocator::getAllocator()), response_ptr, response_ptr + size, loc_ptr, thrust::greater<float>());
|
||||
}
|
||||
#else
|
||||
if(stream)
|
||||
{
|
||||
thrust::sort_by_key(thrust::cuda::par.on(stream), response_ptr, response_ptr + size, loc_ptr, thrust::greater<float>());
|
||||
@ -71,6 +81,7 @@ namespace cv { namespace cuda { namespace device
|
||||
{
|
||||
thrust::sort_by_key(response_ptr, response_ptr + size, loc_ptr, thrust::greater<float>());
|
||||
}
|
||||
#endif
|
||||
#else
|
||||
thrust::sort_by_key(response_ptr, response_ptr + size, loc_ptr, thrust::greater<float>());
|
||||
#endif
|
||||
|
@ -47,7 +47,7 @@
|
||||
|
||||
#include "opencv2/core/cuda/common.hpp"
|
||||
#include "opencv2/core/cuda/utility.hpp"
|
||||
|
||||
#include <thrust/execution_policy.h>
|
||||
namespace cv { namespace cuda { namespace device
|
||||
{
|
||||
namespace gfft
|
||||
@ -91,12 +91,12 @@ namespace cv { namespace cuda { namespace device
|
||||
}
|
||||
}
|
||||
|
||||
int findCorners_gpu(PtrStepSzf eig, float threshold, PtrStepSzb mask, float2* corners, int max_count)
|
||||
int findCorners_gpu(PtrStepSzf eig, float threshold, PtrStepSzb mask, float2* corners, int max_count, cudaStream_t stream)
|
||||
{
|
||||
void* counter_ptr;
|
||||
cudaSafeCall( cudaGetSymbolAddress(&counter_ptr, g_counter) );
|
||||
|
||||
cudaSafeCall( cudaMemset(counter_ptr, 0, sizeof(int)) );
|
||||
cudaSafeCall( cudaMemsetAsync(counter_ptr, 0, sizeof(int), stream) );
|
||||
|
||||
bindTexture(&eigTex, eig);
|
||||
|
||||
@ -104,17 +104,18 @@ namespace cv { namespace cuda { namespace device
|
||||
dim3 grid(divUp(eig.cols, block.x), divUp(eig.rows, block.y));
|
||||
|
||||
if (mask.data)
|
||||
findCorners<<<grid, block>>>(threshold, SingleMask(mask), corners, max_count, eig.rows, eig.cols);
|
||||
findCorners<<<grid, block, 0, stream>>>(threshold, SingleMask(mask), corners, max_count, eig.rows, eig.cols);
|
||||
else
|
||||
findCorners<<<grid, block>>>(threshold, WithOutMask(), corners, max_count, eig.rows, eig.cols);
|
||||
findCorners<<<grid, block, 0, stream>>>(threshold, WithOutMask(), corners, max_count, eig.rows, eig.cols);
|
||||
|
||||
cudaSafeCall( cudaGetLastError() );
|
||||
|
||||
cudaSafeCall( cudaDeviceSynchronize() );
|
||||
|
||||
int count;
|
||||
cudaSafeCall( cudaMemcpy(&count, counter_ptr, sizeof(int), cudaMemcpyDeviceToHost) );
|
||||
|
||||
cudaSafeCall( cudaMemcpyAsync(&count, counter_ptr, sizeof(int), cudaMemcpyDeviceToHost, stream) );
|
||||
if (stream)
|
||||
cudaSafeCall(cudaStreamSynchronize(stream));
|
||||
else
|
||||
cudaSafeCall( cudaDeviceSynchronize() );
|
||||
return std::min(count, max_count);
|
||||
}
|
||||
|
||||
@ -128,13 +129,19 @@ namespace cv { namespace cuda { namespace device
|
||||
};
|
||||
|
||||
|
||||
void sortCorners_gpu(PtrStepSzf eig, float2* corners, int count)
|
||||
void sortCorners_gpu(PtrStepSzf eig, float2* corners, int count, cudaStream_t stream)
|
||||
{
|
||||
bindTexture(&eigTex, eig);
|
||||
|
||||
thrust::device_ptr<float2> ptr(corners);
|
||||
|
||||
#if THRUST_VERSION >= 100802
|
||||
if (stream)
|
||||
thrust::sort(thrust::cuda::par(ThrustAllocator::getAllocator()).on(stream), ptr, ptr + count, EigGreater());
|
||||
else
|
||||
thrust::sort(thrust::cuda::par(ThrustAllocator::getAllocator()), ptr, ptr + count, EigGreater());
|
||||
#else
|
||||
thrust::sort(ptr, ptr + count, EigGreater());
|
||||
#endif
|
||||
}
|
||||
} // namespace optical_flow
|
||||
}}}
|
||||
|
@ -55,8 +55,8 @@ namespace cv { namespace cuda { namespace device
|
||||
{
|
||||
namespace gfft
|
||||
{
|
||||
int findCorners_gpu(PtrStepSzf eig, float threshold, PtrStepSzb mask, float2* corners, int max_count);
|
||||
void sortCorners_gpu(PtrStepSzf eig, float2* corners, int count);
|
||||
int findCorners_gpu(PtrStepSzf eig, float threshold, PtrStepSzb mask, float2* corners, int max_count, cudaStream_t stream);
|
||||
void sortCorners_gpu(PtrStepSzf eig, float2* corners, int count, cudaStream_t stream);
|
||||
}
|
||||
}}}
|
||||
|
||||
@ -97,9 +97,6 @@ namespace
|
||||
|
||||
void GoodFeaturesToTrackDetector::detect(InputArray _image, OutputArray _corners, InputArray _mask, Stream& stream)
|
||||
{
|
||||
// TODO : implement async version
|
||||
(void) stream;
|
||||
|
||||
using namespace cv::cuda::device::gfft;
|
||||
|
||||
GpuMat image = _image.getGpuMat();
|
||||
@ -108,14 +105,14 @@ namespace
|
||||
CV_Assert( mask.empty() || (mask.type() == CV_8UC1 && mask.size() == image.size()) );
|
||||
|
||||
ensureSizeIsEnough(image.size(), CV_32FC1, eig_);
|
||||
cornerCriteria_->compute(image, eig_);
|
||||
cornerCriteria_->compute(image, eig_, stream);
|
||||
|
||||
double maxVal = 0;
|
||||
cuda::minMax(eig_, 0, &maxVal);
|
||||
|
||||
cudaStream_t stream_ = StreamAccessor::getStream(stream);
|
||||
ensureSizeIsEnough(1, std::max(1000, static_cast<int>(image.size().area() * 0.05)), CV_32FC2, tmpCorners_);
|
||||
|
||||
int total = findCorners_gpu(eig_, static_cast<float>(maxVal * qualityLevel_), mask, tmpCorners_.ptr<float2>(), tmpCorners_.cols);
|
||||
int total = findCorners_gpu(eig_, static_cast<float>(maxVal * qualityLevel_), mask, tmpCorners_.ptr<float2>(), tmpCorners_.cols, stream_);
|
||||
|
||||
if (total == 0)
|
||||
{
|
||||
@ -123,18 +120,18 @@ namespace
|
||||
return;
|
||||
}
|
||||
|
||||
sortCorners_gpu(eig_, tmpCorners_.ptr<float2>(), total);
|
||||
sortCorners_gpu(eig_, tmpCorners_.ptr<float2>(), total, stream_);
|
||||
|
||||
if (minDistance_ < 1)
|
||||
{
|
||||
tmpCorners_.colRange(0, maxCorners_ > 0 ? std::min(maxCorners_, total) : total).copyTo(_corners);
|
||||
tmpCorners_.colRange(0, maxCorners_ > 0 ? std::min(maxCorners_, total) : total).copyTo(_corners, stream);
|
||||
}
|
||||
else
|
||||
{
|
||||
std::vector<Point2f> tmp(total);
|
||||
Mat tmpMat(1, total, CV_32FC2, (void*)&tmp[0]);
|
||||
tmpCorners_.colRange(0, total).download(tmpMat);
|
||||
|
||||
tmpCorners_.colRange(0, total).download(tmpMat, stream);
|
||||
stream.waitForCompletion();
|
||||
std::vector<Point2f> tmp2;
|
||||
tmp2.reserve(total);
|
||||
|
||||
@ -203,7 +200,7 @@ namespace
|
||||
_corners.create(1, static_cast<int>(tmp2.size()), CV_32FC2);
|
||||
GpuMat corners = _corners.getGpuMat();
|
||||
|
||||
corners.upload(Mat(1, static_cast<int>(tmp2.size()), CV_32FC2, &tmp2[0]));
|
||||
corners.upload(Mat(1, static_cast<int>(tmp2.size()), CV_32FC2, &tmp2[0]), stream);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
Loading…
x
Reference in New Issue
Block a user