optimizations:
- new reduce implementation (with kepler optimizations) - saturate_cast via asm command - video SIMD instructions in element operations - float arithmetics instead of double - new deviceSupports function
This commit is contained in:
@@ -88,71 +88,71 @@ namespace cv { namespace gpu { namespace device
|
||||
{
|
||||
template <typename T> void matchL1_gpu(const PtrStepSzb& query, const PtrStepSzb& train, const PtrStepSzb& mask,
|
||||
const PtrStepSzi& trainIdx, const PtrStepSzf& distance,
|
||||
int cc, cudaStream_t stream);
|
||||
cudaStream_t stream);
|
||||
template <typename T> void matchL2_gpu(const PtrStepSzb& query, const PtrStepSzb& train, const PtrStepSzb& mask,
|
||||
const PtrStepSzi& trainIdx, const PtrStepSzf& distance,
|
||||
int cc, cudaStream_t stream);
|
||||
cudaStream_t stream);
|
||||
template <typename T> void matchHamming_gpu(const PtrStepSzb& query, const PtrStepSzb& train, const PtrStepSzb& mask,
|
||||
const PtrStepSzi& trainIdx, const PtrStepSzf& distance,
|
||||
int cc, cudaStream_t stream);
|
||||
cudaStream_t stream);
|
||||
|
||||
template <typename T> void matchL1_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks,
|
||||
const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance,
|
||||
int cc, cudaStream_t stream);
|
||||
cudaStream_t stream);
|
||||
template <typename T> void matchL2_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks,
|
||||
const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance,
|
||||
int cc, cudaStream_t stream);
|
||||
cudaStream_t stream);
|
||||
template <typename T> void matchHamming_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks,
|
||||
const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance,
|
||||
int cc, cudaStream_t stream);
|
||||
cudaStream_t stream);
|
||||
}
|
||||
|
||||
namespace bf_knnmatch
|
||||
{
|
||||
template <typename T> void matchL1_gpu(const PtrStepSzb& query, const PtrStepSzb& train, int k, const PtrStepSzb& mask,
|
||||
const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist,
|
||||
int cc, cudaStream_t stream);
|
||||
cudaStream_t stream);
|
||||
template <typename T> void matchL2_gpu(const PtrStepSzb& query, const PtrStepSzb& train, int k, const PtrStepSzb& mask,
|
||||
const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist,
|
||||
int cc, cudaStream_t stream);
|
||||
cudaStream_t stream);
|
||||
template <typename T> void matchHamming_gpu(const PtrStepSzb& query, const PtrStepSzb& train, int k, const PtrStepSzb& mask,
|
||||
const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist,
|
||||
int cc, cudaStream_t stream);
|
||||
cudaStream_t stream);
|
||||
|
||||
template <typename T> void match2L1_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks,
|
||||
const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance,
|
||||
int cc, cudaStream_t stream);
|
||||
cudaStream_t stream);
|
||||
template <typename T> void match2L2_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks,
|
||||
const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance,
|
||||
int cc, cudaStream_t stream);
|
||||
cudaStream_t stream);
|
||||
template <typename T> void match2Hamming_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks,
|
||||
const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance,
|
||||
int cc, cudaStream_t stream);
|
||||
cudaStream_t stream);
|
||||
}
|
||||
|
||||
namespace bf_radius_match
|
||||
{
|
||||
template <typename T> void matchL1_gpu(const PtrStepSzb& query, const PtrStepSzb& train, float maxDistance, const PtrStepSzb& mask,
|
||||
const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches,
|
||||
int cc, cudaStream_t stream);
|
||||
cudaStream_t stream);
|
||||
template <typename T> void matchL2_gpu(const PtrStepSzb& query, const PtrStepSzb& train, float maxDistance, const PtrStepSzb& mask,
|
||||
const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches,
|
||||
int cc, cudaStream_t stream);
|
||||
cudaStream_t stream);
|
||||
template <typename T> void matchHamming_gpu(const PtrStepSzb& query, const PtrStepSzb& train, float maxDistance, const PtrStepSzb& mask,
|
||||
const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches,
|
||||
int cc, cudaStream_t stream);
|
||||
cudaStream_t stream);
|
||||
|
||||
template <typename T> void matchL1_gpu(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks,
|
||||
const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches,
|
||||
int cc, cudaStream_t stream);
|
||||
cudaStream_t stream);
|
||||
|
||||
template <typename T> void matchL2_gpu(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks,
|
||||
const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches,
|
||||
int cc, cudaStream_t stream);
|
||||
cudaStream_t stream);
|
||||
|
||||
template <typename T> void matchHamming_gpu(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks,
|
||||
const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches,
|
||||
int cc, cudaStream_t stream);
|
||||
cudaStream_t stream);
|
||||
}
|
||||
}}}
|
||||
|
||||
@@ -198,11 +198,11 @@ void cv::gpu::BruteForceMatcher_GPU_base::matchSingle(const GpuMat& query, const
|
||||
if (query.empty() || train.empty())
|
||||
return;
|
||||
|
||||
using namespace ::cv::gpu::device::bf_match;
|
||||
using namespace cv::gpu::device::bf_match;
|
||||
|
||||
typedef void (*caller_t)(const PtrStepSzb& query, const PtrStepSzb& train, const PtrStepSzb& mask,
|
||||
const PtrStepSzi& trainIdx, const PtrStepSzf& distance,
|
||||
int cc, cudaStream_t stream);
|
||||
cudaStream_t stream);
|
||||
|
||||
static const caller_t callers[3][6] =
|
||||
{
|
||||
@@ -234,10 +234,7 @@ void cv::gpu::BruteForceMatcher_GPU_base::matchSingle(const GpuMat& query, const
|
||||
caller_t func = callers[distType][query.depth()];
|
||||
CV_Assert(func != 0);
|
||||
|
||||
DeviceInfo info;
|
||||
int cc = info.majorVersion() * 10 + info.minorVersion();
|
||||
|
||||
func(query, train, mask, trainIdx, distance, cc, StreamAccessor::getStream(stream));
|
||||
func(query, train, mask, trainIdx, distance, StreamAccessor::getStream(stream));
|
||||
}
|
||||
|
||||
void cv::gpu::BruteForceMatcher_GPU_base::matchDownload(const GpuMat& trainIdx, const GpuMat& distance, vector<DMatch>& matches)
|
||||
@@ -340,11 +337,11 @@ void cv::gpu::BruteForceMatcher_GPU_base::matchCollection(const GpuMat& query, c
|
||||
if (query.empty() || trainCollection.empty())
|
||||
return;
|
||||
|
||||
using namespace ::cv::gpu::device::bf_match;
|
||||
using namespace cv::gpu::device::bf_match;
|
||||
|
||||
typedef void (*caller_t)(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks,
|
||||
const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance,
|
||||
int cc, cudaStream_t stream);
|
||||
cudaStream_t stream);
|
||||
|
||||
static const caller_t callers[3][6] =
|
||||
{
|
||||
@@ -376,10 +373,7 @@ void cv::gpu::BruteForceMatcher_GPU_base::matchCollection(const GpuMat& query, c
|
||||
caller_t func = callers[distType][query.depth()];
|
||||
CV_Assert(func != 0);
|
||||
|
||||
DeviceInfo info;
|
||||
int cc = info.majorVersion() * 10 + info.minorVersion();
|
||||
|
||||
func(query, trainCollection, masks, trainIdx, imgIdx, distance, cc, StreamAccessor::getStream(stream));
|
||||
func(query, trainCollection, masks, trainIdx, imgIdx, distance, StreamAccessor::getStream(stream));
|
||||
}
|
||||
|
||||
void cv::gpu::BruteForceMatcher_GPU_base::matchDownload(const GpuMat& trainIdx, const GpuMat& imgIdx, const GpuMat& distance, vector<DMatch>& matches)
|
||||
@@ -451,11 +445,11 @@ void cv::gpu::BruteForceMatcher_GPU_base::knnMatchSingle(const GpuMat& query, co
|
||||
if (query.empty() || train.empty())
|
||||
return;
|
||||
|
||||
using namespace ::cv::gpu::device::bf_knnmatch;
|
||||
using namespace cv::gpu::device::bf_knnmatch;
|
||||
|
||||
typedef void (*caller_t)(const PtrStepSzb& query, const PtrStepSzb& train, int k, const PtrStepSzb& mask,
|
||||
const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist,
|
||||
int cc, cudaStream_t stream);
|
||||
cudaStream_t stream);
|
||||
|
||||
static const caller_t callers[3][6] =
|
||||
{
|
||||
@@ -502,10 +496,7 @@ void cv::gpu::BruteForceMatcher_GPU_base::knnMatchSingle(const GpuMat& query, co
|
||||
caller_t func = callers[distType][query.depth()];
|
||||
CV_Assert(func != 0);
|
||||
|
||||
DeviceInfo info;
|
||||
int cc = info.majorVersion() * 10 + info.minorVersion();
|
||||
|
||||
func(query, train, k, mask, trainIdx, distance, allDist, cc, StreamAccessor::getStream(stream));
|
||||
func(query, train, k, mask, trainIdx, distance, allDist, StreamAccessor::getStream(stream));
|
||||
}
|
||||
|
||||
void cv::gpu::BruteForceMatcher_GPU_base::knnMatchDownload(const GpuMat& trainIdx, const GpuMat& distance,
|
||||
@@ -580,11 +571,11 @@ void cv::gpu::BruteForceMatcher_GPU_base::knnMatch2Collection(const GpuMat& quer
|
||||
if (query.empty() || trainCollection.empty())
|
||||
return;
|
||||
|
||||
using namespace ::cv::gpu::device::bf_knnmatch;
|
||||
using namespace cv::gpu::device::bf_knnmatch;
|
||||
|
||||
typedef void (*caller_t)(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks,
|
||||
const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance,
|
||||
int cc, cudaStream_t stream);
|
||||
cudaStream_t stream);
|
||||
|
||||
static const caller_t callers[3][6] =
|
||||
{
|
||||
@@ -621,10 +612,7 @@ void cv::gpu::BruteForceMatcher_GPU_base::knnMatch2Collection(const GpuMat& quer
|
||||
caller_t func = callers[distType][query.depth()];
|
||||
CV_Assert(func != 0);
|
||||
|
||||
DeviceInfo info;
|
||||
int cc = info.majorVersion() * 10 + info.minorVersion();
|
||||
|
||||
func(query, trainCollection, maskCollection, trainIdx, imgIdx, distance, cc, StreamAccessor::getStream(stream));
|
||||
func(query, trainCollection, maskCollection, trainIdx, imgIdx, distance, StreamAccessor::getStream(stream));
|
||||
}
|
||||
|
||||
void cv::gpu::BruteForceMatcher_GPU_base::knnMatch2Download(const GpuMat& trainIdx, const GpuMat& imgIdx, const GpuMat& distance,
|
||||
@@ -765,7 +753,7 @@ void cv::gpu::BruteForceMatcher_GPU_base::radiusMatchSingle(const GpuMat& query,
|
||||
|
||||
typedef void (*caller_t)(const PtrStepSzb& query, const PtrStepSzb& train, float maxDistance, const PtrStepSzb& mask,
|
||||
const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches,
|
||||
int cc, cudaStream_t stream);
|
||||
cudaStream_t stream);
|
||||
|
||||
static const caller_t callers[3][6] =
|
||||
{
|
||||
@@ -786,12 +774,6 @@ void cv::gpu::BruteForceMatcher_GPU_base::radiusMatchSingle(const GpuMat& query,
|
||||
}
|
||||
};
|
||||
|
||||
DeviceInfo info;
|
||||
int cc = info.majorVersion() * 10 + info.minorVersion();
|
||||
|
||||
if (!TargetArchs::builtWith(GLOBAL_ATOMICS) || !DeviceInfo().supports(GLOBAL_ATOMICS))
|
||||
CV_Error(CV_StsNotImplemented, "The device doesn't support global atomics");
|
||||
|
||||
const int nQuery = query.rows;
|
||||
const int nTrain = train.rows;
|
||||
|
||||
@@ -814,7 +796,7 @@ void cv::gpu::BruteForceMatcher_GPU_base::radiusMatchSingle(const GpuMat& query,
|
||||
caller_t func = callers[distType][query.depth()];
|
||||
CV_Assert(func != 0);
|
||||
|
||||
func(query, train, maxDistance, mask, trainIdx, distance, nMatches, cc, StreamAccessor::getStream(stream));
|
||||
func(query, train, maxDistance, mask, trainIdx, distance, nMatches, StreamAccessor::getStream(stream));
|
||||
}
|
||||
|
||||
void cv::gpu::BruteForceMatcher_GPU_base::radiusMatchDownload(const GpuMat& trainIdx, const GpuMat& distance, const GpuMat& nMatches,
|
||||
@@ -897,7 +879,7 @@ void cv::gpu::BruteForceMatcher_GPU_base::radiusMatchCollection(const GpuMat& qu
|
||||
|
||||
typedef void (*caller_t)(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks,
|
||||
const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches,
|
||||
int cc, cudaStream_t stream);
|
||||
cudaStream_t stream);
|
||||
|
||||
static const caller_t callers[3][6] =
|
||||
{
|
||||
@@ -918,12 +900,6 @@ void cv::gpu::BruteForceMatcher_GPU_base::radiusMatchCollection(const GpuMat& qu
|
||||
}
|
||||
};
|
||||
|
||||
DeviceInfo info;
|
||||
int cc = info.majorVersion() * 10 + info.minorVersion();
|
||||
|
||||
if (!TargetArchs::builtWith(GLOBAL_ATOMICS) || !DeviceInfo().supports(GLOBAL_ATOMICS))
|
||||
CV_Error(CV_StsNotImplemented, "The device doesn't support global atomics");
|
||||
|
||||
const int nQuery = query.rows;
|
||||
|
||||
CV_Assert(query.channels() == 1 && query.depth() < CV_64F);
|
||||
@@ -949,7 +925,7 @@ void cv::gpu::BruteForceMatcher_GPU_base::radiusMatchCollection(const GpuMat& qu
|
||||
vector<PtrStepSzb> masks_(masks.begin(), masks.end());
|
||||
|
||||
func(query, &trains_[0], static_cast<int>(trains_.size()), maxDistance, masks_.size() == 0 ? 0 : &masks_[0],
|
||||
trainIdx, imgIdx, distance, nMatches, cc, StreamAccessor::getStream(stream));
|
||||
trainIdx, imgIdx, distance, nMatches, StreamAccessor::getStream(stream));
|
||||
}
|
||||
|
||||
void cv::gpu::BruteForceMatcher_GPU_base::radiusMatchDownload(const GpuMat& trainIdx, const GpuMat& imgIdx, const GpuMat& distance, const GpuMat& nMatches,
|
||||
|
||||
Reference in New Issue
Block a user