diff --git a/modules/gpu/include/opencv2/gpu/gpu.hpp b/modules/gpu/include/opencv2/gpu/gpu.hpp index 1b09f4c77..04a8385f0 100644 --- a/modules/gpu/include/opencv2/gpu/gpu.hpp +++ b/modules/gpu/include/opencv2/gpu/gpu.hpp @@ -1288,16 +1288,16 @@ namespace cv const std::vector& masks = std::vector(), bool compactResult = false ); // Find best matches for each query descriptor which have distance less than maxDistance. - // nMatches.at(0, queruIdx) will contain matches count for queryIdx. + // nMatches.at(0, queryIdx) will contain matches count for queryIdx. // carefully nMatches can be greater than trainIdx.cols - it means that matcher didn't find all matches, // because it didn't have enough memory. - // trainIdx.at(queruIdx, i) will contain ith train index (i < min(nMatches.at(0, queruIdx), trainIdx.cols)) - // distance.at(queruIdx, i) will contain ith distance (i < min(nMatches.at(0, queruIdx), trainIdx.cols)) - // If trainIdx is empty, then trainIdx and distance will be created with size nQuery x nTrain, + // trainIdx.at(queruIdx, i) will contain ith train index (i < min(nMatches.at(0, queruIdx), trainIdx.cols)) + // distance.at(queruIdx, i) will contain ith distance (i < min(nMatches.at(0, queruIdx), trainIdx.cols)) + // If trainIdx is empty, then trainIdx and distance will be created with size nQuery x (nTrain / 2), // otherwize user can pass own allocated trainIdx and distance with size nQuery x nMaxMatches // Matches doesn't sorted. - void radiusMatch(const GpuMat& queryDescs, const GpuMat& trainDescs, - GpuMat& trainIdx, GpuMat& nMatches, GpuMat& distance, float maxDistance, + void radiusMatchSingle(const GpuMat& queryDescs, const GpuMat& trainDescs, + GpuMat& trainIdx, GpuMat& distance, GpuMat& nMatches, float maxDistance, const GpuMat& mask = GpuMat(), Stream& stream = Stream::Null()); // Download trainIdx, nMatches and distance and convert it to vector with DMatch. @@ -1305,10 +1305,10 @@ namespace cv // compactResult is used when mask is not empty. If compactResult is false matches // vector will have the same size as queryDescriptors rows. If compactResult is true // matches vector will not contain matches for fully masked out query descriptors. - static void radiusMatchDownload(const GpuMat& trainIdx, const GpuMat& nMatches, const GpuMat& distance, + static void radiusMatchDownload(const GpuMat& trainIdx, const GpuMat& distance, const GpuMat& nMatches, std::vector< std::vector >& matches, bool compactResult = false); // Convert trainIdx, nMatches and distance to vector with DMatch. - static void radiusMatchConvert(const Mat& trainIdx, const Mat& nMatches, const Mat& distance, + static void radiusMatchConvert(const Mat& trainIdx, const Mat& distance, const Mat& nMatches, std::vector< std::vector >& matches, bool compactResult = false); // Find best matches for each query descriptor which have distance less than maxDistance @@ -1317,6 +1317,23 @@ namespace cv std::vector< std::vector >& matches, float maxDistance, const GpuMat& mask = GpuMat(), bool compactResult = false); + // Find best matches for each query descriptor which have distance less than maxDistance. + // Matches doesn't sorted. + void radiusMatchCollection(const GpuMat& queryDescs, const GpuMat& trainCollection, + GpuMat& trainIdx, GpuMat& imgIdx, GpuMat& distance, GpuMat& nMatches, float maxDistance, + const GpuMat& maskCollection, Stream& stream = Stream::Null()); + + // Download trainIdx, imgIdx, nMatches and distance and convert it to vector with DMatch. + // matches will be sorted in increasing order of distances. + // compactResult is used when mask is not empty. If compactResult is false matches + // vector will have the same size as queryDescriptors rows. If compactResult is true + // matches vector will not contain matches for fully masked out query descriptors. + static void radiusMatchDownload(const GpuMat& trainIdx, const GpuMat& imgIdx, const GpuMat& distance, const GpuMat& nMatches, + std::vector< std::vector >& matches, bool compactResult = false); + // Convert trainIdx, nMatches and distance to vector with DMatch. + static void radiusMatchConvert(const Mat& trainIdx, const Mat& imgIdx, const Mat& distance, const Mat& nMatches, + std::vector< std::vector >& matches, bool compactResult = false); + // Find best matches from train collection for each query descriptor which have distance less than // maxDistance (in increasing order of distances). void radiusMatch(const GpuMat& queryDescs, std::vector< std::vector >& matches, float maxDistance, diff --git a/modules/gpu/perf/perf_features2d.cpp b/modules/gpu/perf/perf_features2d.cpp index d2505374d..c29f3ebe2 100644 --- a/modules/gpu/perf/perf_features2d.cpp +++ b/modules/gpu/perf/perf_features2d.cpp @@ -89,7 +89,7 @@ PERF_TEST_P(DevInfo_DescSize, BruteForceMatcher_radiusMatch, testing::Combine(te SIMPLE_TEST_CYCLE() { - matcher.radiusMatch(query, train, trainIdx, nMatches, distance, 2.0); + matcher.radiusMatchSingle(query, train, trainIdx, distance, nMatches, 2.0); } Mat trainIdx_host(trainIdx); diff --git a/modules/gpu/src/brute_force_matcher.cpp b/modules/gpu/src/brute_force_matcher.cpp index be081e9fc..24d20d63b 100644 --- a/modules/gpu/src/brute_force_matcher.cpp +++ b/modules/gpu/src/brute_force_matcher.cpp @@ -68,10 +68,13 @@ void cv::gpu::BruteForceMatcher_GPU_base::knnMatchDownload(const GpuMat&, const void cv::gpu::BruteForceMatcher_GPU_base::knnMatchConvert(const Mat&, const Mat&, std::vector< std::vector >&, bool) { throw_nogpu(); } void cv::gpu::BruteForceMatcher_GPU_base::knnMatch(const GpuMat&, const GpuMat&, std::vector< std::vector >&, int, const GpuMat&, bool) { throw_nogpu(); } void cv::gpu::BruteForceMatcher_GPU_base::knnMatch(const GpuMat&, std::vector< std::vector >&, int, const std::vector&, bool) { throw_nogpu(); } -void cv::gpu::BruteForceMatcher_GPU_base::radiusMatch(const GpuMat&, const GpuMat&, GpuMat&, GpuMat&, GpuMat&, float, const GpuMat&, Stream&) { throw_nogpu(); } +void cv::gpu::BruteForceMatcher_GPU_base::radiusMatchSingle(const GpuMat&, const GpuMat&, GpuMat&, GpuMat&, GpuMat&, float, const GpuMat&, Stream&) { throw_nogpu(); } void cv::gpu::BruteForceMatcher_GPU_base::radiusMatchDownload(const GpuMat&, const GpuMat&, const GpuMat&, std::vector< std::vector >&, bool) { throw_nogpu(); } void cv::gpu::BruteForceMatcher_GPU_base::radiusMatchConvert(const Mat&, const Mat&, const Mat&, std::vector< std::vector >&, bool) { throw_nogpu(); } void cv::gpu::BruteForceMatcher_GPU_base::radiusMatch(const GpuMat&, const GpuMat&, std::vector< std::vector >&, float, const GpuMat&, bool) { throw_nogpu(); } +void cv::gpu::BruteForceMatcher_GPU_base::radiusMatchCollection(const GpuMat&, const GpuMat&, GpuMat&, GpuMat&, GpuMat&, GpuMat&, float, const GpuMat&, Stream&) { throw_nogpu(); } +void cv::gpu::BruteForceMatcher_GPU_base::radiusMatchDownload(const GpuMat&, const GpuMat&, const GpuMat&, const GpuMat&, vector< vector >&, bool) { throw_nogpu(); } +void cv::gpu::BruteForceMatcher_GPU_base::radiusMatchConvert(const Mat&, const Mat&, const Mat&, const Mat&, vector< vector >&, bool) { throw_nogpu(); } void cv::gpu::BruteForceMatcher_GPU_base::radiusMatch(const GpuMat&, std::vector< std::vector >&, float, const std::vector&, bool) { throw_nogpu(); } #else /* !defined (HAVE_CUDA) */ @@ -114,27 +117,27 @@ namespace cv { namespace gpu { namespace bf_knnmatch namespace cv { namespace gpu { namespace bf_radius_match { - template void radiusMatchL1_gpu(const DevMem2D& query, const DevMem2D& train, float maxDistance, const DevMem2D& mask, - const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance, + template void radiusMatchSingleL1_gpu(const DevMem2D& query, const DevMem2D& train, float maxDistance, const DevMem2D& mask, + const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& nMatches, cudaStream_t stream); - template void radiusMatchL2_gpu(const DevMem2D& query, const DevMem2D& train, float maxDistance, const DevMem2D& mask, - const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance, + template void radiusMatchSingleL2_gpu(const DevMem2D& query, const DevMem2D& train, float maxDistance, const DevMem2D& mask, + const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& nMatches, cudaStream_t stream); - template void radiusMatchHamming_gpu(const DevMem2D& query, const DevMem2D& train, float maxDistance, const DevMem2D& mask, - const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance, + template void radiusMatchSingleHamming_gpu(const DevMem2D& query, const DevMem2D& train, float maxDistance, const DevMem2D& mask, + const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& nMatches, + cudaStream_t stream); + + template void radiusMatchCollectionL1_gpu(const DevMem2D& query, const DevMem2D& trainCollection, float maxDistance, const DevMem2D_& maskCollection, + const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, const DevMem2D& nMatches, + cudaStream_t stream); + template void radiusMatchCollectionL2_gpu(const DevMem2D& query, const DevMem2D& trainCollection, float maxDistance, const DevMem2D_& maskCollection, + const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, const DevMem2D& nMatches, + cudaStream_t stream); + template void radiusMatchCollectionHamming_gpu(const DevMem2D& query, const DevMem2D& trainCollection, float maxDistance, const DevMem2D_& maskCollection, + const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, const DevMem2D& nMatches, cudaStream_t stream); }}} -namespace -{ - struct ImgIdxSetter - { - explicit inline ImgIdxSetter(int imgIdx_) : imgIdx(imgIdx_) {} - inline void operator()(DMatch& m) const {m.imgIdx = imgIdx;} - int imgIdx; - }; -} - cv::gpu::BruteForceMatcher_GPU_base::BruteForceMatcher_GPU_base(DistType distType_) : distType(distType_) { } @@ -551,6 +554,16 @@ void cv::gpu::BruteForceMatcher_GPU_base::knnMatch(const GpuMat& queryDescs, con knnMatchDownload(trainIdx, distance, matches, compactResult); } +namespace +{ + struct ImgIdxSetter + { + explicit inline ImgIdxSetter(int imgIdx_) : imgIdx(imgIdx_) {} + inline void operator()(DMatch& m) const {m.imgIdx = imgIdx;} + int imgIdx; + }; +} + void cv::gpu::BruteForceMatcher_GPU_base::knnMatch(const GpuMat& queryDescs, vector< vector >& matches, int knn, const vector& masks, bool compactResult) { @@ -596,8 +609,8 @@ void cv::gpu::BruteForceMatcher_GPU_base::knnMatch(const GpuMat& queryDescs, //////////////////////////////////////////////////////////////////// // RadiusMatch -void cv::gpu::BruteForceMatcher_GPU_base::radiusMatch(const GpuMat& queryDescs, const GpuMat& trainDescs, - GpuMat& trainIdx, GpuMat& nMatches, GpuMat& distance, float maxDistance, const GpuMat& mask, Stream& stream) +void cv::gpu::BruteForceMatcher_GPU_base::radiusMatchSingle(const GpuMat& queryDescs, const GpuMat& trainDescs, + GpuMat& trainIdx, GpuMat& distance, GpuMat& nMatches, float maxDistance, const GpuMat& mask, Stream& stream) { if (queryDescs.empty() || trainDescs.empty()) return; @@ -605,26 +618,26 @@ void cv::gpu::BruteForceMatcher_GPU_base::radiusMatch(const GpuMat& queryDescs, using namespace cv::gpu::bf_radius_match; typedef void (*radiusMatch_caller_t)(const DevMem2D& query, const DevMem2D& train, float maxDistance, const DevMem2D& mask, - const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance, + const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& nMatches, cudaStream_t stream); static const radiusMatch_caller_t radiusMatch_callers[3][8] = { { - radiusMatchL1_gpu, 0/*radiusMatchL1_gpu*/, radiusMatchL1_gpu, - radiusMatchL1_gpu, radiusMatchL1_gpu, radiusMatchL1_gpu, 0, 0 + radiusMatchSingleL1_gpu, 0/*radiusMatchSingleL1_gpu*/, radiusMatchSingleL1_gpu, + radiusMatchSingleL1_gpu, radiusMatchSingleL1_gpu, radiusMatchSingleL1_gpu, 0, 0 }, { - 0/*radiusMatchL2_gpu*/, 0/*radiusMatchL2_gpu*/, 0/*radiusMatchL2_gpu*/, - 0/*radiusMatchL2_gpu*/, 0/*radiusMatchL2_gpu*/, radiusMatchL2_gpu, 0, 0 + 0/*radiusMatchSingleL2_gpu*/, 0/*radiusMatchSingleL2_gpu*/, 0/*radiusMatchSingleL2_gpu*/, + 0/*radiusMatchSingleL2_gpu*/, 0/*radiusMatchSingleL2_gpu*/, radiusMatchSingleL2_gpu, 0, 0 }, { - radiusMatchHamming_gpu, 0/*radiusMatchHamming_gpu*/, radiusMatchHamming_gpu, - 0/*radiusMatchHamming_gpu*/, radiusMatchHamming_gpu, 0, 0, 0 + radiusMatchSingleHamming_gpu, 0/*radiusMatchSingleHamming_gpu*/, radiusMatchSingleHamming_gpu, + 0/*radiusMatchSingleHamming_gpu*/, radiusMatchSingleHamming_gpu, 0, 0, 0 } }; - CV_Assert(TargetArchs::builtWith(SHARED_ATOMICS) && DeviceInfo().supports(GLOBAL_ATOMICS)); + CV_Assert(TargetArchs::builtWith(SHARED_ATOMICS) && DeviceInfo().supports(SHARED_ATOMICS)); const int nQuery = queryDescs.rows; const int nTrain = trainDescs.rows; @@ -636,38 +649,33 @@ void cv::gpu::BruteForceMatcher_GPU_base::radiusMatch(const GpuMat& queryDescs, ensureSizeIsEnough(1, nQuery, CV_32SC1, nMatches); if (trainIdx.empty()) { - ensureSizeIsEnough(nQuery, nTrain, CV_32SC1, trainIdx); - ensureSizeIsEnough(nQuery, nTrain, CV_32FC1, distance); + ensureSizeIsEnough(nQuery, nTrain / 2, CV_32SC1, trainIdx); + ensureSizeIsEnough(nQuery, nTrain / 2, CV_32FC1, distance); } - if (stream) - stream.enqueueMemSet(nMatches, Scalar::all(0)); - else - nMatches.setTo(Scalar::all(0)); - radiusMatch_caller_t func = radiusMatch_callers[distType][queryDescs.depth()]; CV_Assert(func != 0); - func(queryDescs, trainDescs, maxDistance, mask, trainIdx, nMatches, distance, StreamAccessor::getStream(stream)); + func(queryDescs, trainDescs, maxDistance, mask, trainIdx, distance, nMatches, StreamAccessor::getStream(stream)); } -void cv::gpu::BruteForceMatcher_GPU_base::radiusMatchDownload(const GpuMat& trainIdx, const GpuMat& nMatches, - const GpuMat& distance, std::vector< std::vector >& matches, bool compactResult) +void cv::gpu::BruteForceMatcher_GPU_base::radiusMatchDownload(const GpuMat& trainIdx, const GpuMat& distance, const GpuMat& nMatches, + vector< vector >& matches, bool compactResult) { - if (trainIdx.empty() || nMatches.empty() || distance.empty()) + if (trainIdx.empty() || distance.empty() || nMatches.empty()) return; Mat trainIdxCPU = trainIdx; - Mat nMatchesCPU = nMatches; Mat distanceCPU = distance; + Mat nMatchesCPU = nMatches; - radiusMatchConvert(trainIdxCPU, nMatchesCPU, distanceCPU, matches, compactResult); + radiusMatchConvert(trainIdxCPU, distanceCPU, nMatchesCPU, matches, compactResult); } -void cv::gpu::BruteForceMatcher_GPU_base::radiusMatchConvert(const Mat& trainIdx, const Mat& nMatches, const Mat& distance, - std::vector< std::vector >& matches, bool compactResult) +void cv::gpu::BruteForceMatcher_GPU_base::radiusMatchConvert(const Mat& trainIdx, const Mat& distance, const Mat& nMatches, + vector< vector >& matches, bool compactResult) { - if (trainIdx.empty() || nMatches.empty() || distance.empty()) + if (trainIdx.empty() || distance.empty() || nMatches.empty()) return; CV_Assert(trainIdx.type() == CV_32SC1); @@ -679,13 +687,135 @@ void cv::gpu::BruteForceMatcher_GPU_base::radiusMatchConvert(const Mat& trainIdx matches.clear(); matches.reserve(nQuery); - const unsigned int* nMatches_ptr = nMatches.ptr(); + const int* nMatches_ptr = nMatches.ptr(); for (int queryIdx = 0; queryIdx < nQuery; ++queryIdx) { const int* trainIdx_ptr = trainIdx.ptr(queryIdx); const float* distance_ptr = distance.ptr(queryIdx); - const int nMatches = std::min(static_cast(nMatches_ptr[queryIdx]), trainIdx.cols); + const int nMatches = std::min(nMatches_ptr[queryIdx], trainIdx.cols); + + if (nMatches == 0) + { + if (!compactResult) + matches.push_back(vector()); + continue; + } + + matches.push_back(vector(nMatches)); + vector& curMatches = matches.back(); + + for (int i = 0; i < nMatches; ++i, ++trainIdx_ptr, ++distance_ptr) + { + int trainIdx = *trainIdx_ptr; + + float distance = *distance_ptr; + + DMatch m(queryIdx, trainIdx, 0, distance); + + curMatches[i] = m; + } + + sort(curMatches.begin(), curMatches.end()); + } +} + +void cv::gpu::BruteForceMatcher_GPU_base::radiusMatch(const GpuMat& queryDescs, const GpuMat& trainDescs, + vector< vector >& matches, float maxDistance, const GpuMat& mask, bool compactResult) +{ + GpuMat trainIdx, distance, nMatches; + radiusMatchSingle(queryDescs, trainDescs, trainIdx, distance, nMatches, maxDistance, mask); + radiusMatchDownload(trainIdx, distance, nMatches, matches, compactResult); +} + +void cv::gpu::BruteForceMatcher_GPU_base::radiusMatchCollection(const GpuMat& queryDescs, const GpuMat& trainCollection, + GpuMat& trainIdx, GpuMat& imgIdx, GpuMat& distance, GpuMat& nMatches, float maxDistance, + const GpuMat& maskCollection, Stream& stream) +{ + if (queryDescs.empty() || trainCollection.empty()) + return; + + using namespace cv::gpu::bf_radius_match; + + typedef void (*radiusMatch_caller_t)(const DevMem2D& query, const DevMem2D& trainCollection, float maxDistance, const DevMem2D_& maskCollection, + const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, const DevMem2D& nMatches, + cudaStream_t stream); + + static const radiusMatch_caller_t radiusMatch_callers[3][8] = + { + { + radiusMatchCollectionL1_gpu, 0/*radiusMatchCollectionL1_gpu*/, radiusMatchCollectionL1_gpu, + radiusMatchCollectionL1_gpu, radiusMatchCollectionL1_gpu, radiusMatchCollectionL1_gpu, 0, 0 + }, + { + 0/*radiusMatchCollectionL2_gpu*/, 0/*radiusMatchCollectionL2_gpu*/, 0/*radiusMatchCollectionL2_gpu*/, + 0/*radiusMatchCollectionL2_gpu*/, 0/*radiusMatchCollectionL2_gpu*/, radiusMatchCollectionL2_gpu, 0, 0 + }, + { + radiusMatchCollectionHamming_gpu, 0/*radiusMatchCollectionHamming_gpu*/, radiusMatchCollectionHamming_gpu, + 0/*radiusMatchCollectionHamming_gpu*/, radiusMatchCollectionHamming_gpu, 0, 0, 0 + } + }; + + CV_Assert(TargetArchs::builtWith(SHARED_ATOMICS) && DeviceInfo().supports(SHARED_ATOMICS)); + + const int nQuery = queryDescs.rows; + + CV_Assert(queryDescs.channels() == 1 && queryDescs.depth() < CV_64F); + CV_Assert(trainIdx.empty() || (trainIdx.rows == nQuery && trainIdx.size() == distance.size() && trainIdx.size() == imgIdx.size())); + + ensureSizeIsEnough(1, nQuery, CV_32SC1, nMatches); + if (trainIdx.empty()) + { + ensureSizeIsEnough(nQuery, nQuery / 2, CV_32SC1, trainIdx); + ensureSizeIsEnough(nQuery, nQuery / 2, CV_32SC1, imgIdx); + ensureSizeIsEnough(nQuery, nQuery / 2, CV_32FC1, distance); + } + + radiusMatch_caller_t func = radiusMatch_callers[distType][queryDescs.depth()]; + CV_Assert(func != 0); + + func(queryDescs, trainCollection, maxDistance, maskCollection, trainIdx, imgIdx, distance, nMatches, StreamAccessor::getStream(stream)); +} + +void cv::gpu::BruteForceMatcher_GPU_base::radiusMatchDownload(const GpuMat& trainIdx, const GpuMat& imgIdx, const GpuMat& distance, const GpuMat& nMatches, + vector< vector >& matches, bool compactResult) +{ + if (trainIdx.empty() || imgIdx.empty() || distance.empty() || nMatches.empty()) + return; + + Mat trainIdxCPU = trainIdx; + Mat imgIdxCPU = imgIdx; + Mat distanceCPU = distance; + Mat nMatchesCPU = nMatches; + + radiusMatchConvert(trainIdxCPU, imgIdxCPU, distanceCPU, nMatchesCPU, matches, compactResult); +} + +void cv::gpu::BruteForceMatcher_GPU_base::radiusMatchConvert(const Mat& trainIdx, const Mat& imgIdx, const Mat& distance, const Mat& nMatches, + vector< vector >& matches, bool compactResult) +{ + if (trainIdx.empty() || imgIdx.empty() || distance.empty() || nMatches.empty()) + return; + + CV_Assert(trainIdx.type() == CV_32SC1); + CV_Assert(imgIdx.type() == CV_32SC1 && imgIdx.size() == trainIdx.size()); + CV_Assert(distance.type() == CV_32FC1 && distance.size() == trainIdx.size()); + CV_Assert(nMatches.type() == CV_32SC1 && nMatches.isContinuous() && nMatches.cols >= trainIdx.rows); + + const int nQuery = trainIdx.rows; + + matches.clear(); + matches.reserve(nQuery); + + const int* nMatches_ptr = nMatches.ptr(); + for (int queryIdx = 0; queryIdx < nQuery; ++queryIdx) + { + const int* trainIdx_ptr = trainIdx.ptr(queryIdx); + const int* imgIdx_ptr = imgIdx.ptr(queryIdx); + const float* distance_ptr = distance.ptr(queryIdx); + + const int nMatches = std::min(nMatches_ptr[queryIdx], trainIdx.cols); if (nMatches == 0) { @@ -698,63 +828,34 @@ void cv::gpu::BruteForceMatcher_GPU_base::radiusMatchConvert(const Mat& trainIdx vector& curMatches = matches.back(); curMatches.reserve(nMatches); - for (int i = 0; i < nMatches; ++i, ++trainIdx_ptr, ++distance_ptr) + for (int i = 0; i < nMatches; ++i, ++trainIdx_ptr, ++imgIdx_ptr, ++distance_ptr) { int trainIdx = *trainIdx_ptr; - + int imgIdx = *imgIdx_ptr; float distance = *distance_ptr; - DMatch m(queryIdx, trainIdx, 0, distance); + DMatch m(queryIdx, trainIdx, imgIdx, distance); curMatches.push_back(m); } + sort(curMatches.begin(), curMatches.end()); } } -void cv::gpu::BruteForceMatcher_GPU_base::radiusMatch(const GpuMat& queryDescs, const GpuMat& trainDescs, - vector< vector >& matches, float maxDistance, const GpuMat& mask, bool compactResult) -{ - GpuMat trainIdx, nMatches, distance; - radiusMatch(queryDescs, trainDescs, trainIdx, nMatches, distance, maxDistance, mask); - radiusMatchDownload(trainIdx, nMatches, distance, matches, compactResult); -} - void cv::gpu::BruteForceMatcher_GPU_base::radiusMatch(const GpuMat& queryDescs, vector< vector >& matches, float maxDistance, const vector& masks, bool compactResult) { - if (queryDescs.empty() || empty()) - return; + GpuMat trainCollection; + GpuMat maskCollection; - matches.resize(queryDescs.rows); + makeGpuCollection(trainCollection, maskCollection, masks); - vector< vector > curMatches; + GpuMat trainIdx, imgIdx, distance, nMatches; - for (size_t imgIdx = 0; imgIdx < trainDescCollection.size(); ++imgIdx) - { - radiusMatch(queryDescs, trainDescCollection[imgIdx], curMatches, maxDistance, - masks.empty() ? GpuMat() : masks[imgIdx]); + radiusMatchCollection(queryDescs, trainCollection, trainIdx, imgIdx, distance, nMatches, maxDistance, maskCollection); - for (int queryIdx = 0; queryIdx < queryDescs.rows; ++queryIdx) - { - vector& localMatch = curMatches[queryIdx]; - vector& globalMatch = matches[queryIdx]; - - for_each(localMatch.begin(), localMatch.end(), ImgIdxSetter(static_cast(imgIdx))); - - const size_t oldSize = globalMatch.size(); - - copy(localMatch.begin(), localMatch.end(), back_inserter(globalMatch)); - inplace_merge(globalMatch.begin(), globalMatch.begin() + oldSize, globalMatch.end()); - } - } - - if (compactResult) - { - vector< vector >::iterator new_end = remove_if(matches.begin(), matches.end(), - mem_fun_ref(&vector::empty)); - matches.erase(new_end, matches.end()); - } + radiusMatchDownload(trainIdx, imgIdx, distance, nMatches, matches, compactResult); } #endif /* !defined (HAVE_CUDA) */ diff --git a/modules/gpu/src/cuda/bf_radius_match.cu b/modules/gpu/src/cuda/bf_radius_match.cu index bd6645117..1c1dace75 100644 --- a/modules/gpu/src/cuda/bf_radius_match.cu +++ b/modules/gpu/src/cuda/bf_radius_match.cu @@ -49,94 +49,210 @@ using namespace cv::gpu::device; namespace cv { namespace gpu { namespace bf_radius_match { - __device__ __forceinline__ void store(const int* sidx, const float* sdist, const unsigned int scount, int* trainIdx, float* distance, int& sglob_ind, const int tid) + template struct SingleTrain { - if (tid < scount) + enum {USE_IMG_IDX = 0}; + + explicit SingleTrain(const DevMem2D_& train_) : train(train_) { - trainIdx[sglob_ind + tid] = sidx[tid]; - distance[sglob_ind + tid] = sdist[tid]; } - if (tid == 0) - sglob_ind += scount; - } + static __device__ __forceinline__ void store(const int* s_trainIdx, const int* s_imgIdx, const float* s_dist, unsigned int& s_count, int& s_globInd, + int* trainIdx, int* imgIdx, float* distance, int maxCount) + { + const int tid = threadIdx.y * blockDim.x + threadIdx.x; - template - __global__ void radiusMatch(const PtrStep_ query, const DevMem2D_ train, const float maxDistance, const Mask mask, - DevMem2Di trainIdx_, PtrStepf distance, unsigned int* nMatches) + if (tid < s_count && s_globInd + tid < maxCount) + { + trainIdx[s_globInd + tid] = s_trainIdx[tid]; + distance[s_globInd + tid] = s_dist[tid]; + } + + if (tid == 0) + { + s_globInd += s_count; + s_count = 0; + } + } + + template + __device__ __forceinline__ void loop(float maxDistance, Mask& mask, const VecDiff& vecDiff, + int* s_trainIdx, int* s_imgIdx, float* s_dist, unsigned int& s_count, int& s_globInd, + int* trainIdxRow, int* imgIdxRow, float* distanceRow, int maxCount, + typename Dist::result_type* s_diffRow) const + { + #if __CUDA_ARCH__ >= 120 + + for (int i = 0; i < train.rows; i += blockDim.y) + { + int trainIdx = i + threadIdx.y; + + if (trainIdx < train.rows && mask(blockIdx.x, trainIdx)) + { + Dist dist; + + vecDiff.calc(train.ptr(trainIdx), train.cols, dist, s_diffRow, threadIdx.x); + + const typename Dist::result_type val = dist; + + if (threadIdx.x == 0 && val < maxDistance) + { + unsigned int ind = atomicInc(&s_count, (unsigned int) -1); + s_trainIdx[ind] = trainIdx; + s_dist[ind] = val; + } + } + + __syncthreads(); + + if (s_count >= BLOCK_STACK - blockDim.y) + store(s_trainIdx, s_imgIdx, s_dist, s_count, s_globInd, trainIdxRow, imgIdxRow, distanceRow, maxCount); + + __syncthreads(); + } + + store(s_trainIdx, s_imgIdx, s_dist, s_count, s_globInd, trainIdxRow, imgIdxRow, distanceRow, maxCount); + + #endif + } + + __device__ __forceinline__ int descLen() const + { + return train.cols; + } + + const DevMem2D_ train; + }; + + template struct TrainCollection { - #if __CUDA_ARCH__ >= 120 + enum {USE_IMG_IDX = 1}; + TrainCollection(const DevMem2D_* trainCollection_, int nImg_, int desclen_) : + trainCollection(trainCollection_), nImg(nImg_), desclen(desclen_) + { + } + + static __device__ __forceinline__ void store(const int* s_trainIdx, const int* s_imgIdx, const float* s_dist, unsigned int& s_count, int& s_globInd, + int* trainIdx, int* imgIdx, float* distance, int maxCount) + { + const int tid = threadIdx.y * blockDim.x + threadIdx.x; + + if (tid < s_count && s_globInd + tid < maxCount) + { + trainIdx[s_globInd + tid] = s_trainIdx[tid]; + imgIdx[s_globInd + tid] = s_imgIdx[tid]; + distance[s_globInd + tid] = s_dist[tid]; + } + + if (tid == 0) + { + s_globInd += s_count; + s_count = 0; + } + } + + template + __device__ void loop(float maxDistance, Mask& mask, const VecDiff& vecDiff, + int* s_trainIdx, int* s_imgIdx, float* s_dist, unsigned int& s_count, int& s_globInd, + int* trainIdxRow, int* imgIdxRow, float* distanceRow, int maxCount, + typename Dist::result_type* s_diffRow) const + { + #if __CUDA_ARCH__ >= 120 + + for (int imgIdx = 0; imgIdx < nImg; ++imgIdx) + { + const DevMem2D_ train = trainCollection[imgIdx]; + + mask.next(); + + for (int i = 0; i < train.rows; i += blockDim.y) + { + int trainIdx = i + threadIdx.y; + + if (trainIdx < train.rows && mask(blockIdx.x, trainIdx)) + { + Dist dist; + + vecDiff.calc(train.ptr(trainIdx), desclen, dist, s_diffRow, threadIdx.x); + + const typename Dist::result_type val = dist; + + if (threadIdx.x == 0 && val < maxDistance) + { + unsigned int ind = atomicInc(&s_count, (unsigned int) -1); + s_trainIdx[ind] = trainIdx; + s_imgIdx[ind] = imgIdx; + s_dist[ind] = val; + } + } + + __syncthreads(); + + if (s_count >= BLOCK_STACK - blockDim.y) + store(s_trainIdx, s_imgIdx, s_dist, s_count, s_globInd, trainIdxRow, imgIdxRow, distanceRow, maxCount); + + __syncthreads(); + } + } + + store(s_trainIdx, s_imgIdx, s_dist, s_count, s_globInd, trainIdxRow, imgIdxRow, distanceRow, maxCount); + + #endif + } + + __device__ __forceinline__ int descLen() const + { + return desclen; + } + + const DevMem2D_* trainCollection; + const int nImg; + const int desclen; + }; + + template + __global__ void radiusMatch(const PtrStep_ query, const Train train, float maxDistance, const Mask mask, + PtrStepi trainIdx, PtrStepi imgIdx, PtrStepf distance, int* nMatches, int maxCount) + { typedef typename Dist::result_type result_type; typedef typename Dist::value_type value_type; - __shared__ result_type smem[BLOCK_DIM_X * BLOCK_DIM_Y]; - __shared__ int sidx[BLOCK_STACK]; - __shared__ float sdist[BLOCK_STACK]; - __shared__ unsigned int scount; - __shared__ int sglob_ind; + __shared__ result_type s_mem[BLOCK_DIM_X * BLOCK_DIM_Y]; - const int queryIdx = blockIdx.x; - const int tid = threadIdx.y * BLOCK_DIM_X + threadIdx.x; + __shared__ int s_trainIdx[BLOCK_STACK]; + __shared__ int s_imgIdx[Train::USE_IMG_IDX ? BLOCK_STACK : 1]; + __shared__ float s_dist[BLOCK_STACK]; + __shared__ unsigned int s_count; - if (tid == 0) + __shared__ int s_globInd; + + if (threadIdx.x == 0 && threadIdx.y == 0) { - scount = 0; - sglob_ind = 0; + s_count = 0; + s_globInd = 0; } __syncthreads(); - int* trainIdx_row = trainIdx_.ptr(queryIdx); - float* distance_row = distance.ptr(queryIdx); + const VecDiff vecDiff(query.ptr(blockIdx.x), train.descLen(), (typename Dist::value_type*)s_mem, threadIdx.y * BLOCK_DIM_X + threadIdx.x, threadIdx.x); - const VecDiff vecDiff(query.ptr(queryIdx), train.cols, (typename Dist::value_type*)smem, tid, threadIdx.x); - - typename Dist::result_type* sdiffRow = smem + BLOCK_DIM_X * threadIdx.y; + Mask m = mask; - for (int trainIdx = threadIdx.y; trainIdx < train.rows; trainIdx += BLOCK_DIM_Y) - { - if (mask(queryIdx, trainIdx)) - { - Dist dist; + train.template loop(maxDistance, m, vecDiff, + s_trainIdx, s_imgIdx, s_dist, s_count, s_globInd, + trainIdx.ptr(blockIdx.x), imgIdx.ptr(blockIdx.x), distance.ptr(blockIdx.x), maxCount, + s_mem + BLOCK_DIM_X * threadIdx.y); - const T* trainRow = train.ptr(trainIdx); - - vecDiff.calc(trainRow, train.cols, dist, sdiffRow, threadIdx.x); - - const typename Dist::result_type val = dist; - - if (threadIdx.x == 0 && val < maxDistance) - { - unsigned int i = atomicInc(&scount, (unsigned int) -1); - sidx[i] = trainIdx; - sdist[i] = val; - } - } - __syncthreads(); - - if (scount > BLOCK_STACK - BLOCK_DIM_Y) - { - store(sidx, sdist, scount, trainIdx_row, distance_row, sglob_ind, tid); - if (tid == 0) - scount = 0; - } - __syncthreads(); - } - - store(sidx, sdist, scount, trainIdx_row, distance_row, sglob_ind, tid); - - if (tid == 0) - nMatches[queryIdx] = sglob_ind; - - #endif + if (threadIdx.x == 0 && threadIdx.y == 0) + nMatches[blockIdx.x] = s_globInd; } /////////////////////////////////////////////////////////////////////////////// // Radius Match kernel caller - template - void radiusMatchSimple_caller(const DevMem2D_& query, const DevMem2D_& train, float maxDistance, const Mask& mask, - const DevMem2Di& trainIdx, const DevMem2Df& distance, unsigned int* nMatches, + template + void radiusMatchSimple_caller(const DevMem2D_& query, const Train& train, float maxDistance, const Mask& mask, + const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, int* nMatches, cudaStream_t stream) { StaticAssert= BLOCK_DIM_Y>::check(); @@ -146,16 +262,16 @@ namespace cv { namespace gpu { namespace bf_radius_match const dim3 threads(BLOCK_DIM_X, BLOCK_DIM_Y, 1); radiusMatch, Dist, T> - <<>>(query, train, maxDistance, mask, trainIdx, distance, nMatches); + <<>>(query, train, maxDistance, mask, trainIdx, imgIdx, distance, nMatches, trainIdx.cols); cudaSafeCall( cudaGetLastError() ); if (stream == 0) cudaSafeCall( cudaDeviceSynchronize() ); } - template - void radiusMatchCached_caller(const DevMem2D_& query, const DevMem2D_& train, float maxDistance, const Mask& mask, - const DevMem2Di& trainIdx, const DevMem2Df& distance, unsigned int* nMatches, + template + void radiusMatchCached_caller(const DevMem2D_& query, const Train& train, float maxDistance, const Mask& mask, + const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, int* nMatches, cudaStream_t stream) { StaticAssert= BLOCK_DIM_Y>::check(); @@ -167,7 +283,7 @@ namespace cv { namespace gpu { namespace bf_radius_match const dim3 threads(BLOCK_DIM_X, BLOCK_DIM_Y, 1); radiusMatch, Dist, T> - <<>>(query, train, maxDistance, mask, trainIdx, distance, nMatches); + <<>>(query, train, maxDistance, mask, trainIdx, imgIdx, distance, nMatches, trainIdx.cols); cudaSafeCall( cudaGetLastError() ); if (stream == 0) @@ -177,58 +293,58 @@ namespace cv { namespace gpu { namespace bf_radius_match /////////////////////////////////////////////////////////////////////////////// // Radius Match Dispatcher - template - void radiusMatchDispatcher(const DevMem2D_& query, const DevMem2D_& train, float maxDistance, const Mask& mask, - const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& nMatches, + template + void radiusMatchDispatcher(const DevMem2D_& query, const Train& train, float maxDistance, const Mask& mask, + const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, const DevMem2D& nMatches, cudaStream_t stream) { if (query.cols < 64) { radiusMatchCached_caller<16, 16, 64, 64, false, Dist>( query, train, maxDistance, mask, - static_cast(trainIdx), static_cast(distance), (unsigned int*)nMatches.data, + static_cast(trainIdx), static_cast(imgIdx), static_cast(distance), (int*)nMatches.data, stream); } else if (query.cols == 64) { radiusMatchCached_caller<16, 16, 64, 64, true, Dist>( query, train, maxDistance, mask, - static_cast(trainIdx), static_cast(distance), (unsigned int*)nMatches.data, + static_cast(trainIdx), static_cast(imgIdx), static_cast(distance), (int*)nMatches.data, stream); } else if (query.cols < 128) { radiusMatchCached_caller<16, 16, 64, 128, false, Dist>( query, train, maxDistance, mask, - static_cast(trainIdx), static_cast(distance), (unsigned int*)nMatches.data, + static_cast(trainIdx), static_cast(imgIdx), static_cast(distance), (int*)nMatches.data, stream); } else if (query.cols == 128) { radiusMatchCached_caller<16, 16, 64, 128, true, Dist>( query, train, maxDistance, mask, - static_cast(trainIdx), static_cast(distance), (unsigned int*)nMatches.data, + static_cast(trainIdx), static_cast(imgIdx), static_cast(distance), (int*)nMatches.data, stream); } else if (query.cols < 256) { radiusMatchCached_caller<16, 16, 64, 256, false, Dist>( query, train, maxDistance, mask, - static_cast(trainIdx), static_cast(distance), (unsigned int*)nMatches.data, + static_cast(trainIdx), static_cast(imgIdx), static_cast(distance), (int*)nMatches.data, stream); } else if (query.cols == 256) { radiusMatchCached_caller<16, 16, 64, 256, true, Dist>( query, train, maxDistance, mask, - static_cast(trainIdx), static_cast(distance), (unsigned int*)nMatches.data, + static_cast(trainIdx), static_cast(imgIdx), static_cast(distance), (int*)nMatches.data, stream); } else { radiusMatchSimple_caller<16, 16, 64, Dist>( query, train, maxDistance, mask, - static_cast(trainIdx), static_cast(distance), (unsigned int*)nMatches.data, + static_cast(trainIdx), static_cast(imgIdx), static_cast(distance), (int*)nMatches.data, stream); } } @@ -236,77 +352,163 @@ namespace cv { namespace gpu { namespace bf_radius_match /////////////////////////////////////////////////////////////////////////////// // Radius Match caller - template void radiusMatchL1_gpu(const DevMem2D& query, const DevMem2D& train, float maxDistance, const DevMem2D& mask, - const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance, + template void radiusMatchSingleL1_gpu(const DevMem2D& query, const DevMem2D& train_, float maxDistance, const DevMem2D& mask, + const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& nMatches, cudaStream_t stream) { + SingleTrain train(static_cast< DevMem2D_ >(train_)); + if (mask.data) { - radiusMatchDispatcher< L1Dist >(static_cast< DevMem2D_ >(query), static_cast< DevMem2D_ >(train), maxDistance, SingleMask(mask), - trainIdx, distance, nMatches, + radiusMatchDispatcher< L1Dist >(static_cast< DevMem2D_ >(query), train, maxDistance, SingleMask(mask), + trainIdx, DevMem2D(), distance, nMatches, stream); } else { - radiusMatchDispatcher< L1Dist >(static_cast< DevMem2D_ >(query), static_cast< DevMem2D_ >(train), maxDistance, WithOutMask(), - trainIdx, distance, nMatches, + radiusMatchDispatcher< L1Dist >(static_cast< DevMem2D_ >(query), train, maxDistance, WithOutMask(), + trainIdx, DevMem2D(), distance, nMatches, stream); } } - template void radiusMatchL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance, cudaStream_t stream); - //template void radiusMatchL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance, cudaStream_t stream); - template void radiusMatchL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance, cudaStream_t stream); - template void radiusMatchL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance, cudaStream_t stream); - template void radiusMatchL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance, cudaStream_t stream); - template void radiusMatchL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance, cudaStream_t stream); + template void radiusMatchSingleL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& nMatches, cudaStream_t stream); + //template void radiusMatchSingleL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& nMatches, cudaStream_t stream); + template void radiusMatchSingleL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& nMatches, cudaStream_t stream); + template void radiusMatchSingleL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& nMatches, cudaStream_t stream); + template void radiusMatchSingleL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& nMatches, cudaStream_t stream); + template void radiusMatchSingleL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& nMatches, cudaStream_t stream); - template void radiusMatchL2_gpu(const DevMem2D& query, const DevMem2D& train, float maxDistance, const DevMem2D& mask, - const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance, + template void radiusMatchSingleL2_gpu(const DevMem2D& query, const DevMem2D& train_, float maxDistance, const DevMem2D& mask, + const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& nMatches, cudaStream_t stream) { + SingleTrain train(static_cast< DevMem2D_ >(train_)); + if (mask.data) { - radiusMatchDispatcher(static_cast< DevMem2D_ >(query), static_cast< DevMem2D_ >(train), maxDistance, SingleMask(mask), - trainIdx, distance, nMatches, + radiusMatchDispatcher(static_cast< DevMem2D_ >(query), train, maxDistance, SingleMask(mask), + trainIdx, DevMem2D(), distance, nMatches, stream); } else { - radiusMatchDispatcher(static_cast< DevMem2D_ >(query), static_cast< DevMem2D_ >(train), maxDistance, WithOutMask(), - trainIdx, distance, nMatches, + radiusMatchDispatcher(static_cast< DevMem2D_ >(query), train, maxDistance, WithOutMask(), + trainIdx, DevMem2D(), distance, nMatches, stream); } } - //template void radiusMatchL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance, cudaStream_t stream); - //template void radiusMatchL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance, cudaStream_t stream); - //template void radiusMatchL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance, cudaStream_t stream); - //template void radiusMatchL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance, cudaStream_t stream); - //template void radiusMatchL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance, cudaStream_t stream); - template void radiusMatchL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance, cudaStream_t stream); + //template void radiusMatchSingleL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& nMatches, cudaStream_t stream); + //template void radiusMatchSingleL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& nMatches, cudaStream_t stream); + //template void radiusMatchSingleL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& nMatches, cudaStream_t stream); + //template void radiusMatchSingleL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& nMatches, cudaStream_t stream); + //template void radiusMatchSingleL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& nMatches, cudaStream_t stream); + template void radiusMatchSingleL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& nMatches, cudaStream_t stream); - template void radiusMatchHamming_gpu(const DevMem2D& query, const DevMem2D& train, float maxDistance, const DevMem2D& mask, - const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance, + template void radiusMatchSingleHamming_gpu(const DevMem2D& query, const DevMem2D& train_, float maxDistance, const DevMem2D& mask, + const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& nMatches, cudaStream_t stream) { + SingleTrain train(static_cast< DevMem2D_ >(train_)); + if (mask.data) { - radiusMatchDispatcher(static_cast< DevMem2D_ >(query), static_cast< DevMem2D_ >(train), maxDistance, SingleMask(mask), - trainIdx, distance, nMatches, + radiusMatchDispatcher(static_cast< DevMem2D_ >(query), train, maxDistance, SingleMask(mask), + trainIdx, DevMem2D(), distance, nMatches, stream); } else { - radiusMatchDispatcher(static_cast< DevMem2D_ >(query), static_cast< DevMem2D_ >(train), maxDistance, WithOutMask(), - trainIdx, distance, nMatches, + radiusMatchDispatcher(static_cast< DevMem2D_ >(query), train, maxDistance, WithOutMask(), + trainIdx, DevMem2D(), distance, nMatches, stream); } } - template void radiusMatchHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance, cudaStream_t stream); - //template void radiusMatchHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance, cudaStream_t stream); - template void radiusMatchHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance, cudaStream_t stream); - //template void radiusMatchHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance, cudaStream_t stream); - template void radiusMatchHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance, cudaStream_t stream); + template void radiusMatchSingleHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& nMatches, cudaStream_t stream); + //template void radiusMatchSingleHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& nMatches, cudaStream_t stream); + template void radiusMatchSingleHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& nMatches, cudaStream_t stream); + //template void radiusMatchSingleHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& nMatches, cudaStream_t stream); + template void radiusMatchSingleHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& nMatches, cudaStream_t stream); + + template void radiusMatchCollectionL1_gpu(const DevMem2D& query, const DevMem2D& trainCollection, float maxDistance, const DevMem2D_& maskCollection, + const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, const DevMem2D& nMatches, + cudaStream_t stream) + { + TrainCollection train((DevMem2D_*)trainCollection.ptr(), trainCollection.cols, query.cols); + + if (maskCollection.data) + { + radiusMatchDispatcher< L1Dist >(static_cast< DevMem2D_ >(query), train, maxDistance, MaskCollection(maskCollection.data), + trainIdx, imgIdx, distance, nMatches, + stream); + } + else + { + radiusMatchDispatcher< L1Dist >(static_cast< DevMem2D_ >(query), train, maxDistance, WithOutMask(), + trainIdx, imgIdx, distance, nMatches, + stream); + } + } + + template void radiusMatchCollectionL1_gpu(const DevMem2D& query, const DevMem2D& trainCollection, float maxDistance, const DevMem2D_& maskCollection, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, const DevMem2D& nMatches, cudaStream_t stream); + //template void radiusMatchCollectionL1_gpu(const DevMem2D& query, const DevMem2D& trainCollection, float maxDistance, const DevMem2D_& maskCollection, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, const DevMem2D& nMatches, cudaStream_t stream); + template void radiusMatchCollectionL1_gpu(const DevMem2D& query, const DevMem2D& trainCollection, float maxDistance, const DevMem2D_& maskCollection, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, const DevMem2D& nMatches, cudaStream_t stream); + template void radiusMatchCollectionL1_gpu(const DevMem2D& query, const DevMem2D& trainCollection, float maxDistance, const DevMem2D_& maskCollection, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, const DevMem2D& nMatches, cudaStream_t stream); + template void radiusMatchCollectionL1_gpu(const DevMem2D& query, const DevMem2D& trainCollection, float maxDistance, const DevMem2D_& maskCollection, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, const DevMem2D& nMatches, cudaStream_t stream); + template void radiusMatchCollectionL1_gpu(const DevMem2D& query, const DevMem2D& trainCollection, float maxDistance, const DevMem2D_& maskCollection, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, const DevMem2D& nMatches, cudaStream_t stream); + + template void radiusMatchCollectionL2_gpu(const DevMem2D& query, const DevMem2D& trainCollection, float maxDistance, const DevMem2D_& maskCollection, + const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, const DevMem2D& nMatches, + cudaStream_t stream) + { + TrainCollection train((DevMem2D_*)trainCollection.ptr(), trainCollection.cols, query.cols); + + if (maskCollection.data) + { + radiusMatchDispatcher(static_cast< DevMem2D_ >(query), train, maxDistance, MaskCollection(maskCollection.data), + trainIdx, imgIdx, distance, nMatches, + stream); + } + else + { + radiusMatchDispatcher(static_cast< DevMem2D_ >(query), train, maxDistance, WithOutMask(), + trainIdx, imgIdx, distance, nMatches, + stream); + } + } + + //template void radiusMatchCollectionL2_gpu(const DevMem2D& query, const DevMem2D& trainCollection, float maxDistance, const DevMem2D_& maskCollection, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, const DevMem2D& nMatches, cudaStream_t stream); + //template void radiusMatchCollectionL2_gpu(const DevMem2D& query, const DevMem2D& trainCollection, float maxDistance, const DevMem2D_& maskCollection, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, const DevMem2D& nMatches, cudaStream_t stream); + //template void radiusMatchCollectionL2_gpu(const DevMem2D& query, const DevMem2D& trainCollection, float maxDistance, const DevMem2D_& maskCollection, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, const DevMem2D& nMatches, cudaStream_t stream); + //template void radiusMatchCollectionL2_gpu(const DevMem2D& query, const DevMem2D& trainCollection, float maxDistance, const DevMem2D_& maskCollection, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, const DevMem2D& nMatches, cudaStream_t stream); + //template void radiusMatchCollectionL2_gpu(const DevMem2D& query, const DevMem2D& trainCollection, float maxDistance, const DevMem2D_& maskCollection, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, const DevMem2D& nMatches, cudaStream_t stream); + template void radiusMatchCollectionL2_gpu(const DevMem2D& query, const DevMem2D& trainCollection, float maxDistance, const DevMem2D_& maskCollection, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, const DevMem2D& nMatches, cudaStream_t stream); + + template void radiusMatchCollectionHamming_gpu(const DevMem2D& query, const DevMem2D& trainCollection, float maxDistance, const DevMem2D_& maskCollection, + const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, const DevMem2D& nMatches, + cudaStream_t stream) + { + TrainCollection train((DevMem2D_*)trainCollection.ptr(), trainCollection.cols, query.cols); + + if (maskCollection.data) + { + radiusMatchDispatcher(static_cast< DevMem2D_ >(query), train, maxDistance, MaskCollection(maskCollection.data), + trainIdx, imgIdx, distance, nMatches, + stream); + } + else + { + radiusMatchDispatcher(static_cast< DevMem2D_ >(query), train, maxDistance, WithOutMask(), + trainIdx, imgIdx, distance, nMatches, + stream); + } + } + + template void radiusMatchCollectionHamming_gpu(const DevMem2D& query, const DevMem2D& trainCollection, float maxDistance, const DevMem2D_& maskCollection, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, const DevMem2D& nMatches, cudaStream_t stream); + //template void radiusMatchCollectionHamming_gpu(const DevMem2D& query, const DevMem2D& trainCollection, float maxDistance, const DevMem2D_& maskCollection, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, const DevMem2D& nMatches, cudaStream_t stream); + template void radiusMatchCollectionHamming_gpu(const DevMem2D& query, const DevMem2D& trainCollection, float maxDistance, const DevMem2D_& maskCollection, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, const DevMem2D& nMatches, cudaStream_t stream); + //template void radiusMatchCollectionHamming_gpu(const DevMem2D& query, const DevMem2D& trainCollection, float maxDistance, const DevMem2D_& maskCollection, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, const DevMem2D& nMatches, cudaStream_t stream); + template void radiusMatchCollectionHamming_gpu(const DevMem2D& query, const DevMem2D& trainCollection, float maxDistance, const DevMem2D_& maskCollection, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, const DevMem2D& nMatches, cudaStream_t stream); }}}