optimizations:
- new reduce implementation (with Kepler optimizations) - saturate_cast via asm instruction - video SIMD instructions in element operations - float arithmetic instead of double - new deviceSupports function
This commit is contained in:
		| @@ -88,71 +88,71 @@ namespace cv { namespace gpu { namespace device | ||||
|     { | ||||
|         template <typename T> void matchL1_gpu(const PtrStepSzb& query, const PtrStepSzb& train, const PtrStepSzb& mask, | ||||
|             const PtrStepSzi& trainIdx, const PtrStepSzf& distance, | ||||
|             int cc, cudaStream_t stream); | ||||
|             cudaStream_t stream); | ||||
|         template <typename T> void matchL2_gpu(const PtrStepSzb& query, const PtrStepSzb& train, const PtrStepSzb& mask, | ||||
|             const PtrStepSzi& trainIdx, const PtrStepSzf& distance, | ||||
|             int cc, cudaStream_t stream); | ||||
|             cudaStream_t stream); | ||||
|         template <typename T> void matchHamming_gpu(const PtrStepSzb& query, const PtrStepSzb& train, const PtrStepSzb& mask, | ||||
|             const PtrStepSzi& trainIdx, const PtrStepSzf& distance, | ||||
|             int cc, cudaStream_t stream); | ||||
|             cudaStream_t stream); | ||||
|  | ||||
|         template <typename T> void matchL1_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, | ||||
|             const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, | ||||
|             int cc, cudaStream_t stream); | ||||
|             cudaStream_t stream); | ||||
|         template <typename T> void matchL2_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, | ||||
|             const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, | ||||
|             int cc, cudaStream_t stream); | ||||
|             cudaStream_t stream); | ||||
|         template <typename T> void matchHamming_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, | ||||
|             const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, | ||||
|             int cc, cudaStream_t stream); | ||||
|             cudaStream_t stream); | ||||
|     } | ||||
|  | ||||
|     namespace bf_knnmatch | ||||
|     { | ||||
|         template <typename T> void matchL1_gpu(const PtrStepSzb& query, const PtrStepSzb& train, int k, const PtrStepSzb& mask, | ||||
|             const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, | ||||
|             int cc, cudaStream_t stream); | ||||
|             cudaStream_t stream); | ||||
|         template <typename T> void matchL2_gpu(const PtrStepSzb& query, const PtrStepSzb& train, int k, const PtrStepSzb& mask, | ||||
|             const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, | ||||
|             int cc, cudaStream_t stream); | ||||
|             cudaStream_t stream); | ||||
|         template <typename T> void matchHamming_gpu(const PtrStepSzb& query, const PtrStepSzb& train, int k, const PtrStepSzb& mask, | ||||
|             const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, | ||||
|             int cc, cudaStream_t stream); | ||||
|             cudaStream_t stream); | ||||
|  | ||||
|         template <typename T> void match2L1_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, | ||||
|             const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, | ||||
|             int cc, cudaStream_t stream); | ||||
|             cudaStream_t stream); | ||||
|         template <typename T> void match2L2_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, | ||||
|             const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, | ||||
|             int cc, cudaStream_t stream); | ||||
|             cudaStream_t stream); | ||||
|         template <typename T> void match2Hamming_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, | ||||
|             const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, | ||||
|             int cc, cudaStream_t stream); | ||||
|             cudaStream_t stream); | ||||
|     } | ||||
|  | ||||
|     namespace bf_radius_match | ||||
|     { | ||||
|         template <typename T> void matchL1_gpu(const PtrStepSzb& query, const PtrStepSzb& train, float maxDistance, const PtrStepSzb& mask, | ||||
|             const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, | ||||
|             int cc, cudaStream_t stream); | ||||
|             cudaStream_t stream); | ||||
|         template <typename T> void matchL2_gpu(const PtrStepSzb& query, const PtrStepSzb& train, float maxDistance, const PtrStepSzb& mask, | ||||
|             const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, | ||||
|             int cc, cudaStream_t stream); | ||||
|             cudaStream_t stream); | ||||
|         template <typename T> void matchHamming_gpu(const PtrStepSzb& query, const PtrStepSzb& train, float maxDistance, const PtrStepSzb& mask, | ||||
|             const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, | ||||
|             int cc, cudaStream_t stream); | ||||
|             cudaStream_t stream); | ||||
|  | ||||
|         template <typename T> void matchL1_gpu(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, | ||||
|             const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, | ||||
|             int cc, cudaStream_t stream); | ||||
|             cudaStream_t stream); | ||||
|  | ||||
|         template <typename T> void matchL2_gpu(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, | ||||
|             const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, | ||||
|             int cc, cudaStream_t stream); | ||||
|             cudaStream_t stream); | ||||
|  | ||||
|         template <typename T> void matchHamming_gpu(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, | ||||
|             const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, | ||||
|             int cc, cudaStream_t stream); | ||||
|             cudaStream_t stream); | ||||
|     } | ||||
| }}} | ||||
|  | ||||
| @@ -198,11 +198,11 @@ void cv::gpu::BruteForceMatcher_GPU_base::matchSingle(const GpuMat& query, const | ||||
|     if (query.empty() || train.empty()) | ||||
|         return; | ||||
|  | ||||
|     using namespace ::cv::gpu::device::bf_match; | ||||
|     using namespace cv::gpu::device::bf_match; | ||||
|  | ||||
|     typedef void (*caller_t)(const PtrStepSzb& query, const PtrStepSzb& train, const PtrStepSzb& mask, | ||||
|                              const PtrStepSzi& trainIdx, const PtrStepSzf& distance, | ||||
|                              int cc, cudaStream_t stream); | ||||
|                              cudaStream_t stream); | ||||
|  | ||||
|     static const caller_t callers[3][6] = | ||||
|     { | ||||
| @@ -234,10 +234,7 @@ void cv::gpu::BruteForceMatcher_GPU_base::matchSingle(const GpuMat& query, const | ||||
|     caller_t func = callers[distType][query.depth()]; | ||||
|     CV_Assert(func != 0); | ||||
|  | ||||
|     DeviceInfo info; | ||||
|     int cc = info.majorVersion() * 10 + info.minorVersion(); | ||||
|  | ||||
|     func(query, train, mask, trainIdx, distance, cc, StreamAccessor::getStream(stream)); | ||||
|     func(query, train, mask, trainIdx, distance, StreamAccessor::getStream(stream)); | ||||
| } | ||||
|  | ||||
| void cv::gpu::BruteForceMatcher_GPU_base::matchDownload(const GpuMat& trainIdx, const GpuMat& distance, vector<DMatch>& matches) | ||||
| @@ -340,11 +337,11 @@ void cv::gpu::BruteForceMatcher_GPU_base::matchCollection(const GpuMat& query, c | ||||
|     if (query.empty() || trainCollection.empty()) | ||||
|         return; | ||||
|  | ||||
|     using namespace ::cv::gpu::device::bf_match; | ||||
|     using namespace cv::gpu::device::bf_match; | ||||
|  | ||||
|     typedef void (*caller_t)(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, | ||||
|                              const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, | ||||
|                              int cc, cudaStream_t stream); | ||||
|                              cudaStream_t stream); | ||||
|  | ||||
|     static const caller_t callers[3][6] = | ||||
|     { | ||||
| @@ -376,10 +373,7 @@ void cv::gpu::BruteForceMatcher_GPU_base::matchCollection(const GpuMat& query, c | ||||
|     caller_t func = callers[distType][query.depth()]; | ||||
|     CV_Assert(func != 0); | ||||
|  | ||||
|     DeviceInfo info; | ||||
|     int cc = info.majorVersion() * 10 + info.minorVersion(); | ||||
|  | ||||
|     func(query, trainCollection, masks, trainIdx, imgIdx, distance, cc, StreamAccessor::getStream(stream)); | ||||
|     func(query, trainCollection, masks, trainIdx, imgIdx, distance, StreamAccessor::getStream(stream)); | ||||
| } | ||||
|  | ||||
| void cv::gpu::BruteForceMatcher_GPU_base::matchDownload(const GpuMat& trainIdx, const GpuMat& imgIdx, const GpuMat& distance, vector<DMatch>& matches) | ||||
| @@ -451,11 +445,11 @@ void cv::gpu::BruteForceMatcher_GPU_base::knnMatchSingle(const GpuMat& query, co | ||||
|     if (query.empty() || train.empty()) | ||||
|         return; | ||||
|  | ||||
|     using namespace ::cv::gpu::device::bf_knnmatch; | ||||
|     using namespace cv::gpu::device::bf_knnmatch; | ||||
|  | ||||
|     typedef void (*caller_t)(const PtrStepSzb& query, const PtrStepSzb& train, int k, const PtrStepSzb& mask, | ||||
|                              const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, | ||||
|                              int cc, cudaStream_t stream); | ||||
|                              cudaStream_t stream); | ||||
|  | ||||
|     static const caller_t callers[3][6] = | ||||
|     { | ||||
| @@ -502,10 +496,7 @@ void cv::gpu::BruteForceMatcher_GPU_base::knnMatchSingle(const GpuMat& query, co | ||||
|     caller_t func = callers[distType][query.depth()]; | ||||
|     CV_Assert(func != 0); | ||||
|  | ||||
|     DeviceInfo info; | ||||
|     int cc = info.majorVersion() * 10 + info.minorVersion(); | ||||
|  | ||||
|     func(query, train, k, mask, trainIdx, distance, allDist, cc, StreamAccessor::getStream(stream)); | ||||
|     func(query, train, k, mask, trainIdx, distance, allDist, StreamAccessor::getStream(stream)); | ||||
| } | ||||
|  | ||||
| void cv::gpu::BruteForceMatcher_GPU_base::knnMatchDownload(const GpuMat& trainIdx, const GpuMat& distance, | ||||
| @@ -580,11 +571,11 @@ void cv::gpu::BruteForceMatcher_GPU_base::knnMatch2Collection(const GpuMat& quer | ||||
|     if (query.empty() || trainCollection.empty()) | ||||
|         return; | ||||
|  | ||||
|     using namespace ::cv::gpu::device::bf_knnmatch; | ||||
|     using namespace cv::gpu::device::bf_knnmatch; | ||||
|  | ||||
|     typedef void (*caller_t)(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, | ||||
|                              const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, | ||||
|                              int cc, cudaStream_t stream); | ||||
|                              cudaStream_t stream); | ||||
|  | ||||
|     static const caller_t callers[3][6] = | ||||
|     { | ||||
| @@ -621,10 +612,7 @@ void cv::gpu::BruteForceMatcher_GPU_base::knnMatch2Collection(const GpuMat& quer | ||||
|     caller_t func = callers[distType][query.depth()]; | ||||
|     CV_Assert(func != 0); | ||||
|  | ||||
|     DeviceInfo info; | ||||
|     int cc = info.majorVersion() * 10 + info.minorVersion(); | ||||
|  | ||||
|     func(query, trainCollection, maskCollection, trainIdx, imgIdx, distance, cc, StreamAccessor::getStream(stream)); | ||||
|     func(query, trainCollection, maskCollection, trainIdx, imgIdx, distance, StreamAccessor::getStream(stream)); | ||||
| } | ||||
|  | ||||
| void cv::gpu::BruteForceMatcher_GPU_base::knnMatch2Download(const GpuMat& trainIdx, const GpuMat& imgIdx, const GpuMat& distance, | ||||
| @@ -765,7 +753,7 @@ void cv::gpu::BruteForceMatcher_GPU_base::radiusMatchSingle(const GpuMat& query, | ||||
|  | ||||
|     typedef void (*caller_t)(const PtrStepSzb& query, const PtrStepSzb& train, float maxDistance, const PtrStepSzb& mask, | ||||
|                              const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, | ||||
|                              int cc, cudaStream_t stream); | ||||
|                              cudaStream_t stream); | ||||
|  | ||||
|     static const caller_t callers[3][6] = | ||||
|     { | ||||
| @@ -786,12 +774,6 @@ void cv::gpu::BruteForceMatcher_GPU_base::radiusMatchSingle(const GpuMat& query, | ||||
|         } | ||||
|     }; | ||||
|  | ||||
|     DeviceInfo info; | ||||
|     int cc = info.majorVersion() * 10 + info.minorVersion(); | ||||
|  | ||||
|     if (!TargetArchs::builtWith(GLOBAL_ATOMICS) || !DeviceInfo().supports(GLOBAL_ATOMICS)) | ||||
|         CV_Error(CV_StsNotImplemented, "The device doesn't support global atomics"); | ||||
|  | ||||
|     const int nQuery = query.rows; | ||||
|     const int nTrain = train.rows; | ||||
|  | ||||
| @@ -814,7 +796,7 @@ void cv::gpu::BruteForceMatcher_GPU_base::radiusMatchSingle(const GpuMat& query, | ||||
|     caller_t func = callers[distType][query.depth()]; | ||||
|     CV_Assert(func != 0); | ||||
|  | ||||
|     func(query, train, maxDistance, mask, trainIdx, distance, nMatches, cc, StreamAccessor::getStream(stream)); | ||||
|     func(query, train, maxDistance, mask, trainIdx, distance, nMatches, StreamAccessor::getStream(stream)); | ||||
| } | ||||
|  | ||||
| void cv::gpu::BruteForceMatcher_GPU_base::radiusMatchDownload(const GpuMat& trainIdx, const GpuMat& distance, const GpuMat& nMatches, | ||||
| @@ -897,7 +879,7 @@ void cv::gpu::BruteForceMatcher_GPU_base::radiusMatchCollection(const GpuMat& qu | ||||
|  | ||||
|     typedef void (*caller_t)(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, | ||||
|                              const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, | ||||
|                              int cc, cudaStream_t stream); | ||||
|                              cudaStream_t stream); | ||||
|  | ||||
|     static const caller_t callers[3][6] = | ||||
|     { | ||||
| @@ -918,12 +900,6 @@ void cv::gpu::BruteForceMatcher_GPU_base::radiusMatchCollection(const GpuMat& qu | ||||
|         } | ||||
|     }; | ||||
|  | ||||
|     DeviceInfo info; | ||||
|     int cc = info.majorVersion() * 10 + info.minorVersion(); | ||||
|  | ||||
|     if (!TargetArchs::builtWith(GLOBAL_ATOMICS) || !DeviceInfo().supports(GLOBAL_ATOMICS)) | ||||
|         CV_Error(CV_StsNotImplemented, "The device doesn't support global atomics"); | ||||
|  | ||||
|     const int nQuery = query.rows; | ||||
|  | ||||
|     CV_Assert(query.channels() == 1 && query.depth() < CV_64F); | ||||
| @@ -949,7 +925,7 @@ void cv::gpu::BruteForceMatcher_GPU_base::radiusMatchCollection(const GpuMat& qu | ||||
|     vector<PtrStepSzb> masks_(masks.begin(), masks.end()); | ||||
|  | ||||
|     func(query, &trains_[0], static_cast<int>(trains_.size()), maxDistance, masks_.size() == 0 ? 0 : &masks_[0], | ||||
|         trainIdx, imgIdx, distance, nMatches, cc, StreamAccessor::getStream(stream)); | ||||
|         trainIdx, imgIdx, distance, nMatches, StreamAccessor::getStream(stream)); | ||||
| } | ||||
|  | ||||
| void cv::gpu::BruteForceMatcher_GPU_base::radiusMatchDownload(const GpuMat& trainIdx, const GpuMat& imgIdx, const GpuMat& distance, const GpuMat& nMatches, | ||||
|   | ||||
		Reference in New Issue
	
	Block a user
	 Vladislav Vinogradov
					Vladislav Vinogradov