Fix BruteForceMatcher resource distribution
Add __launch_bounds__ attributes to all CUDA kernels
This commit is contained in:
parent
17608f7ade
commit
d22516872c
@ -374,6 +374,7 @@ namespace cv { namespace gpu { namespace device
|
|||||||
}
|
}
|
||||||
|
|
||||||
template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
|
template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
|
||||||
|
__launch_bounds__(BLOCK_SIZE * BLOCK_SIZE)
|
||||||
__global__ void matchUnrolledCached(const PtrStepSz<T> query, const PtrStepSz<T> train, const Mask mask, int2* bestTrainIdx, float2* bestDistance)
|
__global__ void matchUnrolledCached(const PtrStepSz<T> query, const PtrStepSz<T> train, const Mask mask, int2* bestTrainIdx, float2* bestDistance)
|
||||||
{
|
{
|
||||||
extern __shared__ int smem[];
|
extern __shared__ int smem[];
|
||||||
@ -424,6 +425,7 @@ namespace cv { namespace gpu { namespace device
|
|||||||
}
|
}
|
||||||
|
|
||||||
template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
|
template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
|
||||||
|
__launch_bounds__(BLOCK_SIZE * BLOCK_SIZE)
|
||||||
__global__ void matchUnrolledCached(const PtrStepSz<T> query, const PtrStepSz<T>* trains, int n, const Mask mask, int2* bestTrainIdx, int2* bestImgIdx, float2* bestDistance)
|
__global__ void matchUnrolledCached(const PtrStepSz<T> query, const PtrStepSz<T>* trains, int n, const Mask mask, int2* bestTrainIdx, int2* bestImgIdx, float2* bestDistance)
|
||||||
{
|
{
|
||||||
extern __shared__ int smem[];
|
extern __shared__ int smem[];
|
||||||
@ -553,6 +555,7 @@ namespace cv { namespace gpu { namespace device
|
|||||||
}
|
}
|
||||||
|
|
||||||
template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
|
template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
|
||||||
|
__launch_bounds__(BLOCK_SIZE * BLOCK_SIZE)
|
||||||
__global__ void matchUnrolled(const PtrStepSz<T> query, const PtrStepSz<T> train, const Mask mask, int2* bestTrainIdx, float2* bestDistance)
|
__global__ void matchUnrolled(const PtrStepSz<T> query, const PtrStepSz<T> train, const Mask mask, int2* bestTrainIdx, float2* bestDistance)
|
||||||
{
|
{
|
||||||
extern __shared__ int smem[];
|
extern __shared__ int smem[];
|
||||||
@ -601,6 +604,7 @@ namespace cv { namespace gpu { namespace device
|
|||||||
}
|
}
|
||||||
|
|
||||||
template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
|
template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
|
||||||
|
__launch_bounds__(BLOCK_SIZE * BLOCK_SIZE)
|
||||||
__global__ void matchUnrolled(const PtrStepSz<T> query, const PtrStepSz<T>* trains, int n, const Mask mask, int2* bestTrainIdx, int2* bestImgIdx, float2* bestDistance)
|
__global__ void matchUnrolled(const PtrStepSz<T> query, const PtrStepSz<T>* trains, int n, const Mask mask, int2* bestTrainIdx, int2* bestImgIdx, float2* bestDistance)
|
||||||
{
|
{
|
||||||
extern __shared__ int smem[];
|
extern __shared__ int smem[];
|
||||||
@ -727,6 +731,7 @@ namespace cv { namespace gpu { namespace device
|
|||||||
}
|
}
|
||||||
|
|
||||||
template <int BLOCK_SIZE, typename Dist, typename T, typename Mask>
|
template <int BLOCK_SIZE, typename Dist, typename T, typename Mask>
|
||||||
|
__launch_bounds__(BLOCK_SIZE * BLOCK_SIZE)
|
||||||
__global__ void match(const PtrStepSz<T> query, const PtrStepSz<T> train, const Mask mask, int2* bestTrainIdx, float2* bestDistance)
|
__global__ void match(const PtrStepSz<T> query, const PtrStepSz<T> train, const Mask mask, int2* bestTrainIdx, float2* bestDistance)
|
||||||
{
|
{
|
||||||
extern __shared__ int smem[];
|
extern __shared__ int smem[];
|
||||||
@ -775,6 +780,7 @@ namespace cv { namespace gpu { namespace device
|
|||||||
}
|
}
|
||||||
|
|
||||||
template <int BLOCK_SIZE, typename Dist, typename T, typename Mask>
|
template <int BLOCK_SIZE, typename Dist, typename T, typename Mask>
|
||||||
|
__launch_bounds__(BLOCK_SIZE * BLOCK_SIZE)
|
||||||
__global__ void match(const PtrStepSz<T> query, const PtrStepSz<T>* trains, int n, const Mask mask, int2* bestTrainIdx, int2* bestImgIdx, float2* bestDistance)
|
__global__ void match(const PtrStepSz<T> query, const PtrStepSz<T>* trains, int n, const Mask mask, int2* bestTrainIdx, int2* bestImgIdx, float2* bestDistance)
|
||||||
{
|
{
|
||||||
extern __shared__ int smem[];
|
extern __shared__ int smem[];
|
||||||
@ -902,6 +908,7 @@ namespace cv { namespace gpu { namespace device
|
|||||||
// Calc distance kernel
|
// Calc distance kernel
|
||||||
|
|
||||||
template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
|
template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
|
||||||
|
__launch_bounds__(BLOCK_SIZE * BLOCK_SIZE)
|
||||||
__global__ void calcDistanceUnrolled(const PtrStepSz<T> query, const PtrStepSz<T> train, const Mask mask, PtrStepf allDist)
|
__global__ void calcDistanceUnrolled(const PtrStepSz<T> query, const PtrStepSz<T> train, const Mask mask, PtrStepf allDist)
|
||||||
{
|
{
|
||||||
extern __shared__ int smem[];
|
extern __shared__ int smem[];
|
||||||
@ -966,6 +973,7 @@ namespace cv { namespace gpu { namespace device
|
|||||||
}
|
}
|
||||||
|
|
||||||
template <int BLOCK_SIZE, typename Dist, typename T, typename Mask>
|
template <int BLOCK_SIZE, typename Dist, typename T, typename Mask>
|
||||||
|
__launch_bounds__(BLOCK_SIZE * BLOCK_SIZE)
|
||||||
__global__ void calcDistance(const PtrStepSz<T> query, const PtrStepSz<T> train, const Mask mask, PtrStepf allDist)
|
__global__ void calcDistance(const PtrStepSz<T> query, const PtrStepSz<T> train, const Mask mask, PtrStepf allDist)
|
||||||
{
|
{
|
||||||
extern __shared__ int smem[];
|
extern __shared__ int smem[];
|
||||||
@ -1066,6 +1074,7 @@ namespace cv { namespace gpu { namespace device
|
|||||||
// find knn match kernel
|
// find knn match kernel
|
||||||
|
|
||||||
template <int BLOCK_SIZE>
|
template <int BLOCK_SIZE>
|
||||||
|
__launch_bounds__(BLOCK_SIZE)
|
||||||
__global__ void findBestMatch(PtrStepSzf allDist, int i, PtrStepi trainIdx, PtrStepf distance)
|
__global__ void findBestMatch(PtrStepSzf allDist, int i, PtrStepi trainIdx, PtrStepf distance)
|
||||||
{
|
{
|
||||||
const int SMEM_SIZE = BLOCK_SIZE > 64 ? BLOCK_SIZE : 64;
|
const int SMEM_SIZE = BLOCK_SIZE > 64 ? BLOCK_SIZE : 64;
|
||||||
|
@ -136,6 +136,7 @@ namespace cv { namespace gpu { namespace device
|
|||||||
}
|
}
|
||||||
|
|
||||||
template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
|
template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
|
||||||
|
__launch_bounds__(BLOCK_SIZE * BLOCK_SIZE)
|
||||||
__global__ void matchUnrolledCached(const PtrStepSz<T> query, const PtrStepSz<T> train, const Mask mask, int* bestTrainIdx, float* bestDistance)
|
__global__ void matchUnrolledCached(const PtrStepSz<T> query, const PtrStepSz<T> train, const Mask mask, int* bestTrainIdx, float* bestDistance)
|
||||||
{
|
{
|
||||||
extern __shared__ int smem[];
|
extern __shared__ int smem[];
|
||||||
@ -184,6 +185,7 @@ namespace cv { namespace gpu { namespace device
|
|||||||
}
|
}
|
||||||
|
|
||||||
template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
|
template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
|
||||||
|
__launch_bounds__(BLOCK_SIZE * BLOCK_SIZE)
|
||||||
__global__ void matchUnrolledCached(const PtrStepSz<T> query, const PtrStepSz<T>* trains, int n, const Mask mask,
|
__global__ void matchUnrolledCached(const PtrStepSz<T> query, const PtrStepSz<T>* trains, int n, const Mask mask,
|
||||||
int* bestTrainIdx, int* bestImgIdx, float* bestDistance)
|
int* bestTrainIdx, int* bestImgIdx, float* bestDistance)
|
||||||
{
|
{
|
||||||
@ -296,6 +298,7 @@ namespace cv { namespace gpu { namespace device
|
|||||||
}
|
}
|
||||||
|
|
||||||
template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
|
template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
|
||||||
|
__launch_bounds__(BLOCK_SIZE * BLOCK_SIZE)
|
||||||
__global__ void matchUnrolled(const PtrStepSz<T> query, const PtrStepSz<T> train, const Mask mask, int* bestTrainIdx, float* bestDistance)
|
__global__ void matchUnrolled(const PtrStepSz<T> query, const PtrStepSz<T> train, const Mask mask, int* bestTrainIdx, float* bestDistance)
|
||||||
{
|
{
|
||||||
extern __shared__ int smem[];
|
extern __shared__ int smem[];
|
||||||
@ -342,6 +345,7 @@ namespace cv { namespace gpu { namespace device
|
|||||||
}
|
}
|
||||||
|
|
||||||
template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
|
template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
|
||||||
|
__launch_bounds__(BLOCK_SIZE * BLOCK_SIZE)
|
||||||
__global__ void matchUnrolled(const PtrStepSz<T> query, const PtrStepSz<T>* trains, int n, const Mask mask,
|
__global__ void matchUnrolled(const PtrStepSz<T> query, const PtrStepSz<T>* trains, int n, const Mask mask,
|
||||||
int* bestTrainIdx, int* bestImgIdx, float* bestDistance)
|
int* bestTrainIdx, int* bestImgIdx, float* bestDistance)
|
||||||
{
|
{
|
||||||
@ -451,6 +455,7 @@ namespace cv { namespace gpu { namespace device
|
|||||||
}
|
}
|
||||||
|
|
||||||
template <int BLOCK_SIZE, typename Dist, typename T, typename Mask>
|
template <int BLOCK_SIZE, typename Dist, typename T, typename Mask>
|
||||||
|
__launch_bounds__(BLOCK_SIZE * BLOCK_SIZE)
|
||||||
__global__ void match(const PtrStepSz<T> query, const PtrStepSz<T> train, const Mask mask, int* bestTrainIdx, float* bestDistance)
|
__global__ void match(const PtrStepSz<T> query, const PtrStepSz<T> train, const Mask mask, int* bestTrainIdx, float* bestDistance)
|
||||||
{
|
{
|
||||||
extern __shared__ int smem[];
|
extern __shared__ int smem[];
|
||||||
@ -497,6 +502,7 @@ namespace cv { namespace gpu { namespace device
|
|||||||
}
|
}
|
||||||
|
|
||||||
template <int BLOCK_SIZE, typename Dist, typename T, typename Mask>
|
template <int BLOCK_SIZE, typename Dist, typename T, typename Mask>
|
||||||
|
__launch_bounds__(BLOCK_SIZE * BLOCK_SIZE)
|
||||||
__global__ void match(const PtrStepSz<T> query, const PtrStepSz<T>* trains, int n, const Mask mask,
|
__global__ void match(const PtrStepSz<T> query, const PtrStepSz<T>* trains, int n, const Mask mask,
|
||||||
int* bestTrainIdx, int* bestImgIdx, float* bestDistance)
|
int* bestTrainIdx, int* bestImgIdx, float* bestDistance)
|
||||||
{
|
{
|
||||||
|
@ -56,6 +56,7 @@ namespace cv { namespace gpu { namespace device
|
|||||||
// Match Unrolled
|
// Match Unrolled
|
||||||
|
|
||||||
template <int BLOCK_SIZE, int MAX_DESC_LEN, bool SAVE_IMG_IDX, typename Dist, typename T, typename Mask>
|
template <int BLOCK_SIZE, int MAX_DESC_LEN, bool SAVE_IMG_IDX, typename Dist, typename T, typename Mask>
|
||||||
|
__launch_bounds__(BLOCK_SIZE * BLOCK_SIZE)
|
||||||
__global__ void matchUnrolled(const PtrStepSz<T> query, int imgIdx, const PtrStepSz<T> train, float maxDistance, const Mask mask,
|
__global__ void matchUnrolled(const PtrStepSz<T> query, int imgIdx, const PtrStepSz<T> train, float maxDistance, const Mask mask,
|
||||||
PtrStepi bestTrainIdx, PtrStepi bestImgIdx, PtrStepf bestDistance, unsigned int* nMatches, int maxCount)
|
PtrStepi bestTrainIdx, PtrStepi bestImgIdx, PtrStepf bestDistance, unsigned int* nMatches, int maxCount)
|
||||||
{
|
{
|
||||||
@ -164,6 +165,7 @@ namespace cv { namespace gpu { namespace device
|
|||||||
// Match
|
// Match
|
||||||
|
|
||||||
template <int BLOCK_SIZE, bool SAVE_IMG_IDX, typename Dist, typename T, typename Mask>
|
template <int BLOCK_SIZE, bool SAVE_IMG_IDX, typename Dist, typename T, typename Mask>
|
||||||
|
__launch_bounds__(BLOCK_SIZE * BLOCK_SIZE)
|
||||||
__global__ void match(const PtrStepSz<T> query, int imgIdx, const PtrStepSz<T> train, float maxDistance, const Mask mask,
|
__global__ void match(const PtrStepSz<T> query, int imgIdx, const PtrStepSz<T> train, float maxDistance, const Mask mask,
|
||||||
PtrStepi bestTrainIdx, PtrStepi bestImgIdx, PtrStepf bestDistance, unsigned int* nMatches, int maxCount)
|
PtrStepi bestTrainIdx, PtrStepi bestImgIdx, PtrStepf bestDistance, unsigned int* nMatches, int maxCount)
|
||||||
{
|
{
|
||||||
|
Loading…
x
Reference in New Issue
Block a user