Added support for Hamming distance to BruteForceMatcher_GPU

This commit is contained in:
Vladislav Vinogradov 2011-05-16 08:38:27 +00:00
parent 79ed4e4c92
commit f11efdced3
3 changed files with 335 additions and 166 deletions

View File

@ -1382,7 +1382,7 @@ namespace cv
class CV_EXPORTS BruteForceMatcher_GPU_base class CV_EXPORTS BruteForceMatcher_GPU_base
{ {
public: public:
enum DistType {L1Dist = 0, L2Dist}; enum DistType {L1Dist = 0, L2Dist, HammingDist};
explicit BruteForceMatcher_GPU_base(DistType distType = L2Dist); explicit BruteForceMatcher_GPU_base(DistType distType = L2Dist);
@ -1522,6 +1522,18 @@ namespace cv
explicit BruteForceMatcher_GPU() : BruteForceMatcher_GPU_base(L2Dist) {} explicit BruteForceMatcher_GPU() : BruteForceMatcher_GPU_base(L2Dist) {}
explicit BruteForceMatcher_GPU(L2<T> /*d*/) : BruteForceMatcher_GPU_base(L2Dist) {} explicit BruteForceMatcher_GPU(L2<T> /*d*/) : BruteForceMatcher_GPU_base(L2Dist) {}
}; };
// Specialization of BruteForceMatcher_GPU for the HammingLUT distance functor.
// Both constructors pass HammingDist to BruteForceMatcher_GPU_base, i.e. the same
// DistType that the Hamming specialization selects.
template <> class CV_EXPORTS BruteForceMatcher_GPU< HammingLUT > : public BruteForceMatcher_GPU_base
{
public:
// Default construction selects the HammingDist distance type.
explicit BruteForceMatcher_GPU() : BruteForceMatcher_GPU_base(HammingDist) {}
// The functor argument is unnamed and unused; it only selects this overload.
explicit BruteForceMatcher_GPU(HammingLUT /*d*/) : BruteForceMatcher_GPU_base(HammingDist) {}
};
// Specialization of BruteForceMatcher_GPU for the Hamming distance functor.
// Both constructors pass HammingDist to BruteForceMatcher_GPU_base.
template <> class CV_EXPORTS BruteForceMatcher_GPU< Hamming > : public BruteForceMatcher_GPU_base
{
public:
// Default construction selects the HammingDist distance type.
explicit BruteForceMatcher_GPU() : BruteForceMatcher_GPU_base(HammingDist) {}
// The functor argument is unnamed and unused; it only selects this overload.
explicit BruteForceMatcher_GPU(Hamming /*d*/) : BruteForceMatcher_GPU_base(HammingDist) {}
};
////////////////////////////////// CascadeClassifier_GPU ////////////////////////////////////////// ////////////////////////////////// CascadeClassifier_GPU //////////////////////////////////////////
// The cascade classifier class for object detection. // The cascade classifier class for object detection.

View File

@ -83,14 +83,20 @@ namespace cv { namespace gpu { namespace bfmatcher
const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance,
bool cc_12); bool cc_12);
template <typename T> template <typename T>
void matchSingleHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs,
const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance,
bool cc_12);
template <typename T>
void matchCollectionL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainCollection, void matchCollectionL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainCollection,
const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance,
const DevMem2Df& distance,
bool cc_12); bool cc_12);
template <typename T> template <typename T>
void matchCollectionL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainCollection, void matchCollectionL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainCollection,
const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance,
const DevMem2Df& distance, bool cc_12);
template <typename T>
void matchCollectionHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainCollection,
const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance,
bool cc_12); bool cc_12);
template <typename T> template <typename T>
@ -99,6 +105,9 @@ namespace cv { namespace gpu { namespace bfmatcher
template <typename T> template <typename T>
void knnMatchL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, void knnMatchL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn,
const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist); const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist);
// Hamming-distance counterpart of knnMatchL2_gpu, with the same parameter list.
// Declaration only: the definition is not visible here. Instantiations of this
// template fill the third row of the match_callers table in knnMatch.
template <typename T>
void knnMatchHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn,
const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist);
template <typename T> template <typename T>
void radiusMatchL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, void radiusMatchL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance,
@ -106,6 +115,9 @@ namespace cv { namespace gpu { namespace bfmatcher
template <typename T> template <typename T>
void radiusMatchL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, void radiusMatchL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance,
const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance); const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance);
// Hamming-distance counterpart of radiusMatchL2_gpu, with the same parameter list.
// Declaration only: the definition is not visible here. Instantiations of this
// template fill the third row of the radiusMatch_callers table in radiusMatch.
template <typename T>
void radiusMatchHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance,
const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance);
}}} }}}
namespace namespace
@ -167,7 +179,7 @@ void cv::gpu::BruteForceMatcher_GPU_base::matchSingle(const GpuMat& queryDescs,
const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance,
bool cc_12); bool cc_12);
static const match_caller_t match_callers[2][8] = static const match_caller_t match_callers[3][8] =
{ {
{ {
matchSingleL1_gpu<unsigned char>, matchSingleL1_gpu<signed char>, matchSingleL1_gpu<unsigned char>, matchSingleL1_gpu<signed char>,
@ -178,6 +190,11 @@ void cv::gpu::BruteForceMatcher_GPU_base::matchSingle(const GpuMat& queryDescs,
matchSingleL2_gpu<unsigned char>, matchSingleL2_gpu<signed char>, matchSingleL2_gpu<unsigned char>, matchSingleL2_gpu<signed char>,
matchSingleL2_gpu<unsigned short>, matchSingleL2_gpu<short>, matchSingleL2_gpu<unsigned short>, matchSingleL2_gpu<short>,
matchSingleL2_gpu<int>, matchSingleL2_gpu<float>, 0, 0 matchSingleL2_gpu<int>, matchSingleL2_gpu<float>, 0, 0
},
{
matchSingleHamming_gpu<unsigned char>, matchSingleHamming_gpu<signed char>,
matchSingleHamming_gpu<unsigned short>, matchSingleHamming_gpu<short>,
matchSingleHamming_gpu<int>, 0, 0, 0
} }
}; };
@ -295,7 +312,7 @@ void cv::gpu::BruteForceMatcher_GPU_base::matchCollection(const GpuMat& queryDes
const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx,
const DevMem2Df& distance, bool cc_12); const DevMem2Df& distance, bool cc_12);
static const match_caller_t match_callers[2][8] = static const match_caller_t match_callers[3][8] =
{ {
{ {
matchCollectionL1_gpu<unsigned char>, matchCollectionL1_gpu<signed char>, matchCollectionL1_gpu<unsigned char>, matchCollectionL1_gpu<signed char>,
@ -306,6 +323,11 @@ void cv::gpu::BruteForceMatcher_GPU_base::matchCollection(const GpuMat& queryDes
matchCollectionL2_gpu<unsigned char>, matchCollectionL2_gpu<signed char>, matchCollectionL2_gpu<unsigned char>, matchCollectionL2_gpu<signed char>,
matchCollectionL2_gpu<unsigned short>, matchCollectionL2_gpu<short>, matchCollectionL2_gpu<unsigned short>, matchCollectionL2_gpu<short>,
matchCollectionL2_gpu<int>, matchCollectionL2_gpu<float>, 0, 0 matchCollectionL2_gpu<int>, matchCollectionL2_gpu<float>, 0, 0
},
{
matchCollectionHamming_gpu<unsigned char>, matchCollectionHamming_gpu<signed char>,
matchCollectionHamming_gpu<unsigned short>, matchCollectionHamming_gpu<short>,
matchCollectionHamming_gpu<int>, 0, 0, 0
} }
}; };
@ -391,7 +413,7 @@ void cv::gpu::BruteForceMatcher_GPU_base::knnMatch(const GpuMat& queryDescs, con
typedef void (*match_caller_t)(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, typedef void (*match_caller_t)(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn,
const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist); const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist);
static const match_caller_t match_callers[2][8] = static const match_caller_t match_callers[3][8] =
{ {
{ {
knnMatchL1_gpu<unsigned char>, knnMatchL1_gpu<signed char>, knnMatchL1_gpu<unsigned short>, knnMatchL1_gpu<unsigned char>, knnMatchL1_gpu<signed char>, knnMatchL1_gpu<unsigned short>,
@ -400,6 +422,10 @@ void cv::gpu::BruteForceMatcher_GPU_base::knnMatch(const GpuMat& queryDescs, con
{ {
knnMatchL2_gpu<unsigned char>, knnMatchL2_gpu<signed char>, knnMatchL2_gpu<unsigned short>, knnMatchL2_gpu<unsigned char>, knnMatchL2_gpu<signed char>, knnMatchL2_gpu<unsigned short>,
knnMatchL2_gpu<short>, knnMatchL2_gpu<int>, knnMatchL2_gpu<float>, 0, 0 knnMatchL2_gpu<short>, knnMatchL2_gpu<int>, knnMatchL2_gpu<float>, 0, 0
},
{
knnMatchHamming_gpu<unsigned char>, knnMatchHamming_gpu<signed char>, knnMatchHamming_gpu<unsigned short>,
knnMatchHamming_gpu<short>, knnMatchHamming_gpu<int>, 0, 0, 0
} }
}; };
@ -531,7 +557,7 @@ void cv::gpu::BruteForceMatcher_GPU_base::radiusMatch(const GpuMat& queryDescs,
typedef void (*radiusMatch_caller_t)(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, typedef void (*radiusMatch_caller_t)(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance,
const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance); const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance);
static const radiusMatch_caller_t radiusMatch_callers[2][8] = static const radiusMatch_caller_t radiusMatch_callers[3][8] =
{ {
{ {
radiusMatchL1_gpu<unsigned char>, radiusMatchL1_gpu<signed char>, radiusMatchL1_gpu<unsigned short>, radiusMatchL1_gpu<unsigned char>, radiusMatchL1_gpu<signed char>, radiusMatchL1_gpu<unsigned short>,
@ -540,6 +566,10 @@ void cv::gpu::BruteForceMatcher_GPU_base::radiusMatch(const GpuMat& queryDescs,
{ {
radiusMatchL2_gpu<unsigned char>, radiusMatchL2_gpu<signed char>, radiusMatchL2_gpu<unsigned short>, radiusMatchL2_gpu<unsigned char>, radiusMatchL2_gpu<signed char>, radiusMatchL2_gpu<unsigned short>,
radiusMatchL2_gpu<short>, radiusMatchL2_gpu<int>, radiusMatchL2_gpu<float>, 0, 0 radiusMatchL2_gpu<short>, radiusMatchL2_gpu<int>, radiusMatchL2_gpu<float>, 0, 0
},
{
radiusMatchHamming_gpu<unsigned char>, radiusMatchHamming_gpu<signed char>, radiusMatchHamming_gpu<unsigned short>,
radiusMatchHamming_gpu<short>, radiusMatchHamming_gpu<int>, 0, 0, 0
} }
}; };

View File

@ -103,30 +103,61 @@ namespace cv { namespace gpu { namespace bfmatcher
/////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////
// Reduce Sum // Reduce Sum
template <int BLOCK_DIM_X> __device__ void reduceSum(float* sdiff_row, float& mySum);
template <> __device__ void reduceSum<16>(float* sdiff_row, float& mySum) template <int BLOCK_DIM_X> struct SumReductor;
template <> struct SumReductor<16>
{ {
volatile float* smem = sdiff_row; template <typename T> static __device__ void reduce(T* sdiff_row, T& mySum)
smem[threadIdx.x] = mySum;
if (threadIdx.x < 8)
{ {
smem[threadIdx.x] = mySum += smem[threadIdx.x + 8]; volatile T* smem = sdiff_row;
smem[threadIdx.x] = mySum += smem[threadIdx.x + 4];
smem[threadIdx.x] = mySum += smem[threadIdx.x + 2]; smem[threadIdx.x] = mySum;
smem[threadIdx.x] = mySum += smem[threadIdx.x + 1];
if (threadIdx.x < 8)
{
smem[threadIdx.x] = mySum += smem[threadIdx.x + 8];
smem[threadIdx.x] = mySum += smem[threadIdx.x + 4];
smem[threadIdx.x] = mySum += smem[threadIdx.x + 2];
smem[threadIdx.x] = mySum += smem[threadIdx.x + 1];
}
} }
} };
/////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////
// Distance // Distance
class L1Dist template <typename T> class L1Dist
{ {
public: public:
typedef int ResultType;
typedef int ValueType;
__device__ L1Dist() : mySum(0) {}
__device__ void reduceIter(int val1, int val2)
{
mySum = __sad(val1, val2, mySum);
}
template <int BLOCK_DIM_X> __device__ void reduceAll(int* sdiff_row)
{
SumReductor<BLOCK_DIM_X>::reduce(sdiff_row, mySum);
}
__device__ operator int() const
{
return mySum;
}
private:
int mySum;
};
template <> class L1Dist<float>
{
public:
typedef float ResultType;
typedef float ValueType;
__device__ L1Dist() : mySum(0.0f) {} __device__ L1Dist() : mySum(0.0f) {}
__device__ void reduceIter(float val1, float val2) __device__ void reduceIter(float val1, float val2)
@ -134,10 +165,9 @@ namespace cv { namespace gpu { namespace bfmatcher
mySum += fabs(val1 - val2); mySum += fabs(val1 - val2);
} }
template <int BLOCK_DIM_X> template <int BLOCK_DIM_X> __device__ void reduceAll(float* sdiff_row)
__device__ void reduceAll(float* sdiff_row)
{ {
reduceSum<BLOCK_DIM_X>(sdiff_row, mySum); SumReductor<BLOCK_DIM_X>::reduce(sdiff_row, mySum);
} }
__device__ operator float() const __device__ operator float() const
@ -152,6 +182,9 @@ namespace cv { namespace gpu { namespace bfmatcher
class L2Dist class L2Dist
{ {
public: public:
typedef float ResultType;
typedef float ValueType;
__device__ L2Dist() : mySum(0.0f) {} __device__ L2Dist() : mySum(0.0f) {}
__device__ void reduceIter(float val1, float val2) __device__ void reduceIter(float val1, float val2)
@ -160,10 +193,9 @@ namespace cv { namespace gpu { namespace bfmatcher
mySum += reg * reg; mySum += reg * reg;
} }
template <int BLOCK_DIM_X> template <int BLOCK_DIM_X> __device__ void reduceAll(float* sdiff_row)
__device__ void reduceAll(float* sdiff_row)
{ {
reduceSum<BLOCK_DIM_X>(sdiff_row, mySum); SumReductor<BLOCK_DIM_X>::reduce(sdiff_row, mySum);
} }
__device__ operator float() const __device__ operator float() const
@ -174,13 +206,39 @@ namespace cv { namespace gpu { namespace bfmatcher
private: private:
float mySum; float mySum;
}; };
// Distance functor that accumulates a Hamming distance: for every pair of values
// passed to reduceIter it adds the number of differing bits to a running int sum.
// It exposes the same members as L2Dist (ResultType/ValueType typedefs, reduceIter,
// reduceAll and a conversion to the result type), using int where L2Dist uses float.
class HammingDist
{
public:
// Type of the accumulated distance; also the element type of the sdiff_row
// buffer handed to reduceAll.
typedef int ResultType;
// Type referred to as Dist::ValueType by the code that handles query values.
typedef int ValueType;
// Starts with an empty accumulator.
__device__ HammingDist() : mySum(0) {}
// Adds popcount(val1 XOR val2) to the running sum.
// NOTE(review): both operands are taken as int. If a caller passes negative values
// of a narrower signed type, sign extension sets the upper bits, so operands of
// opposite sign would add extra bits to the count - confirm against the callers.
__device__ void reduceIter(int val1, int val2)
{
mySum += __popc(val1 ^ val2);
}
// Hands the running sum (by reference) and sdiff_row to
// SumReductor<BLOCK_DIM_X>::reduce, which stores into sdiff_row while reducing.
template <int BLOCK_DIM_X> __device__ void reduceAll(int* sdiff_row)
{
SumReductor<BLOCK_DIM_X>::reduce(sdiff_row, mySum);
}
// Returns the current value of the accumulator.
__device__ operator int() const
{
return mySum;
}
private:
// Running sum of differing bits.
int mySum;
};
/////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////
// reduceDescDiff // reduceDescDiff
template <int BLOCK_DIM_X, typename Dist, typename T> template <int BLOCK_DIM_X, typename Dist, typename T>
__device__ void reduceDescDiff(const T* queryDescs, const T* trainDescs, int desc_len, Dist& dist, __device__ void reduceDescDiff(const T* queryDescs, const T* trainDescs, int desc_len, Dist& dist, typename Dist::ResultType* sdiff_row)
float* sdiff_row)
{ {
for (int i = threadIdx.x; i < desc_len; i += BLOCK_DIM_X) for (int i = threadIdx.x; i < desc_len; i += BLOCK_DIM_X)
dist.reduceIter(queryDescs[i], trainDescs[i]); dist.reduceIter(queryDescs[i], trainDescs[i]);
@ -195,14 +253,14 @@ namespace cv { namespace gpu { namespace bfmatcher
/////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////
// loadDescsVals // loadDescsVals
template <int BLOCK_DIM_X, int MAX_DESCRIPTORS_LEN, typename T> template <int BLOCK_DIM_X, int MAX_DESCRIPTORS_LEN, typename T, typename U>
__device__ void loadDescsVals(const T* descs, int desc_len, float* queryVals, float* smem) __device__ void loadDescsVals(const T* descs, int desc_len, U* queryVals, U* smem)
{ {
const int tid = threadIdx.y * blockDim.x + threadIdx.x; const int tid = threadIdx.y * blockDim.x + threadIdx.x;
if (tid < desc_len) if (tid < desc_len)
{ {
smem[tid] = (float)descs[tid]; smem[tid] = descs[tid];
} }
__syncthreads(); __syncthreads();
@ -220,8 +278,7 @@ namespace cv { namespace gpu { namespace bfmatcher
template <int N> struct UnrollDescDiff template <int N> struct UnrollDescDiff
{ {
template <typename Dist, typename T> template <typename Dist, typename T>
static __device__ void calcCheck(const float* queryVals, const T* trainDescs, int desc_len, static __device__ void calcCheck(const typename Dist::ValueType* queryVals, const T* trainDescs, int desc_len, Dist& dist, int ind)
Dist& dist, int ind)
{ {
if (ind < desc_len) if (ind < desc_len)
{ {
@ -234,7 +291,7 @@ namespace cv { namespace gpu { namespace bfmatcher
} }
template <typename Dist, typename T> template <typename Dist, typename T>
static __device__ void calcWithoutCheck(const float* queryVals, const T* trainDescs, Dist& dist) static __device__ void calcWithoutCheck(const typename Dist::ValueType* queryVals, const T* trainDescs, Dist& dist)
{ {
dist.reduceIter(*queryVals, *trainDescs); dist.reduceIter(*queryVals, *trainDescs);
@ -247,13 +304,13 @@ namespace cv { namespace gpu { namespace bfmatcher
template <> struct UnrollDescDiff<0> template <> struct UnrollDescDiff<0>
{ {
template <typename Dist, typename T> template <typename Dist, typename T>
static __device__ void calcCheck(const float* queryVals, const T* trainDescs, int desc_len, static __device__ void calcCheck(const typename Dist::ValueType* queryVals, const T* trainDescs, int desc_len,
Dist& dist, int ind) Dist& dist, int ind)
{ {
} }
template <typename Dist, typename T> template <typename Dist, typename T>
static __device__ void calcWithoutCheck(const float* queryVals, const T* trainDescs, Dist& dist) static __device__ void calcWithoutCheck(const typename Dist::ValueType* queryVals, const T* trainDescs, Dist& dist)
{ {
} }
}; };
@ -263,29 +320,25 @@ namespace cv { namespace gpu { namespace bfmatcher
struct DescDiffCalculator<BLOCK_DIM_X, MAX_DESCRIPTORS_LEN, false> struct DescDiffCalculator<BLOCK_DIM_X, MAX_DESCRIPTORS_LEN, false>
{ {
template <typename Dist, typename T> template <typename Dist, typename T>
static __device__ void calc(const float* queryVals, const T* trainDescs, int desc_len, Dist& dist) static __device__ void calc(const typename Dist::ValueType* queryVals, const T* trainDescs, int desc_len, Dist& dist)
{ {
UnrollDescDiff<MAX_DESCRIPTORS_LEN / BLOCK_DIM_X>::calcCheck(queryVals, trainDescs, desc_len, UnrollDescDiff<MAX_DESCRIPTORS_LEN / BLOCK_DIM_X>::calcCheck(queryVals, trainDescs, desc_len, dist, threadIdx.x);
dist, threadIdx.x);
} }
}; };
template <int BLOCK_DIM_X, int MAX_DESCRIPTORS_LEN> template <int BLOCK_DIM_X, int MAX_DESCRIPTORS_LEN>
struct DescDiffCalculator<BLOCK_DIM_X, MAX_DESCRIPTORS_LEN, true> struct DescDiffCalculator<BLOCK_DIM_X, MAX_DESCRIPTORS_LEN, true>
{ {
template <typename Dist, typename T> template <typename Dist, typename T>
static __device__ void calc(const float* queryVals, const T* trainDescs, int desc_len, Dist& dist) static __device__ void calc(const typename Dist::ValueType* queryVals, const T* trainDescs, int desc_len, Dist& dist)
{ {
UnrollDescDiff<MAX_DESCRIPTORS_LEN / BLOCK_DIM_X>::calcWithoutCheck(queryVals, UnrollDescDiff<MAX_DESCRIPTORS_LEN / BLOCK_DIM_X>::calcWithoutCheck(queryVals, trainDescs + threadIdx.x, dist);
trainDescs + threadIdx.x, dist);
} }
}; };
template <int BLOCK_DIM_X, int MAX_DESCRIPTORS_LEN, bool DESC_LEN_EQ_MAX_LEN, typename Dist, typename T> template <int BLOCK_DIM_X, int MAX_DESCRIPTORS_LEN, bool DESC_LEN_EQ_MAX_LEN, typename Dist, typename T>
__device__ void reduceDescDiffCached(const float* queryVals, const T* trainDescs, int desc_len, Dist& dist, __device__ void reduceDescDiffCached(const typename Dist::ValueType* queryVals, const T* trainDescs, int desc_len, Dist& dist, typename Dist::ResultType* sdiff_row)
float* sdiff_row)
{ {
DescDiffCalculator<BLOCK_DIM_X, MAX_DESCRIPTORS_LEN, DESC_LEN_EQ_MAX_LEN>::calc(queryVals, DescDiffCalculator<BLOCK_DIM_X, MAX_DESCRIPTORS_LEN, DESC_LEN_EQ_MAX_LEN>::calc(queryVals, trainDescs, desc_len, dist);
trainDescs, desc_len, dist);
dist.reduceAll<BLOCK_DIM_X>(sdiff_row); dist.reduceAll<BLOCK_DIM_X>(sdiff_row);
} }
@ -293,62 +346,60 @@ namespace cv { namespace gpu { namespace bfmatcher
/////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////
// warpReduceMinIdxIdx // warpReduceMinIdxIdx
template <int BLOCK_DIM_Y> template <int BLOCK_DIM_Y> struct MinIdxIdxWarpReductor;
__device__ void warpReduceMinIdxIdx(float& myMin, int& myBestTrainIdx, int& myBestImgIdx, template <> struct MinIdxIdxWarpReductor<16>
volatile float* sdata, volatile int* strainIdx, volatile int* simgIdx);
template <>
__device__ void warpReduceMinIdxIdx<16>(float& myMin, int& myBestTrainIdx, int& myBestImgIdx,
volatile float* smin, volatile int* strainIdx, volatile int* simgIdx)
{ {
const int tid = threadIdx.y * blockDim.x + threadIdx.x; template <typename T>
static __device__ void reduce(T& myMin, int& myBestTrainIdx, int& myBestImgIdx, volatile T* smin, volatile int* strainIdx, volatile int* simgIdx)
if (tid < 8)
{ {
myMin = smin[tid]; const int tid = threadIdx.y * blockDim.x + threadIdx.x;
myBestTrainIdx = strainIdx[tid];
myBestImgIdx = simgIdx[tid];
float reg = smin[tid + 8]; if (tid < 8)
if (reg < myMin)
{ {
smin[tid] = myMin = reg; myMin = smin[tid];
strainIdx[tid] = myBestTrainIdx = strainIdx[tid + 8]; myBestTrainIdx = strainIdx[tid];
simgIdx[tid] = myBestImgIdx = simgIdx[tid + 8]; myBestImgIdx = simgIdx[tid];
}
reg = smin[tid + 4]; float reg = smin[tid + 8];
if (reg < myMin) if (reg < myMin)
{ {
smin[tid] = myMin = reg; smin[tid] = myMin = reg;
strainIdx[tid] = myBestTrainIdx = strainIdx[tid + 4]; strainIdx[tid] = myBestTrainIdx = strainIdx[tid + 8];
simgIdx[tid] = myBestImgIdx = simgIdx[tid + 4]; simgIdx[tid] = myBestImgIdx = simgIdx[tid + 8];
} }
reg = smin[tid + 2]; reg = smin[tid + 4];
if (reg < myMin) if (reg < myMin)
{ {
smin[tid] = myMin = reg; smin[tid] = myMin = reg;
strainIdx[tid] = myBestTrainIdx = strainIdx[tid + 2]; strainIdx[tid] = myBestTrainIdx = strainIdx[tid + 4];
simgIdx[tid] = myBestImgIdx = simgIdx[tid + 2]; simgIdx[tid] = myBestImgIdx = simgIdx[tid + 4];
} }
reg = smin[tid + 1]; reg = smin[tid + 2];
if (reg < myMin) if (reg < myMin)
{ {
smin[tid] = myMin = reg; smin[tid] = myMin = reg;
strainIdx[tid] = myBestTrainIdx = strainIdx[tid + 1]; strainIdx[tid] = myBestTrainIdx = strainIdx[tid + 2];
simgIdx[tid] = myBestImgIdx = simgIdx[tid + 1]; simgIdx[tid] = myBestImgIdx = simgIdx[tid + 2];
}
reg = smin[tid + 1];
if (reg < myMin)
{
smin[tid] = myMin = reg;
strainIdx[tid] = myBestTrainIdx = strainIdx[tid + 1];
simgIdx[tid] = myBestImgIdx = simgIdx[tid + 1];
}
} }
} }
} };
/////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////
// findBestMatch // findBestMatch
template <int BLOCK_DIM_Y> template <int BLOCK_DIM_Y, typename T>
__device__ void findBestMatch(float& myMin, int& myBestTrainIdx, int& myBestImgIdx, __device__ void findBestMatch(T& myMin, int& myBestTrainIdx, int& myBestImgIdx, T* smin, int* strainIdx, int* simgIdx)
float* smin, int* strainIdx, int* simgIdx)
{ {
if (threadIdx.x == 0) if (threadIdx.x == 0)
{ {
@ -358,7 +409,7 @@ namespace cv { namespace gpu { namespace bfmatcher
} }
__syncthreads(); __syncthreads();
warpReduceMinIdxIdx<BLOCK_DIM_Y>(myMin, myBestTrainIdx, myBestImgIdx, smin, strainIdx, simgIdx); MinIdxIdxWarpReductor<BLOCK_DIM_Y>::reduce(myMin, myBestTrainIdx, myBestImgIdx, smin, strainIdx, simgIdx);
} }
/////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////
@ -368,13 +419,13 @@ namespace cv { namespace gpu { namespace bfmatcher
class ReduceDescCalculatorSimple class ReduceDescCalculatorSimple
{ {
public: public:
__device__ void prepare(const T* queryDescs_, int, float*) __device__ void prepare(const T* queryDescs_, int, void*)
{ {
queryDescs = queryDescs_; queryDescs = queryDescs_;
} }
template <typename Dist> template <typename Dist>
__device__ void calc(const T* trainDescs, int desc_len, Dist& dist, float* sdiff_row) const __device__ void calc(const T* trainDescs, int desc_len, Dist& dist, typename Dist::ResultType* sdiff_row) const
{ {
reduceDescDiff<BLOCK_DIM_X>(queryDescs, trainDescs, desc_len, dist, sdiff_row); reduceDescDiff<BLOCK_DIM_X>(queryDescs, trainDescs, desc_len, dist, sdiff_row);
} }
@ -383,24 +434,23 @@ namespace cv { namespace gpu { namespace bfmatcher
const T* queryDescs; const T* queryDescs;
}; };
template <int BLOCK_DIM_X, int MAX_DESCRIPTORS_LEN, bool DESC_LEN_EQ_MAX_LEN, typename T> template <int BLOCK_DIM_X, int MAX_DESCRIPTORS_LEN, bool DESC_LEN_EQ_MAX_LEN, typename T, typename U>
class ReduceDescCalculatorCached class ReduceDescCalculatorCached
{ {
public: public:
__device__ void prepare(const T* queryDescs, int desc_len, float* smem) __device__ void prepare(const T* queryDescs, int desc_len, U* smem)
{ {
loadDescsVals<BLOCK_DIM_X, MAX_DESCRIPTORS_LEN>(queryDescs, desc_len, queryVals, smem); loadDescsVals<BLOCK_DIM_X, MAX_DESCRIPTORS_LEN>(queryDescs, desc_len, queryVals, smem);
} }
template <typename Dist> template <typename Dist>
__device__ void calc(const T* trainDescs, int desc_len, Dist& dist, float* sdiff_row) const __device__ void calc(const T* trainDescs, int desc_len, Dist& dist, typename Dist::ResultType* sdiff_row) const
{ {
reduceDescDiffCached<BLOCK_DIM_X, MAX_DESCRIPTORS_LEN, DESC_LEN_EQ_MAX_LEN>(queryVals, trainDescs, reduceDescDiffCached<BLOCK_DIM_X, MAX_DESCRIPTORS_LEN, DESC_LEN_EQ_MAX_LEN>(queryVals, trainDescs, desc_len, dist, sdiff_row);
desc_len, dist, sdiff_row);
} }
private: private:
float queryVals[MAX_DESCRIPTORS_LEN / BLOCK_DIM_X]; U queryVals[MAX_DESCRIPTORS_LEN / BLOCK_DIM_X];
}; };
/////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////
@ -409,7 +459,7 @@ namespace cv { namespace gpu { namespace bfmatcher
template <typename Dist, typename ReduceDescCalculator, typename T, typename Mask> template <typename Dist, typename ReduceDescCalculator, typename T, typename Mask>
__device__ void matchDescs(int queryIdx, int imgIdx, const DevMem2D_<T>& trainDescs_, __device__ void matchDescs(int queryIdx, int imgIdx, const DevMem2D_<T>& trainDescs_,
const Mask& m, const ReduceDescCalculator& reduceDescCalc, const Mask& m, const ReduceDescCalculator& reduceDescCalc,
float& myMin, int& myBestTrainIdx, int& myBestImgIdx, float* sdiff_row) typename Dist::ResultType& myMin, int& myBestTrainIdx, int& myBestImgIdx, typename Dist::ResultType* sdiff_row)
{ {
for (int trainIdx = threadIdx.y; trainIdx < trainDescs_.rows; trainIdx += blockDim.y) for (int trainIdx = threadIdx.y; trainIdx < trainDescs_.rows; trainIdx += blockDim.y)
{ {
@ -447,10 +497,9 @@ namespace cv { namespace gpu { namespace bfmatcher
template <typename Dist, typename ReduceDescCalculator, typename Mask> template <typename Dist, typename ReduceDescCalculator, typename Mask>
__device__ void loop(int queryIdx, Mask& m, const ReduceDescCalculator& reduceDescCalc, __device__ void loop(int queryIdx, Mask& m, const ReduceDescCalculator& reduceDescCalc,
float& myMin, int& myBestTrainIdx, int& myBestImgIdx, float* sdiff_row) const typename Dist::ResultType& myMin, int& myBestTrainIdx, int& myBestImgIdx, typename Dist::ResultType* sdiff_row) const
{ {
matchDescs<Dist>(queryIdx, 0, trainDescs, m, reduceDescCalc, matchDescs<Dist>(queryIdx, 0, trainDescs, m, reduceDescCalc, myMin, myBestTrainIdx, myBestImgIdx, sdiff_row);
myMin, myBestTrainIdx, myBestImgIdx, sdiff_row);
} }
__device__ int desc_len() const __device__ int desc_len() const
@ -473,14 +522,13 @@ namespace cv { namespace gpu { namespace bfmatcher
template <typename Dist, typename ReduceDescCalculator, typename Mask> template <typename Dist, typename ReduceDescCalculator, typename Mask>
__device__ void loop(int queryIdx, Mask& m, const ReduceDescCalculator& reduceDescCalc, __device__ void loop(int queryIdx, Mask& m, const ReduceDescCalculator& reduceDescCalc,
float& myMin, int& myBestTrainIdx, int& myBestImgIdx, float* sdiff_row) const typename Dist::ResultType& myMin, int& myBestTrainIdx, int& myBestImgIdx, typename Dist::ResultType* sdiff_row) const
{ {
for (int imgIdx = 0; imgIdx < nImg; ++imgIdx) for (int imgIdx = 0; imgIdx < nImg; ++imgIdx)
{ {
DevMem2D_<T> trainDescs = trainCollection[imgIdx]; DevMem2D_<T> trainDescs = trainCollection[imgIdx];
m.nextMask(); m.nextMask();
matchDescs<Dist>(queryIdx, imgIdx, trainDescs, m, reduceDescCalc, matchDescs<Dist>(queryIdx, imgIdx, trainDescs, m, reduceDescCalc, myMin, myBestTrainIdx, myBestImgIdx, sdiff_row);
myMin, myBestTrainIdx, myBestImgIdx, sdiff_row);
} }
} }
@ -498,38 +546,35 @@ namespace cv { namespace gpu { namespace bfmatcher
/////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////
// Match kernel // Match kernel
template <int BLOCK_DIM_X, int BLOCK_DIM_Y, typename ReduceDescCalculator, typename Dist, typename T, template <int BLOCK_DIM_X, int BLOCK_DIM_Y, typename ReduceDescCalculator, typename Dist, typename T, typename Train, typename Mask>
typename Train, typename Mask> __global__ void match(const PtrStep_<T> queryDescs_, const Train train, const Mask mask, int* trainIdx, int* imgIdx, float* distance)
__global__ void match(const PtrStep_<T> queryDescs_, const Train train, const Mask mask,
int* trainIdx, int* imgIdx, float* distance)
{ {
__shared__ float smem[BLOCK_DIM_X * BLOCK_DIM_Y]; __shared__ typename Dist::ResultType smem[BLOCK_DIM_X * BLOCK_DIM_Y];
const int queryIdx = blockIdx.x; const int queryIdx = blockIdx.x;
int myBestTrainIdx = -1; int myBestTrainIdx = -1;
int myBestImgIdx = -1; int myBestImgIdx = -1;
float myMin = numeric_limits_gpu<float>::max(); typename Dist::ResultType myMin = numeric_limits_gpu<typename Dist::ResultType>::max();
{ {
float* sdiff_row = smem + BLOCK_DIM_X * threadIdx.y; typename Dist::ResultType* sdiff_row = smem + BLOCK_DIM_X * threadIdx.y;
Mask m = mask; Mask m = mask;
ReduceDescCalculator reduceDescCalc; ReduceDescCalculator reduceDescCalc;
reduceDescCalc.prepare(queryDescs_.ptr(queryIdx), train.desc_len(), smem); reduceDescCalc.prepare(queryDescs_.ptr(queryIdx), train.desc_len(), (typename Dist::ValueType*)smem);
train.template loop<Dist>(queryIdx, m, reduceDescCalc, myMin, myBestTrainIdx, myBestImgIdx, sdiff_row); train.template loop<Dist>(queryIdx, m, reduceDescCalc, myMin, myBestTrainIdx, myBestImgIdx, sdiff_row);
} }
__syncthreads(); __syncthreads();
float* smin = smem; typename Dist::ResultType* smin = smem;
int* strainIdx = (int*)(smin + BLOCK_DIM_Y); int* strainIdx = (int*)(smin + BLOCK_DIM_Y);
int* simgIdx = strainIdx + BLOCK_DIM_Y; int* simgIdx = strainIdx + BLOCK_DIM_Y;
findBestMatch<BLOCK_DIM_Y>(myMin, myBestTrainIdx, myBestImgIdx, findBestMatch<BLOCK_DIM_Y>(myMin, myBestTrainIdx, myBestImgIdx, smin, strainIdx, simgIdx);
smin, strainIdx, simgIdx);
if (threadIdx.x == 0 && threadIdx.y == 0) if (threadIdx.x == 0 && threadIdx.y == 0)
{ {
@ -542,8 +587,7 @@ namespace cv { namespace gpu { namespace bfmatcher
/////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////
// Match kernel callers // Match kernel callers
template <int BLOCK_DIM_X, int BLOCK_DIM_Y, typename Dist, typename T, template <int BLOCK_DIM_X, int BLOCK_DIM_Y, typename Dist, typename T, typename Train, typename Mask>
typename Train, typename Mask>
void matchSimple_caller(const DevMem2D_<T>& queryDescs, const Train& train, void matchSimple_caller(const DevMem2D_<T>& queryDescs, const Train& train,
const Mask& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance) const Mask& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance)
{ {
@ -553,14 +597,12 @@ namespace cv { namespace gpu { namespace bfmatcher
dim3 threads(BLOCK_DIM_X, BLOCK_DIM_Y, 1); dim3 threads(BLOCK_DIM_X, BLOCK_DIM_Y, 1);
match<BLOCK_DIM_X, BLOCK_DIM_Y, ReduceDescCalculatorSimple<BLOCK_DIM_X, T>, Dist, T> match<BLOCK_DIM_X, BLOCK_DIM_Y, ReduceDescCalculatorSimple<BLOCK_DIM_X, T>, Dist, T>
<<<grid, threads>>>(queryDescs, train, mask, trainIdx.data, <<<grid, threads>>>(queryDescs, train, mask, trainIdx.data, imgIdx.data, distance.data);
imgIdx.data, distance.data);
cudaSafeCall( cudaGetLastError() ); cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaThreadSynchronize() ); cudaSafeCall( cudaThreadSynchronize() );
} }
template <int BLOCK_DIM_X, int BLOCK_DIM_Y, int MAX_DESCRIPTORS_LEN, bool DESC_LEN_EQ_MAX_LEN, template <int BLOCK_DIM_X, int BLOCK_DIM_Y, int MAX_DESCRIPTORS_LEN, bool DESC_LEN_EQ_MAX_LEN, typename Dist, typename T, typename Train, typename Mask>
typename Dist, typename T, typename Train, typename Mask>
void matchCached_caller(const DevMem2D_<T>& queryDescs, const Train& train, void matchCached_caller(const DevMem2D_<T>& queryDescs, const Train& train,
const Mask& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance) const Mask& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance)
{ {
@ -571,11 +613,8 @@ namespace cv { namespace gpu { namespace bfmatcher
dim3 grid(queryDescs.rows, 1, 1); dim3 grid(queryDescs.rows, 1, 1);
dim3 threads(BLOCK_DIM_X, BLOCK_DIM_Y, 1); dim3 threads(BLOCK_DIM_X, BLOCK_DIM_Y, 1);
match<BLOCK_DIM_X, BLOCK_DIM_Y, match<BLOCK_DIM_X, BLOCK_DIM_Y, ReduceDescCalculatorCached<BLOCK_DIM_X, MAX_DESCRIPTORS_LEN, DESC_LEN_EQ_MAX_LEN, T, typename Dist::ValueType>, Dist, T>
ReduceDescCalculatorCached<BLOCK_DIM_X, MAX_DESCRIPTORS_LEN, DESC_LEN_EQ_MAX_LEN, T>, <<<grid, threads>>>(queryDescs, train, mask, trainIdx.data, imgIdx.data, distance.data);
Dist, T>
<<<grid, threads>>>(queryDescs, train, mask, trainIdx.data,
imgIdx.data, distance.data);
cudaSafeCall( cudaGetLastError() ); cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaThreadSynchronize() ); cudaSafeCall( cudaThreadSynchronize() );
@ -616,11 +655,11 @@ namespace cv { namespace gpu { namespace bfmatcher
if (mask.data) if (mask.data)
{ {
SingleMask m(mask); SingleMask m(mask);
matchDispatcher<L1Dist>((DevMem2D_<T>)queryDescs, train, m, trainIdx, imgIdx, distance, cc_12); matchDispatcher< L1Dist<T> >((DevMem2D_<T>)queryDescs, train, m, trainIdx, imgIdx, distance, cc_12);
} }
else else
{ {
matchDispatcher<L1Dist>((DevMem2D_<T>)queryDescs, train, WithOutMask(), trainIdx, imgIdx, distance, cc_12); matchDispatcher< L1Dist<T> >((DevMem2D_<T>)queryDescs, train, WithOutMask(), trainIdx, imgIdx, distance, cc_12);
} }
} }
@ -655,6 +694,29 @@ namespace cv { namespace gpu { namespace bfmatcher
template void matchSingleL2_gpu<int >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12); template void matchSingleL2_gpu<int >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12);
template void matchSingleL2_gpu<float >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12); template void matchSingleL2_gpu<float >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12);
template <typename T>
void matchSingleHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs,
const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance,
bool cc_12)
{
SingleTrain<T> train((DevMem2D_<T>)trainDescs);
if (mask.data)
{
SingleMask m(mask);
matchDispatcher<HammingDist>((DevMem2D_<T>)queryDescs, train, m, trainIdx, imgIdx, distance, cc_12);
}
else
{
matchDispatcher<HammingDist>((DevMem2D_<T>)queryDescs, train, WithOutMask(), trainIdx, imgIdx, distance, cc_12);
}
}
// Explicit instantiations of matchSingleHamming_gpu for the descriptor element
// types uchar, schar, ushort, short and int.
template void matchSingleHamming_gpu<uchar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12);
template void matchSingleHamming_gpu<schar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12);
template void matchSingleHamming_gpu<ushort>(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12);
template void matchSingleHamming_gpu<short >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12);
template void matchSingleHamming_gpu<int >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12);
template <typename T> template <typename T>
void matchCollectionL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainCollection, void matchCollectionL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainCollection,
const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx,
@ -664,11 +726,11 @@ namespace cv { namespace gpu { namespace bfmatcher
if (maskCollection.data) if (maskCollection.data)
{ {
MaskCollection mask(maskCollection.data); MaskCollection mask(maskCollection.data);
matchDispatcher<L1Dist>((DevMem2D_<T>)queryDescs, train, mask, trainIdx, imgIdx, distance, cc_12); matchDispatcher< L1Dist<T> >((DevMem2D_<T>)queryDescs, train, mask, trainIdx, imgIdx, distance, cc_12);
} }
else else
{ {
matchDispatcher<L1Dist>((DevMem2D_<T>)queryDescs, train, WithOutMask(), trainIdx, imgIdx, distance, cc_12); matchDispatcher< L1Dist<T> >((DevMem2D_<T>)queryDescs, train, WithOutMask(), trainIdx, imgIdx, distance, cc_12);
} }
} }
@ -702,6 +764,29 @@ namespace cv { namespace gpu { namespace bfmatcher
template void matchCollectionL2_gpu<short >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12); template void matchCollectionL2_gpu<short >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12);
template void matchCollectionL2_gpu<int >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12); template void matchCollectionL2_gpu<int >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12);
template void matchCollectionL2_gpu<float >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12); template void matchCollectionL2_gpu<float >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12);
template <typename T>
void matchCollectionHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainCollection,
const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx,
const DevMem2Df& distance, bool cc_12)
{
TrainCollection<T> train((DevMem2D_<T>*)trainCollection.ptr(), trainCollection.cols, queryDescs.cols);
if (maskCollection.data)
{
MaskCollection mask(maskCollection.data);
matchDispatcher<HammingDist>((DevMem2D_<T>)queryDescs, train, mask, trainIdx, imgIdx, distance, cc_12);
}
else
{
matchDispatcher<HammingDist>((DevMem2D_<T>)queryDescs, train, WithOutMask(), trainIdx, imgIdx, distance, cc_12);
}
}
// Explicit instantiations of matchCollectionHamming_gpu for the descriptor
// element types uchar, schar, ushort, short and int.
template void matchCollectionHamming_gpu<uchar >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12);
template void matchCollectionHamming_gpu<schar >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12);
template void matchCollectionHamming_gpu<ushort>(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12);
template void matchCollectionHamming_gpu<short >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12);
template void matchCollectionHamming_gpu<int >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12);
/////////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////////
//////////////////////////////////// Knn Match //////////////////////////////////// //////////////////////////////////// Knn Match ////////////////////////////////////
@ -713,9 +798,9 @@ namespace cv { namespace gpu { namespace bfmatcher
template <int BLOCK_DIM_X, int BLOCK_DIM_Y, typename Dist, typename T, typename Mask> template <int BLOCK_DIM_X, int BLOCK_DIM_Y, typename Dist, typename T, typename Mask>
__global__ void calcDistance(PtrStep_<T> queryDescs_, DevMem2D_<T> trainDescs_, Mask mask, PtrStepf distance) __global__ void calcDistance(PtrStep_<T> queryDescs_, DevMem2D_<T> trainDescs_, Mask mask, PtrStepf distance)
{ {
__shared__ float sdiff[BLOCK_DIM_X * BLOCK_DIM_Y]; __shared__ typename Dist::ResultType sdiff[BLOCK_DIM_X * BLOCK_DIM_Y];
float* sdiff_row = sdiff + BLOCK_DIM_X * threadIdx.y; typename Dist::ResultType* sdiff_row = sdiff + BLOCK_DIM_X * threadIdx.y;
const int queryIdx = blockIdx.x; const int queryIdx = blockIdx.x;
const T* queryDescs = queryDescs_.ptr(queryIdx); const T* queryDescs = queryDescs_.ptr(queryIdx);
@ -726,7 +811,7 @@ namespace cv { namespace gpu { namespace bfmatcher
{ {
const T* trainDescs = trainDescs_.ptr(trainIdx); const T* trainDescs = trainDescs_.ptr(trainIdx);
float myDist = numeric_limits_gpu<float>::max(); typename Dist::ResultType myDist = numeric_limits_gpu<typename Dist::ResultType>::max();
if (mask(queryIdx, trainIdx)) if (mask(queryIdx, trainIdx))
{ {
@ -763,14 +848,14 @@ namespace cv { namespace gpu { namespace bfmatcher
/////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////
// warpReduceMinIdx // warpReduceMinIdx
template <int BLOCK_SIZE> template <int BLOCK_SIZE, typename T>
__device__ void warpReduceMinIdx(volatile float* sdist, volatile int* strainIdx, float& myMin, int tid) __device__ void warpReduceMinIdx(volatile T* sdist, volatile int* strainIdx, T& myMin, int tid)
{ {
if (tid < 32) if (tid < 32)
{ {
if (BLOCK_SIZE >= 64) if (BLOCK_SIZE >= 64)
{ {
float reg = sdist[tid + 32]; T reg = sdist[tid + 32];
if (reg < myMin) if (reg < myMin)
{ {
@ -780,7 +865,7 @@ namespace cv { namespace gpu { namespace bfmatcher
} }
if (BLOCK_SIZE >= 32) if (BLOCK_SIZE >= 32)
{ {
float reg = sdist[tid + 16]; T reg = sdist[tid + 16];
if (reg < myMin) if (reg < myMin)
{ {
@ -790,7 +875,7 @@ namespace cv { namespace gpu { namespace bfmatcher
} }
if (BLOCK_SIZE >= 16) if (BLOCK_SIZE >= 16)
{ {
float reg = sdist[tid + 8]; T reg = sdist[tid + 8];
if (reg < myMin) if (reg < myMin)
{ {
@ -800,7 +885,7 @@ namespace cv { namespace gpu { namespace bfmatcher
} }
if (BLOCK_SIZE >= 8) if (BLOCK_SIZE >= 8)
{ {
float reg = sdist[tid + 4]; T reg = sdist[tid + 4];
if (reg < myMin) if (reg < myMin)
{ {
@ -810,7 +895,7 @@ namespace cv { namespace gpu { namespace bfmatcher
} }
if (BLOCK_SIZE >= 4) if (BLOCK_SIZE >= 4)
{ {
float reg = sdist[tid + 2]; T reg = sdist[tid + 2];
if (reg < myMin) if (reg < myMin)
{ {
@ -820,7 +905,7 @@ namespace cv { namespace gpu { namespace bfmatcher
} }
if (BLOCK_SIZE >= 2) if (BLOCK_SIZE >= 2)
{ {
float reg = sdist[tid + 1]; T reg = sdist[tid + 1];
if (reg < myMin) if (reg < myMin)
{ {
@ -831,17 +916,17 @@ namespace cv { namespace gpu { namespace bfmatcher
} }
} }
template <int BLOCK_SIZE> template <int BLOCK_SIZE, typename T>
__device__ void reduceMinIdx(const float* dist, int n, float* sdist, int* strainIdx) __device__ void reduceMinIdx(const T* dist, int n, T* sdist, int* strainIdx)
{ {
const int tid = threadIdx.x; const int tid = threadIdx.x;
float myMin = numeric_limits_gpu<float>::max(); T myMin = numeric_limits_gpu<T>::max();
int myMinIdx = -1; int myMinIdx = -1;
for (int i = tid; i < n; i += BLOCK_SIZE) for (int i = tid; i < n; i += BLOCK_SIZE)
{ {
float reg = dist[i]; T reg = dist[i];
if (reg < myMin) if (reg < myMin)
{ {
myMin = reg; myMin = reg;
@ -855,7 +940,7 @@ namespace cv { namespace gpu { namespace bfmatcher
if (BLOCK_SIZE >= 512 && tid < 256) if (BLOCK_SIZE >= 512 && tid < 256)
{ {
float reg = sdist[tid + 256]; T reg = sdist[tid + 256];
if (reg < myMin) if (reg < myMin)
{ {
@ -866,7 +951,7 @@ namespace cv { namespace gpu { namespace bfmatcher
} }
if (BLOCK_SIZE >= 256 && tid < 128) if (BLOCK_SIZE >= 256 && tid < 128)
{ {
float reg = sdist[tid + 128]; T reg = sdist[tid + 128];
if (reg < myMin) if (reg < myMin)
{ {
@ -877,7 +962,7 @@ namespace cv { namespace gpu { namespace bfmatcher
} }
if (BLOCK_SIZE >= 128 && tid < 64) if (BLOCK_SIZE >= 128 && tid < 64)
{ {
float reg = sdist[tid + 64]; T reg = sdist[tid + 64];
if (reg < myMin) if (reg < myMin)
{ {
@ -943,14 +1028,12 @@ namespace cv { namespace gpu { namespace bfmatcher
// knn match caller // knn match caller
template <typename Dist, typename T, typename Mask> template <typename Dist, typename T, typename Mask>
void calcDistanceDispatcher(const DevMem2D_<T>& queryDescs, const DevMem2D_<T>& trainDescs, void calcDistanceDispatcher(const DevMem2D_<T>& queryDescs, const DevMem2D_<T>& trainDescs, const Mask& mask, const DevMem2Df& allDist)
const Mask& mask, const DevMem2Df& allDist)
{ {
calcDistance_caller<16, 16, Dist>(queryDescs, trainDescs, mask, allDist); calcDistance_caller<16, 16, Dist>(queryDescs, trainDescs, mask, allDist);
} }
void findKnnMatchDispatcher(int knn, const DevMem2Di& trainIdx, const DevMem2Df& distance, void findKnnMatchDispatcher(int knn, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist)
const DevMem2Df& allDist)
{ {
findKnnMatch_caller<256>(knn, trainIdx, distance, allDist); findKnnMatch_caller<256>(knn, trainIdx, distance, allDist);
} }
@ -961,13 +1044,11 @@ namespace cv { namespace gpu { namespace bfmatcher
{ {
if (mask.data) if (mask.data)
{ {
calcDistanceDispatcher<L1Dist>((DevMem2D_<T>)queryDescs, (DevMem2D_<T>)trainDescs, calcDistanceDispatcher< L1Dist<T> >((DevMem2D_<T>)queryDescs, (DevMem2D_<T>)trainDescs, SingleMask(mask), allDist);
SingleMask(mask), allDist);
} }
else else
{ {
calcDistanceDispatcher<L1Dist>((DevMem2D_<T>)queryDescs, (DevMem2D_<T>)trainDescs, calcDistanceDispatcher< L1Dist<T> >((DevMem2D_<T>)queryDescs, (DevMem2D_<T>)trainDescs, WithOutMask(), allDist);
WithOutMask(), allDist);
} }
findKnnMatchDispatcher(knn, trainIdx, distance, allDist); findKnnMatchDispatcher(knn, trainIdx, distance, allDist);
@ -1005,6 +1086,30 @@ namespace cv { namespace gpu { namespace bfmatcher
template void knnMatchL2_gpu<int >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist); template void knnMatchL2_gpu<int >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist);
template void knnMatchL2_gpu<float >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist); template void knnMatchL2_gpu<float >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist);
template <typename T>
void knnMatchHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn,
const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist)
{
if (mask.data)
{
calcDistanceDispatcher<HammingDist>((DevMem2D_<T>)queryDescs, (DevMem2D_<T>)trainDescs,
SingleMask(mask), allDist);
}
else
{
calcDistanceDispatcher<HammingDist>((DevMem2D_<T>)queryDescs, (DevMem2D_<T>)trainDescs,
WithOutMask(), allDist);
}
findKnnMatchDispatcher(knn, trainIdx, distance, allDist);
}
// Explicit instantiations of knnMatchHamming_gpu for the descriptor element
// types uchar, schar, ushort, short and int.
template void knnMatchHamming_gpu<uchar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist);
template void knnMatchHamming_gpu<schar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist);
template void knnMatchHamming_gpu<ushort>(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist);
template void knnMatchHamming_gpu<short >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist);
template void knnMatchHamming_gpu<int >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist);
/////////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////////
/////////////////////////////////// Radius Match ////////////////////////////////// /////////////////////////////////// Radius Match //////////////////////////////////
/////////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////////
@ -1018,9 +1123,9 @@ namespace cv { namespace gpu { namespace bfmatcher
{ {
#if defined (__CUDA_ARCH__) && __CUDA_ARCH__ >= 110 #if defined (__CUDA_ARCH__) && __CUDA_ARCH__ >= 110
__shared__ float smem[BLOCK_DIM_X * BLOCK_DIM_Y]; __shared__ typename Dist::ResultType smem[BLOCK_DIM_X * BLOCK_DIM_Y];
float* sdiff_row = smem + BLOCK_DIM_X * threadIdx.y; typename Dist::ResultType* sdiff_row = smem + BLOCK_DIM_X * threadIdx.y;
const int queryIdx = blockIdx.x; const int queryIdx = blockIdx.x;
const T* queryDescs = queryDescs_.ptr(queryIdx); const T* queryDescs = queryDescs_.ptr(queryIdx);
@ -1091,12 +1196,12 @@ namespace cv { namespace gpu { namespace bfmatcher
{ {
if (mask.data) if (mask.data)
{ {
radiusMatchDispatcher<L1Dist>((DevMem2D_<T>)queryDescs, (DevMem2D_<T>)trainDescs, radiusMatchDispatcher< L1Dist<T> >((DevMem2D_<T>)queryDescs, (DevMem2D_<T>)trainDescs,
maxDistance, SingleMask(mask), trainIdx, nMatches, distance); maxDistance, SingleMask(mask), trainIdx, nMatches, distance);
} }
else else
{ {
radiusMatchDispatcher<L1Dist>((DevMem2D_<T>)queryDescs, (DevMem2D_<T>)trainDescs, radiusMatchDispatcher< L1Dist<T> >((DevMem2D_<T>)queryDescs, (DevMem2D_<T>)trainDescs,
maxDistance, WithOutMask(), trainIdx, nMatches, distance); maxDistance, WithOutMask(), trainIdx, nMatches, distance);
} }
} }
@ -1130,4 +1235,26 @@ namespace cv { namespace gpu { namespace bfmatcher
template void radiusMatchL2_gpu<short >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance); template void radiusMatchL2_gpu<short >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance);
template void radiusMatchL2_gpu<int >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance); template void radiusMatchL2_gpu<int >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance);
template void radiusMatchL2_gpu<float >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance); template void radiusMatchL2_gpu<float >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance);
template <typename T>
void radiusMatchHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance,
const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance)
{
if (mask.data)
{
radiusMatchDispatcher<HammingDist>((DevMem2D_<T>)queryDescs, (DevMem2D_<T>)trainDescs,
maxDistance, SingleMask(mask), trainIdx, nMatches, distance);
}
else
{
radiusMatchDispatcher<HammingDist>((DevMem2D_<T>)queryDescs, (DevMem2D_<T>)trainDescs,
maxDistance, WithOutMask(), trainIdx, nMatches, distance);
}
}
// Explicit instantiations of radiusMatchHamming_gpu for the descriptor element
// types uchar, schar, ushort, short and int.
template void radiusMatchHamming_gpu<uchar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance);
template void radiusMatchHamming_gpu<schar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance);
template void radiusMatchHamming_gpu<ushort>(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance);
template void radiusMatchHamming_gpu<short >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance);
template void radiusMatchHamming_gpu<int >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance);
}}} }}}