Revert "Revert "Merge pull request #836 from jet47:gpu-modules""

This commit is contained in:
Vladislav Vinogradov
2013-06-04 13:32:35 +04:00
parent 10340fe234
commit 3eeaa9189c
472 changed files with 29894 additions and 23019 deletions

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,774 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if !defined CUDA_DISABLER
#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/utility.hpp"
#include "opencv2/core/cuda/reduce.hpp"
#include "opencv2/core/cuda/limits.hpp"
#include "opencv2/core/cuda/vec_distance.hpp"
#include "opencv2/core/cuda/datamov_utils.hpp"
namespace cv { namespace gpu { namespace cudev
{
namespace bf_match
{
///////////////////////////////////////////////////////////////////////////////
// Reduction
template <int BLOCK_SIZE>
__device__ void findBestMatch(float& bestDistance, int& bestTrainIdx, float* s_distance, int* s_trainIdx)
{
s_distance += threadIdx.y * BLOCK_SIZE;
s_trainIdx += threadIdx.y * BLOCK_SIZE;
reduceKeyVal<BLOCK_SIZE>(s_distance, bestDistance, s_trainIdx, bestTrainIdx, threadIdx.x, less<float>());
}
template <int BLOCK_SIZE>
__device__ void findBestMatch(float& bestDistance, int& bestTrainIdx, int& bestImgIdx, float* s_distance, int* s_trainIdx, int* s_imgIdx)
{
s_distance += threadIdx.y * BLOCK_SIZE;
s_trainIdx += threadIdx.y * BLOCK_SIZE;
s_imgIdx += threadIdx.y * BLOCK_SIZE;
reduceKeyVal<BLOCK_SIZE>(s_distance, bestDistance, smem_tuple(s_trainIdx, s_imgIdx), thrust::tie(bestTrainIdx, bestImgIdx), threadIdx.x, less<float>());
}
///////////////////////////////////////////////////////////////////////////////
// Match Unrolled Cached
template <int BLOCK_SIZE, int MAX_DESC_LEN, typename T, typename U>
__device__ void loadQueryToSmem(int queryIdx, const PtrStepSz<T>& query, U* s_query)
{
#pragma unroll
for (int i = 0; i < MAX_DESC_LEN / BLOCK_SIZE; ++i)
{
const int loadX = threadIdx.x + i * BLOCK_SIZE;
s_query[threadIdx.y * MAX_DESC_LEN + loadX] = loadX < query.cols ? query.ptr(::min(queryIdx, query.rows - 1))[loadX] : 0;
}
}
template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
__device__ void loopUnrolledCached(int queryIdx, const PtrStepSz<T>& query,volatile int imgIdx, const PtrStepSz<T>& train, const Mask& mask,
typename Dist::value_type* s_query, typename Dist::value_type* s_train,
float& bestDistance, int& bestTrainIdx, int& bestImgIdx)
{
for (int t = 0, endt = (train.rows + BLOCK_SIZE - 1) / BLOCK_SIZE; t < endt; ++t)
{
Dist dist;
#pragma unroll
for (int i = 0; i < MAX_DESC_LEN / BLOCK_SIZE; ++i)
{
const int loadX = threadIdx.x + i * BLOCK_SIZE;
s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = 0;
if (loadX < train.cols)
{
T val;
ForceGlob<T>::Load(train.ptr(::min(t * BLOCK_SIZE + threadIdx.y, train.rows - 1)), loadX, val);
s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = val;
}
__syncthreads();
#pragma unroll
for (int j = 0; j < BLOCK_SIZE; ++j)
dist.reduceIter(s_query[threadIdx.y * MAX_DESC_LEN + i * BLOCK_SIZE + j], s_train[j * BLOCK_SIZE + threadIdx.x]);
__syncthreads();
}
typename Dist::result_type distVal = dist;
const int trainIdx = t * BLOCK_SIZE + threadIdx.x;
if (queryIdx < query.rows && trainIdx < train.rows && distVal < bestDistance && mask(queryIdx, trainIdx))
{
bestImgIdx = imgIdx;
bestDistance = distVal;
bestTrainIdx = trainIdx;
}
}
}
template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
__global__ void matchUnrolledCached(const PtrStepSz<T> query, const PtrStepSz<T> train, const Mask mask, int* bestTrainIdx, float* bestDistance)
{
extern __shared__ int smem[];
const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y;
typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);
typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * MAX_DESC_LEN);
loadQueryToSmem<BLOCK_SIZE, MAX_DESC_LEN>(queryIdx, query, s_query);
float myBestDistance = numeric_limits<float>::max();
int myBestTrainIdx = -1;
loopUnrolledCached<BLOCK_SIZE, MAX_DESC_LEN, Dist>(queryIdx, query, 0, train, mask, s_query, s_train, myBestDistance, myBestTrainIdx, myBestTrainIdx);
__syncthreads();
float* s_distance = (float*)(smem);
int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE);
findBestMatch<BLOCK_SIZE>(myBestDistance, myBestTrainIdx, s_distance, s_trainIdx);
if (queryIdx < query.rows && threadIdx.x == 0)
{
bestTrainIdx[queryIdx] = myBestTrainIdx;
bestDistance[queryIdx] = myBestDistance;
}
}
template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
void matchUnrolledCached(const PtrStepSz<T>& query, const PtrStepSz<T>& train, const Mask& mask,
const PtrStepSzi& trainIdx, const PtrStepSzf& distance,
cudaStream_t stream)
{
const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
const dim3 grid(divUp(query.rows, BLOCK_SIZE));
const size_t smemSize = (BLOCK_SIZE * (MAX_DESC_LEN >= BLOCK_SIZE ? MAX_DESC_LEN : BLOCK_SIZE) + BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);
matchUnrolledCached<BLOCK_SIZE, MAX_DESC_LEN, Dist><<<grid, block, smemSize, stream>>>(query, train, mask, trainIdx.data, distance.data);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
__global__ void matchUnrolledCached(const PtrStepSz<T> query, const PtrStepSz<T>* trains, int n, const Mask mask,
int* bestTrainIdx, int* bestImgIdx, float* bestDistance)
{
extern __shared__ int smem[];
const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y;
typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);
typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * MAX_DESC_LEN);
loadQueryToSmem<BLOCK_SIZE, MAX_DESC_LEN>(queryIdx, query, s_query);
float myBestDistance = numeric_limits<float>::max();
int myBestTrainIdx = -1;
int myBestImgIdx = -1;
Mask m = mask;
for (int imgIdx = 0; imgIdx < n; ++imgIdx)
{
const PtrStepSz<T> train = trains[imgIdx];
m.next();
loopUnrolledCached<BLOCK_SIZE, MAX_DESC_LEN, Dist>(queryIdx, query, imgIdx, train, m, s_query, s_train, myBestDistance, myBestTrainIdx, myBestImgIdx);
}
__syncthreads();
float* s_distance = (float*)(smem);
int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE);
int* s_imgIdx = (int*)(smem + 2 * BLOCK_SIZE * BLOCK_SIZE);
findBestMatch<BLOCK_SIZE>(myBestDistance, myBestTrainIdx, myBestImgIdx, s_distance, s_trainIdx, s_imgIdx);
if (queryIdx < query.rows && threadIdx.x == 0)
{
bestTrainIdx[queryIdx] = myBestTrainIdx;
bestImgIdx[queryIdx] = myBestImgIdx;
bestDistance[queryIdx] = myBestDistance;
}
}
template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
void matchUnrolledCached(const PtrStepSz<T>& query, const PtrStepSz<T>* trains, int n, const Mask& mask,
const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance,
cudaStream_t stream)
{
const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
const dim3 grid(divUp(query.rows, BLOCK_SIZE));
const size_t smemSize = (BLOCK_SIZE * (MAX_DESC_LEN >= 2 * BLOCK_SIZE ? MAX_DESC_LEN : 2 * BLOCK_SIZE) + BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);
matchUnrolledCached<BLOCK_SIZE, MAX_DESC_LEN, Dist><<<grid, block, smemSize, stream>>>(query, trains, n, mask, trainIdx.data, imgIdx.data, distance.data);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
///////////////////////////////////////////////////////////////////////////////
// Match Unrolled
template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
__device__ void loopUnrolled(int queryIdx, const PtrStepSz<T>& query,volatile int imgIdx, const PtrStepSz<T>& train, const Mask& mask,
typename Dist::value_type* s_query, typename Dist::value_type* s_train,
float& bestDistance, int& bestTrainIdx, int& bestImgIdx)
{
for (int t = 0, endt = (train.rows + BLOCK_SIZE - 1) / BLOCK_SIZE; t < endt; ++t)
{
Dist dist;
#pragma unroll
for (int i = 0; i < MAX_DESC_LEN / BLOCK_SIZE; ++i)
{
const int loadX = threadIdx.x + i * BLOCK_SIZE;
s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = 0;
s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = 0;
if (loadX < query.cols)
{
T val;
ForceGlob<T>::Load(query.ptr(::min(queryIdx, query.rows - 1)), loadX, val);
s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = val;
ForceGlob<T>::Load(train.ptr(::min(t * BLOCK_SIZE + threadIdx.y, train.rows - 1)), loadX, val);
s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = val;
}
__syncthreads();
#pragma unroll
for (int j = 0; j < BLOCK_SIZE; ++j)
dist.reduceIter(s_query[threadIdx.y * BLOCK_SIZE + j], s_train[j * BLOCK_SIZE + threadIdx.x]);
__syncthreads();
}
typename Dist::result_type distVal = dist;
const int trainIdx = t * BLOCK_SIZE + threadIdx.x;
if (queryIdx < query.rows && trainIdx < train.rows && distVal < bestDistance && mask(queryIdx, trainIdx))
{
bestImgIdx = imgIdx;
bestDistance = distVal;
bestTrainIdx = trainIdx;
}
}
}
template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
__global__ void matchUnrolled(const PtrStepSz<T> query, const PtrStepSz<T> train, const Mask mask, int* bestTrainIdx, float* bestDistance)
{
extern __shared__ int smem[];
const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y;
float myBestDistance = numeric_limits<float>::max();
int myBestTrainIdx = -1;
typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);
typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE);
loopUnrolled<BLOCK_SIZE, MAX_DESC_LEN, Dist>(queryIdx, query, 0, train, mask, s_query, s_train, myBestDistance, myBestTrainIdx, myBestTrainIdx);
__syncthreads();
float* s_distance = (float*)(smem);
int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE);
findBestMatch<BLOCK_SIZE>(myBestDistance, myBestTrainIdx, s_distance, s_trainIdx);
if (queryIdx < query.rows && threadIdx.x == 0)
{
bestTrainIdx[queryIdx] = myBestTrainIdx;
bestDistance[queryIdx] = myBestDistance;
}
}
template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
void matchUnrolled(const PtrStepSz<T>& query, const PtrStepSz<T>& train, const Mask& mask,
const PtrStepSzi& trainIdx, const PtrStepSzf& distance,
cudaStream_t stream)
{
const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
const dim3 grid(divUp(query.rows, BLOCK_SIZE));
const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);
matchUnrolled<BLOCK_SIZE, MAX_DESC_LEN, Dist><<<grid, block, smemSize, stream>>>(query, train, mask, trainIdx.data, distance.data);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
__global__ void matchUnrolled(const PtrStepSz<T> query, const PtrStepSz<T>* trains, int n, const Mask mask,
int* bestTrainIdx, int* bestImgIdx, float* bestDistance)
{
extern __shared__ int smem[];
const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y;
float myBestDistance = numeric_limits<float>::max();
int myBestTrainIdx = -1;
int myBestImgIdx = -1;
typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);
typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE);
Mask m = mask;
for (int imgIdx = 0; imgIdx < n; ++imgIdx)
{
const PtrStepSz<T> train = trains[imgIdx];
m.next();
loopUnrolled<BLOCK_SIZE, MAX_DESC_LEN, Dist>(queryIdx, query, imgIdx, train, m, s_query, s_train, myBestDistance, myBestTrainIdx, myBestImgIdx);
}
__syncthreads();
float* s_distance = (float*)(smem);
int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE);
int* s_imgIdxIdx = (int*)(smem + 2 * BLOCK_SIZE * BLOCK_SIZE);
findBestMatch<BLOCK_SIZE>(myBestDistance, myBestTrainIdx, myBestImgIdx, s_distance, s_trainIdx, s_imgIdxIdx);
if (queryIdx < query.rows && threadIdx.x == 0)
{
bestTrainIdx[queryIdx] = myBestTrainIdx;
bestImgIdx[queryIdx] = myBestImgIdx;
bestDistance[queryIdx] = myBestDistance;
}
}
template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
void matchUnrolled(const PtrStepSz<T>& query, const PtrStepSz<T>* trains, int n, const Mask& mask,
const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance,
cudaStream_t stream)
{
const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
const dim3 grid(divUp(query.rows, BLOCK_SIZE));
const size_t smemSize = (3 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);
matchUnrolled<BLOCK_SIZE, MAX_DESC_LEN, Dist><<<grid, block, smemSize, stream>>>(query, trains, n, mask, trainIdx.data, imgIdx.data, distance.data);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
///////////////////////////////////////////////////////////////////////////////
// Match
template <int BLOCK_SIZE, typename Dist, typename T, typename Mask>
__device__ void loop(int queryIdx, const PtrStepSz<T>& query, volatile int imgIdx, const PtrStepSz<T>& train, const Mask& mask,
typename Dist::value_type* s_query, typename Dist::value_type* s_train,
float& bestDistance, int& bestTrainIdx, int& bestImgIdx)
{
for (int t = 0, endt = (train.rows + BLOCK_SIZE - 1) / BLOCK_SIZE; t < endt; ++t)
{
Dist dist;
for (int i = 0, endi = (query.cols + BLOCK_SIZE - 1) / BLOCK_SIZE; i < endi; ++i)
{
const int loadX = threadIdx.x + i * BLOCK_SIZE;
s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = 0;
s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = 0;
if (loadX < query.cols)
{
T val;
ForceGlob<T>::Load(query.ptr(::min(queryIdx, query.rows - 1)), loadX, val);
s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = val;
ForceGlob<T>::Load(train.ptr(::min(t * BLOCK_SIZE + threadIdx.y, train.rows - 1)), loadX, val);
s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = val;
}
__syncthreads();
#pragma unroll
for (int j = 0; j < BLOCK_SIZE; ++j)
dist.reduceIter(s_query[threadIdx.y * BLOCK_SIZE + j], s_train[j * BLOCK_SIZE + threadIdx.x]);
__syncthreads();
}
typename Dist::result_type distVal = dist;
const int trainIdx = t * BLOCK_SIZE + threadIdx.x;
if (queryIdx < query.rows && trainIdx < train.rows && distVal < bestDistance && mask(queryIdx, trainIdx))
{
bestImgIdx = imgIdx;
bestDistance = distVal;
bestTrainIdx = trainIdx;
}
}
}
template <int BLOCK_SIZE, typename Dist, typename T, typename Mask>
__global__ void match(const PtrStepSz<T> query, const PtrStepSz<T> train, const Mask mask, int* bestTrainIdx, float* bestDistance)
{
extern __shared__ int smem[];
const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y;
float myBestDistance = numeric_limits<float>::max();
int myBestTrainIdx = -1;
typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);
typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE);
loop<BLOCK_SIZE, Dist>(queryIdx, query, 0, train, mask, s_query, s_train, myBestDistance, myBestTrainIdx, myBestTrainIdx);
__syncthreads();
float* s_distance = (float*)(smem);
int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE);
findBestMatch<BLOCK_SIZE>(myBestDistance, myBestTrainIdx, s_distance, s_trainIdx);
if (queryIdx < query.rows && threadIdx.x == 0)
{
bestTrainIdx[queryIdx] = myBestTrainIdx;
bestDistance[queryIdx] = myBestDistance;
}
}
template <int BLOCK_SIZE, typename Dist, typename T, typename Mask>
void match(const PtrStepSz<T>& query, const PtrStepSz<T>& train, const Mask& mask,
const PtrStepSzi& trainIdx, const PtrStepSzf& distance,
cudaStream_t stream)
{
const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
const dim3 grid(divUp(query.rows, BLOCK_SIZE));
const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);
match<BLOCK_SIZE, Dist><<<grid, block, smemSize, stream>>>(query, train, mask, trainIdx.data, distance.data);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
template <int BLOCK_SIZE, typename Dist, typename T, typename Mask>
__global__ void match(const PtrStepSz<T> query, const PtrStepSz<T>* trains, int n, const Mask mask,
int* bestTrainIdx, int* bestImgIdx, float* bestDistance)
{
extern __shared__ int smem[];
const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y;
float myBestDistance = numeric_limits<float>::max();
int myBestTrainIdx = -1;
int myBestImgIdx = -1;
typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);
typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE);
Mask m = mask;
for (int imgIdx = 0; imgIdx < n; ++imgIdx)
{
const PtrStepSz<T> train = trains[imgIdx];
m.next();
loop<BLOCK_SIZE, Dist>(queryIdx, query, imgIdx, train, m, s_query, s_train, myBestDistance, myBestTrainIdx, myBestImgIdx);
}
__syncthreads();
float* s_distance = (float*)(smem);
int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE);
int* s_imgIdxIdx = (int*)(smem + 2 * BLOCK_SIZE * BLOCK_SIZE);
findBestMatch<BLOCK_SIZE>(myBestDistance, myBestTrainIdx, myBestImgIdx, s_distance, s_trainIdx, s_imgIdxIdx);
if (queryIdx < query.rows && threadIdx.x == 0)
{
bestTrainIdx[queryIdx] = myBestTrainIdx;
bestImgIdx[queryIdx] = myBestImgIdx;
bestDistance[queryIdx] = myBestDistance;
}
}
template <int BLOCK_SIZE, typename Dist, typename T, typename Mask>
void match(const PtrStepSz<T>& query, const PtrStepSz<T>* trains, int n, const Mask& mask,
const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance,
cudaStream_t stream)
{
const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
const dim3 grid(divUp(query.rows, BLOCK_SIZE));
const size_t smemSize = (3 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);
match<BLOCK_SIZE, Dist><<<grid, block, smemSize, stream>>>(query, trains, n, mask, trainIdx.data, imgIdx.data, distance.data);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
///////////////////////////////////////////////////////////////////////////////
// Match dispatcher
template <typename Dist, typename T, typename Mask>
void matchDispatcher(const PtrStepSz<T>& query, const PtrStepSz<T>& train, const Mask& mask,
const PtrStepSzi& trainIdx, const PtrStepSzf& distance,
cudaStream_t stream)
{
if (query.cols <= 64)
{
matchUnrolledCached<16, 64, Dist>(query, train, mask, trainIdx, distance, stream);
}
else if (query.cols <= 128)
{
matchUnrolledCached<16, 128, Dist>(query, train, mask, trainIdx, distance, stream);
}
/*else if (query.cols <= 256)
{
matchUnrolled<16, 256, Dist>(query, train, mask, trainIdx, distance, stream);
}
else if (query.cols <= 512)
{
matchUnrolled<16, 512, Dist>(query, train, mask, trainIdx, distance, stream);
}
else if (query.cols <= 1024)
{
matchUnrolled<16, 1024, Dist>(query, train, mask, trainIdx, distance, stream);
}*/
else
{
match<16, Dist>(query, train, mask, trainIdx, distance, stream);
}
}
template <typename Dist, typename T, typename Mask>
void matchDispatcher(const PtrStepSz<T>& query, const PtrStepSz<T>* trains, int n, const Mask& mask,
const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance,
cudaStream_t stream)
{
if (query.cols <= 64)
{
matchUnrolledCached<16, 64, Dist>(query, trains, n, mask, trainIdx, imgIdx, distance, stream);
}
else if (query.cols <= 128)
{
matchUnrolledCached<16, 128, Dist>(query, trains, n, mask, trainIdx, imgIdx, distance, stream);
}
/*else if (query.cols <= 256)
{
matchUnrolled<16, 256, Dist>(query, trains, n, mask, trainIdx, imgIdx, distance, stream);
}
else if (query.cols <= 512)
{
matchUnrolled<16, 512, Dist>(query, trains, n, mask, trainIdx, imgIdx, distance, stream);
}
else if (query.cols <= 1024)
{
matchUnrolled<16, 1024, Dist>(query, trains, n, mask, trainIdx, imgIdx, distance, stream);
}*/
else
{
match<16, Dist>(query, trains, n, mask, trainIdx, imgIdx, distance, stream);
}
}
///////////////////////////////////////////////////////////////////////////////
// Match caller
template <typename T> void matchL1_gpu(const PtrStepSzb& query, const PtrStepSzb& train, const PtrStepSzb& mask,
const PtrStepSzi& trainIdx, const PtrStepSzf& distance,
cudaStream_t stream)
{
if (mask.data)
{
matchDispatcher< L1Dist<T> >(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), SingleMask(mask),
trainIdx, distance,
stream);
}
else
{
matchDispatcher< L1Dist<T> >(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), WithOutMask(),
trainIdx, distance,
stream);
}
}
template void matchL1_gpu<uchar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
//template void matchL1_gpu<schar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
template void matchL1_gpu<ushort>(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
template void matchL1_gpu<short >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
template void matchL1_gpu<int >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
template void matchL1_gpu<float >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
template <typename T> void matchL2_gpu(const PtrStepSzb& query, const PtrStepSzb& train, const PtrStepSzb& mask,
const PtrStepSzi& trainIdx, const PtrStepSzf& distance,
cudaStream_t stream)
{
if (mask.data)
{
matchDispatcher<L2Dist>(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), SingleMask(mask),
trainIdx, distance,
stream);
}
else
{
matchDispatcher<L2Dist>(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), WithOutMask(),
trainIdx, distance,
stream);
}
}
//template void matchL2_gpu<uchar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
//template void matchL2_gpu<schar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
//template void matchL2_gpu<ushort>(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
//template void matchL2_gpu<short >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
//template void matchL2_gpu<int >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
template void matchL2_gpu<float >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
template <typename T> void matchHamming_gpu(const PtrStepSzb& query, const PtrStepSzb& train, const PtrStepSzb& mask,
const PtrStepSzi& trainIdx, const PtrStepSzf& distance,
cudaStream_t stream)
{
if (mask.data)
{
matchDispatcher<HammingDist>(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), SingleMask(mask),
trainIdx, distance,
stream);
}
else
{
matchDispatcher<HammingDist>(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), WithOutMask(),
trainIdx, distance,
stream);
}
}
template void matchHamming_gpu<uchar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
//template void matchHamming_gpu<schar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
template void matchHamming_gpu<ushort>(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
//template void matchHamming_gpu<short >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
template void matchHamming_gpu<int >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
template <typename T> void matchL1_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks,
const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance,
cudaStream_t stream)
{
if (masks.data)
{
matchDispatcher< L1Dist<T> >(static_cast< PtrStepSz<T> >(query), (const PtrStepSz<T>*)trains.ptr(), trains.cols, MaskCollection(masks.data),
trainIdx, imgIdx, distance,
stream);
}
else
{
matchDispatcher< L1Dist<T> >(static_cast< PtrStepSz<T> >(query), (const PtrStepSz<T>*)trains.ptr(), trains.cols, WithOutMask(),
trainIdx, imgIdx, distance,
stream);
}
}
template void matchL1_gpu<uchar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
//template void matchL1_gpu<schar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
template void matchL1_gpu<ushort>(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
template void matchL1_gpu<short >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
template void matchL1_gpu<int >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
template void matchL1_gpu<float >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
template <typename T> void matchL2_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks,
const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance,
cudaStream_t stream)
{
if (masks.data)
{
matchDispatcher<L2Dist>(static_cast< PtrStepSz<T> >(query), (const PtrStepSz<T>*)trains.ptr(), trains.cols, MaskCollection(masks.data),
trainIdx, imgIdx, distance,
stream);
}
else
{
matchDispatcher<L2Dist>(static_cast< PtrStepSz<T> >(query), (const PtrStepSz<T>*)trains.ptr(), trains.cols, WithOutMask(),
trainIdx, imgIdx, distance,
stream);
}
}
//template void matchL2_gpu<uchar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
//template void matchL2_gpu<schar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
//template void matchL2_gpu<ushort>(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
//template void matchL2_gpu<short >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
//template void matchL2_gpu<int >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
template void matchL2_gpu<float >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& maskCollection, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
template <typename T> void matchHamming_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks,
const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance,
cudaStream_t stream)
{
if (masks.data)
{
matchDispatcher<HammingDist>(static_cast< PtrStepSz<T> >(query), (const PtrStepSz<T>*)trains.ptr(), trains.cols, MaskCollection(masks.data),
trainIdx, imgIdx, distance,
stream);
}
else
{
matchDispatcher<HammingDist>(static_cast< PtrStepSz<T> >(query), (const PtrStepSz<T>*)trains.ptr(), trains.cols, WithOutMask(),
trainIdx, imgIdx, distance,
stream);
}
}
template void matchHamming_gpu<uchar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
//template void matchHamming_gpu<schar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
template void matchHamming_gpu<ushort>(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
//template void matchHamming_gpu<short >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
template void matchHamming_gpu<int >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
} // namespace bf_match
}}} // namespace cv { namespace gpu { namespace cudev {
#endif /* CUDA_DISABLER */

View File

@@ -0,0 +1,463 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if !defined CUDA_DISABLER
#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/utility.hpp"
#include "opencv2/core/cuda/limits.hpp"
#include "opencv2/core/cuda/vec_distance.hpp"
#include "opencv2/core/cuda/datamov_utils.hpp"
namespace cv { namespace gpu { namespace cudev
{
namespace bf_radius_match
{
///////////////////////////////////////////////////////////////////////////////
// Match Unrolled
template <int BLOCK_SIZE, int MAX_DESC_LEN, bool SAVE_IMG_IDX, typename Dist, typename T, typename Mask>
__global__ void matchUnrolled(const PtrStepSz<T> query, int imgIdx, const PtrStepSz<T> train, float maxDistance, const Mask mask,
PtrStepi bestTrainIdx, PtrStepi bestImgIdx, PtrStepf bestDistance, unsigned int* nMatches, int maxCount)
{
extern __shared__ int smem[];
const int queryIdx = blockIdx.y * BLOCK_SIZE + threadIdx.y;
const int trainIdx = blockIdx.x * BLOCK_SIZE + threadIdx.x;
typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);
typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE);
Dist dist;
#pragma unroll
for (int i = 0; i < MAX_DESC_LEN / BLOCK_SIZE; ++i)
{
const int loadX = threadIdx.x + i * BLOCK_SIZE;
s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = 0;
s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = 0;
if (loadX < query.cols)
{
T val;
ForceGlob<T>::Load(query.ptr(::min(queryIdx, query.rows - 1)), loadX, val);
s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = val;
ForceGlob<T>::Load(train.ptr(::min(blockIdx.x * BLOCK_SIZE + threadIdx.y, train.rows - 1)), loadX, val);
s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = val;
}
__syncthreads();
#pragma unroll
for (int j = 0; j < BLOCK_SIZE; ++j)
dist.reduceIter(s_query[threadIdx.y * BLOCK_SIZE + j], s_train[j * BLOCK_SIZE + threadIdx.x]);
__syncthreads();
}
float distVal = (typename Dist::result_type)dist;
if (queryIdx < query.rows && trainIdx < train.rows && mask(queryIdx, trainIdx) && distVal < maxDistance)
{
unsigned int ind = atomicInc(nMatches + queryIdx, (unsigned int) -1);
if (ind < maxCount)
{
bestTrainIdx.ptr(queryIdx)[ind] = trainIdx;
if (SAVE_IMG_IDX) bestImgIdx.ptr(queryIdx)[ind] = imgIdx;
bestDistance.ptr(queryIdx)[ind] = distVal;
}
}
}
template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
void matchUnrolled(const PtrStepSz<T>& query, const PtrStepSz<T>& train, float maxDistance, const Mask& mask,
const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream)
{
const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
const dim3 grid(divUp(train.rows, BLOCK_SIZE), divUp(query.rows, BLOCK_SIZE));
const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);
matchUnrolled<BLOCK_SIZE, MAX_DESC_LEN, false, Dist><<<grid, block, smemSize, stream>>>(query, 0, train, maxDistance, mask,
trainIdx, PtrStepi(), distance, nMatches.data, trainIdx.cols);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T>
void matchUnrolled(const PtrStepSz<T>& query, const PtrStepSz<T>* trains, int n, float maxDistance, const PtrStepSzb* masks,
const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches,
cudaStream_t stream)
{
const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);
for (int i = 0; i < n; ++i)
{
const PtrStepSz<T> train = trains[i];
const dim3 grid(divUp(train.rows, BLOCK_SIZE), divUp(query.rows, BLOCK_SIZE));
if (masks != 0 && masks[i].data)
{
matchUnrolled<BLOCK_SIZE, MAX_DESC_LEN, true, Dist><<<grid, block, smemSize, stream>>>(query, i, train, maxDistance, SingleMask(masks[i]),
trainIdx, imgIdx, distance, nMatches.data, trainIdx.cols);
}
else
{
matchUnrolled<BLOCK_SIZE, MAX_DESC_LEN, true, Dist><<<grid, block, smemSize, stream>>>(query, i, train, maxDistance, WithOutMask(),
trainIdx, imgIdx, distance, nMatches.data, trainIdx.cols);
}
cudaSafeCall( cudaGetLastError() );
}
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
///////////////////////////////////////////////////////////////////////////////
// Match
template <int BLOCK_SIZE, bool SAVE_IMG_IDX, typename Dist, typename T, typename Mask>
__global__ void match(const PtrStepSz<T> query, int imgIdx, const PtrStepSz<T> train, float maxDistance, const Mask mask,
PtrStepi bestTrainIdx, PtrStepi bestImgIdx, PtrStepf bestDistance, unsigned int* nMatches, int maxCount)
{
extern __shared__ int smem[];
const int queryIdx = blockIdx.y * BLOCK_SIZE + threadIdx.y;
const int trainIdx = blockIdx.x * BLOCK_SIZE + threadIdx.x;
typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);
typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE);
Dist dist;
for (int i = 0, endi = (query.cols + BLOCK_SIZE - 1) / BLOCK_SIZE; i < endi; ++i)
{
const int loadX = threadIdx.x + i * BLOCK_SIZE;
s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = 0;
s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = 0;
if (loadX < query.cols)
{
T val;
ForceGlob<T>::Load(query.ptr(::min(queryIdx, query.rows - 1)), loadX, val);
s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = val;
ForceGlob<T>::Load(train.ptr(::min(blockIdx.x * BLOCK_SIZE + threadIdx.y, train.rows - 1)), loadX, val);
s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = val;
}
__syncthreads();
#pragma unroll
for (int j = 0; j < BLOCK_SIZE; ++j)
dist.reduceIter(s_query[threadIdx.y * BLOCK_SIZE + j], s_train[j * BLOCK_SIZE + threadIdx.x]);
__syncthreads();
}
float distVal = (typename Dist::result_type)dist;
if (queryIdx < query.rows && trainIdx < train.rows && mask(queryIdx, trainIdx) && distVal < maxDistance)
{
unsigned int ind = atomicInc(nMatches + queryIdx, (unsigned int) -1);
if (ind < maxCount)
{
bestTrainIdx.ptr(queryIdx)[ind] = trainIdx;
if (SAVE_IMG_IDX) bestImgIdx.ptr(queryIdx)[ind] = imgIdx;
bestDistance.ptr(queryIdx)[ind] = distVal;
}
}
}
template <int BLOCK_SIZE, typename Dist, typename T, typename Mask>
void match(const PtrStepSz<T>& query, const PtrStepSz<T>& train, float maxDistance, const Mask& mask,
const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches,
cudaStream_t stream)
{
const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
const dim3 grid(divUp(train.rows, BLOCK_SIZE), divUp(query.rows, BLOCK_SIZE));
const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);
match<BLOCK_SIZE, false, Dist><<<grid, block, smemSize, stream>>>(query, 0, train, maxDistance, mask,
trainIdx, PtrStepi(), distance, nMatches.data, trainIdx.cols);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
template <int BLOCK_SIZE, typename Dist, typename T>
void match(const PtrStepSz<T>& query, const PtrStepSz<T>* trains, int n, float maxDistance, const PtrStepSzb* masks,
const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches,
cudaStream_t stream)
{
const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);
for (int i = 0; i < n; ++i)
{
const PtrStepSz<T> train = trains[i];
const dim3 grid(divUp(train.rows, BLOCK_SIZE), divUp(query.rows, BLOCK_SIZE));
if (masks != 0 && masks[i].data)
{
match<BLOCK_SIZE, true, Dist><<<grid, block, smemSize, stream>>>(query, i, train, maxDistance, SingleMask(masks[i]),
trainIdx, imgIdx, distance, nMatches.data, trainIdx.cols);
}
else
{
match<BLOCK_SIZE, true, Dist><<<grid, block, smemSize, stream>>>(query, i, train, maxDistance, WithOutMask(),
trainIdx, imgIdx, distance, nMatches.data, trainIdx.cols);
}
cudaSafeCall( cudaGetLastError() );
}
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
///////////////////////////////////////////////////////////////////////////////
// Match dispatcher
template <typename Dist, typename T, typename Mask>
void matchDispatcher(const PtrStepSz<T>& query, const PtrStepSz<T>& train, float maxDistance, const Mask& mask,
const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches,
cudaStream_t stream)
{
if (query.cols <= 64)
{
matchUnrolled<16, 64, Dist>(query, train, maxDistance, mask, trainIdx, distance, nMatches, stream);
}
else if (query.cols <= 128)
{
matchUnrolled<16, 128, Dist>(query, train, maxDistance, mask, trainIdx, distance, nMatches, stream);
}
/*else if (query.cols <= 256)
{
matchUnrolled<16, 256, Dist>(query, train, maxDistance, mask, trainIdx, distance, nMatches, stream);
}
else if (query.cols <= 512)
{
matchUnrolled<16, 512, Dist>(query, train, maxDistance, mask, trainIdx, distance, nMatches, stream);
}
else if (query.cols <= 1024)
{
matchUnrolled<16, 1024, Dist>(query, train, maxDistance, mask, trainIdx, distance, nMatches, stream);
}*/
else
{
match<16, Dist>(query, train, maxDistance, mask, trainIdx, distance, nMatches, stream);
}
}
template <typename Dist, typename T>
void matchDispatcher(const PtrStepSz<T>& query, const PtrStepSz<T>* trains, int n, float maxDistance, const PtrStepSzb* masks,
const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches,
cudaStream_t stream)
{
if (query.cols <= 64)
{
matchUnrolled<16, 64, Dist>(query, trains, n, maxDistance, masks, trainIdx, imgIdx, distance, nMatches, stream);
}
else if (query.cols <= 128)
{
matchUnrolled<16, 128, Dist>(query, trains, n, maxDistance, masks, trainIdx, imgIdx, distance, nMatches, stream);
}
/*else if (query.cols <= 256)
{
matchUnrolled<16, 256, Dist>(query, trains, n, maxDistance, masks, trainIdx, imgIdx, distance, nMatches, stream);
}
else if (query.cols <= 512)
{
matchUnrolled<16, 512, Dist>(query, trains, n, maxDistance, masks, trainIdx, imgIdx, distance, nMatches, stream);
}
else if (query.cols <= 1024)
{
matchUnrolled<16, 1024, Dist>(query, trains, n, maxDistance, masks, trainIdx, imgIdx, distance, nMatches, stream);
}*/
else
{
match<16, Dist>(query, trains, n, maxDistance, masks, trainIdx, imgIdx, distance, nMatches, stream);
}
}
///////////////////////////////////////////////////////////////////////////////
// Radius Match caller
template <typename T> void matchL1_gpu(const PtrStepSzb& query, const PtrStepSzb& train, float maxDistance, const PtrStepSzb& mask,
const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches,
cudaStream_t stream)
{
if (mask.data)
{
matchDispatcher< L1Dist<T> >(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), maxDistance, SingleMask(mask),
trainIdx, distance, nMatches,
stream);
}
else
{
matchDispatcher< L1Dist<T> >(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), maxDistance, WithOutMask(),
trainIdx, distance, nMatches,
stream);
}
}
template void matchL1_gpu<uchar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
//template void matchL1_gpu<schar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
template void matchL1_gpu<ushort>(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
template void matchL1_gpu<short >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
template void matchL1_gpu<int >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
template void matchL1_gpu<float >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
template <typename T> void matchL2_gpu(const PtrStepSzb& query, const PtrStepSzb& train, float maxDistance, const PtrStepSzb& mask,
const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches,
cudaStream_t stream)
{
if (mask.data)
{
matchDispatcher<L2Dist>(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), maxDistance, SingleMask(mask),
trainIdx, distance, nMatches,
stream);
}
else
{
matchDispatcher<L2Dist>(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), maxDistance, WithOutMask(),
trainIdx, distance, nMatches,
stream);
}
}
//template void matchL2_gpu<uchar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
//template void matchL2_gpu<schar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
//template void matchL2_gpu<ushort>(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
//template void matchL2_gpu<short >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
//template void matchL2_gpu<int >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
template void matchL2_gpu<float >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
template <typename T> void matchHamming_gpu(const PtrStepSzb& query, const PtrStepSzb& train, float maxDistance, const PtrStepSzb& mask,
const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches,
cudaStream_t stream)
{
if (mask.data)
{
matchDispatcher<HammingDist>(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), maxDistance, SingleMask(mask),
trainIdx, distance, nMatches,
stream);
}
else
{
matchDispatcher<HammingDist>(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), maxDistance, WithOutMask(),
trainIdx, distance, nMatches,
stream);
}
}
template void matchHamming_gpu<uchar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
//template void matchHamming_gpu<schar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
template void matchHamming_gpu<ushort>(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
//template void matchHamming_gpu<short >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
template void matchHamming_gpu<int >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
template <typename T> void matchL1_gpu(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks,
const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches,
cudaStream_t stream)
{
matchDispatcher< L1Dist<T> >(static_cast< PtrStepSz<T> >(query), (const PtrStepSz<T>*)trains, n, maxDistance, masks,
trainIdx, imgIdx, distance, nMatches,
stream);
}
template void matchL1_gpu<uchar >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
//template void matchL1_gpu<schar >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
template void matchL1_gpu<ushort>(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
template void matchL1_gpu<short >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
template void matchL1_gpu<int >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
template void matchL1_gpu<float >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
template <typename T> void matchL2_gpu(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks,
const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches,
cudaStream_t stream)
{
matchDispatcher<L2Dist>(static_cast< PtrStepSz<T> >(query), (const PtrStepSz<T>*)trains, n, maxDistance, masks,
trainIdx, imgIdx, distance, nMatches,
stream);
}
//template void matchL2_gpu<uchar >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
//template void matchL2_gpu<schar >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
//template void matchL2_gpu<ushort>(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
//template void matchL2_gpu<short >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
//template void matchL2_gpu<int >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
template void matchL2_gpu<float >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
template <typename T> void matchHamming_gpu(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks,
const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches,
cudaStream_t stream)
{
matchDispatcher<HammingDist>(static_cast< PtrStepSz<T> >(query), (const PtrStepSz<T>*)trains, n, maxDistance, masks,
trainIdx, imgIdx, distance, nMatches,
stream);
}
template void matchHamming_gpu<uchar >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
//template void matchHamming_gpu<schar >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
template void matchHamming_gpu<ushort>(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
//template void matchHamming_gpu<short >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
template void matchHamming_gpu<int >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
} // namespace bf_radius_match
}}} // namespace cv { namespace gpu { namespace cudev
#endif /* CUDA_DISABLER */

File diff suppressed because one or more lines are too long

View File

@@ -0,0 +1,424 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if !defined CUDA_DISABLER
#include <thrust/device_ptr.h>
#include <thrust/sort.h>
#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/reduce.hpp"
#include "opencv2/core/cuda/functional.hpp"
namespace cv { namespace gpu { namespace cudev
{
namespace orb
{
////////////////////////////////////////////////////////////////////////////////////////////////////////
// cull
int cull_gpu(int* loc, float* response, int size, int n_points)
{
thrust::device_ptr<int> loc_ptr(loc);
thrust::device_ptr<float> response_ptr(response);
thrust::sort_by_key(response_ptr, response_ptr + size, loc_ptr, thrust::greater<float>());
return n_points;
}
////////////////////////////////////////////////////////////////////////////////////////////////////////
// HarrisResponses
__global__ void HarrisResponses(const PtrStepb img, const short2* loc_, float* response, const int npoints, const int blockSize, const float harris_k)
{
__shared__ int smem0[8 * 32];
__shared__ int smem1[8 * 32];
__shared__ int smem2[8 * 32];
const int ptidx = blockIdx.x * blockDim.y + threadIdx.y;
if (ptidx < npoints)
{
const short2 loc = loc_[ptidx];
const int r = blockSize / 2;
const int x0 = loc.x - r;
const int y0 = loc.y - r;
int a = 0, b = 0, c = 0;
for (int ind = threadIdx.x; ind < blockSize * blockSize; ind += blockDim.x)
{
const int i = ind / blockSize;
const int j = ind % blockSize;
int Ix = (img(y0 + i, x0 + j + 1) - img(y0 + i, x0 + j - 1)) * 2 +
(img(y0 + i - 1, x0 + j + 1) - img(y0 + i - 1, x0 + j - 1)) +
(img(y0 + i + 1, x0 + j + 1) - img(y0 + i + 1, x0 + j - 1));
int Iy = (img(y0 + i + 1, x0 + j) - img(y0 + i - 1, x0 + j)) * 2 +
(img(y0 + i + 1, x0 + j - 1) - img(y0 + i - 1, x0 + j - 1)) +
(img(y0 + i + 1, x0 + j + 1) - img(y0 + i - 1, x0 + j + 1));
a += Ix * Ix;
b += Iy * Iy;
c += Ix * Iy;
}
int* srow0 = smem0 + threadIdx.y * blockDim.x;
int* srow1 = smem1 + threadIdx.y * blockDim.x;
int* srow2 = smem2 + threadIdx.y * blockDim.x;
plus<int> op;
reduce<32>(smem_tuple(srow0, srow1, srow2), thrust::tie(a, b, c), threadIdx.x, thrust::make_tuple(op, op, op));
if (threadIdx.x == 0)
{
float scale = (1 << 2) * blockSize * 255.0f;
scale = 1.0f / scale;
const float scale_sq_sq = scale * scale * scale * scale;
response[ptidx] = ((float)a * b - (float)c * c - harris_k * ((float)a + b) * ((float)a + b)) * scale_sq_sq;
}
}
}
void HarrisResponses_gpu(PtrStepSzb img, const short2* loc, float* response, const int npoints, int blockSize, float harris_k, cudaStream_t stream)
{
dim3 block(32, 8);
dim3 grid;
grid.x = divUp(npoints, block.y);
HarrisResponses<<<grid, block, 0, stream>>>(img, loc, response, npoints, blockSize, harris_k);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
////////////////////////////////////////////////////////////////////////////////////////////////////////
// IC_Angle
__constant__ int c_u_max[32];
void loadUMax(const int* u_max, int count)
{
cudaSafeCall( cudaMemcpyToSymbol(c_u_max, u_max, count * sizeof(int)) );
}
__global__ void IC_Angle(const PtrStepb image, const short2* loc_, float* angle, const int npoints, const int half_k)
{
__shared__ int smem0[8 * 32];
__shared__ int smem1[8 * 32];
int* srow0 = smem0 + threadIdx.y * blockDim.x;
int* srow1 = smem1 + threadIdx.y * blockDim.x;
plus<int> op;
const int ptidx = blockIdx.x * blockDim.y + threadIdx.y;
if (ptidx < npoints)
{
int m_01 = 0, m_10 = 0;
const short2 loc = loc_[ptidx];
// Treat the center line differently, v=0
for (int u = threadIdx.x - half_k; u <= half_k; u += blockDim.x)
m_10 += u * image(loc.y, loc.x + u);
reduce<32>(srow0, m_10, threadIdx.x, op);
for (int v = 1; v <= half_k; ++v)
{
// Proceed over the two lines
int v_sum = 0;
int m_sum = 0;
const int d = c_u_max[v];
for (int u = threadIdx.x - d; u <= d; u += blockDim.x)
{
int val_plus = image(loc.y + v, loc.x + u);
int val_minus = image(loc.y - v, loc.x + u);
v_sum += (val_plus - val_minus);
m_sum += u * (val_plus + val_minus);
}
reduce<32>(smem_tuple(srow0, srow1), thrust::tie(v_sum, m_sum), threadIdx.x, thrust::make_tuple(op, op));
m_10 += m_sum;
m_01 += v * v_sum;
}
if (threadIdx.x == 0)
{
float kp_dir = ::atan2f((float)m_01, (float)m_10);
kp_dir += (kp_dir < 0) * (2.0f * CV_PI);
kp_dir *= 180.0f / CV_PI;
angle[ptidx] = kp_dir;
}
}
}
void IC_Angle_gpu(PtrStepSzb image, const short2* loc, float* angle, int npoints, int half_k, cudaStream_t stream)
{
dim3 block(32, 8);
dim3 grid;
grid.x = divUp(npoints, block.y);
IC_Angle<<<grid, block, 0, stream>>>(image, loc, angle, npoints, half_k);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
////////////////////////////////////////////////////////////////////////////////////////////////////////
// computeOrbDescriptor
template <int WTA_K> struct OrbDescriptor;
#define GET_VALUE(idx) \
img(loc.y + __float2int_rn(pattern_x[idx] * sina + pattern_y[idx] * cosa), \
loc.x + __float2int_rn(pattern_x[idx] * cosa - pattern_y[idx] * sina))
template <> struct OrbDescriptor<2>
{
__device__ static int calc(const PtrStepb& img, short2 loc, const int* pattern_x, const int* pattern_y, float sina, float cosa, int i)
{
pattern_x += 16 * i;
pattern_y += 16 * i;
int t0, t1, val;
t0 = GET_VALUE(0); t1 = GET_VALUE(1);
val = t0 < t1;
t0 = GET_VALUE(2); t1 = GET_VALUE(3);
val |= (t0 < t1) << 1;
t0 = GET_VALUE(4); t1 = GET_VALUE(5);
val |= (t0 < t1) << 2;
t0 = GET_VALUE(6); t1 = GET_VALUE(7);
val |= (t0 < t1) << 3;
t0 = GET_VALUE(8); t1 = GET_VALUE(9);
val |= (t0 < t1) << 4;
t0 = GET_VALUE(10); t1 = GET_VALUE(11);
val |= (t0 < t1) << 5;
t0 = GET_VALUE(12); t1 = GET_VALUE(13);
val |= (t0 < t1) << 6;
t0 = GET_VALUE(14); t1 = GET_VALUE(15);
val |= (t0 < t1) << 7;
return val;
}
};
template <> struct OrbDescriptor<3>
{
__device__ static int calc(const PtrStepb& img, short2 loc, const int* pattern_x, const int* pattern_y, float sina, float cosa, int i)
{
pattern_x += 12 * i;
pattern_y += 12 * i;
int t0, t1, t2, val;
t0 = GET_VALUE(0); t1 = GET_VALUE(1); t2 = GET_VALUE(2);
val = t2 > t1 ? (t2 > t0 ? 2 : 0) : (t1 > t0);
t0 = GET_VALUE(3); t1 = GET_VALUE(4); t2 = GET_VALUE(5);
val |= (t2 > t1 ? (t2 > t0 ? 2 : 0) : (t1 > t0)) << 2;
t0 = GET_VALUE(6); t1 = GET_VALUE(7); t2 = GET_VALUE(8);
val |= (t2 > t1 ? (t2 > t0 ? 2 : 0) : (t1 > t0)) << 4;
t0 = GET_VALUE(9); t1 = GET_VALUE(10); t2 = GET_VALUE(11);
val |= (t2 > t1 ? (t2 > t0 ? 2 : 0) : (t1 > t0)) << 6;
return val;
}
};
template <> struct OrbDescriptor<4>
{
__device__ static int calc(const PtrStepb& img, short2 loc, const int* pattern_x, const int* pattern_y, float sina, float cosa, int i)
{
pattern_x += 16 * i;
pattern_y += 16 * i;
int t0, t1, t2, t3, k, val;
int a, b;
t0 = GET_VALUE(0); t1 = GET_VALUE(1);
t2 = GET_VALUE(2); t3 = GET_VALUE(3);
a = 0, b = 2;
if( t1 > t0 ) t0 = t1, a = 1;
if( t3 > t2 ) t2 = t3, b = 3;
k = t0 > t2 ? a : b;
val = k;
t0 = GET_VALUE(4); t1 = GET_VALUE(5);
t2 = GET_VALUE(6); t3 = GET_VALUE(7);
a = 0, b = 2;
if( t1 > t0 ) t0 = t1, a = 1;
if( t3 > t2 ) t2 = t3, b = 3;
k = t0 > t2 ? a : b;
val |= k << 2;
t0 = GET_VALUE(8); t1 = GET_VALUE(9);
t2 = GET_VALUE(10); t3 = GET_VALUE(11);
a = 0, b = 2;
if( t1 > t0 ) t0 = t1, a = 1;
if( t3 > t2 ) t2 = t3, b = 3;
k = t0 > t2 ? a : b;
val |= k << 4;
t0 = GET_VALUE(12); t1 = GET_VALUE(13);
t2 = GET_VALUE(14); t3 = GET_VALUE(15);
a = 0, b = 2;
if( t1 > t0 ) t0 = t1, a = 1;
if( t3 > t2 ) t2 = t3, b = 3;
k = t0 > t2 ? a : b;
val |= k << 6;
return val;
}
};
#undef GET_VALUE
template <int WTA_K>
__global__ void computeOrbDescriptor(const PtrStepb img, const short2* loc, const float* angle_, const int npoints,
const int* pattern_x, const int* pattern_y, PtrStepb desc, int dsize)
{
const int descidx = blockIdx.x * blockDim.x + threadIdx.x;
const int ptidx = blockIdx.y * blockDim.y + threadIdx.y;
if (ptidx < npoints && descidx < dsize)
{
float angle = angle_[ptidx];
angle *= (float)(CV_PI / 180.f);
float sina, cosa;
::sincosf(angle, &sina, &cosa);
desc.ptr(ptidx)[descidx] = OrbDescriptor<WTA_K>::calc(img, loc[ptidx], pattern_x, pattern_y, sina, cosa, descidx);
}
}
void computeOrbDescriptor_gpu(PtrStepb img, const short2* loc, const float* angle, const int npoints,
const int* pattern_x, const int* pattern_y, PtrStepb desc, int dsize, int WTA_K, cudaStream_t stream)
{
dim3 block(32, 8);
dim3 grid;
grid.x = divUp(dsize, block.x);
grid.y = divUp(npoints, block.y);
switch (WTA_K)
{
case 2:
computeOrbDescriptor<2><<<grid, block, 0, stream>>>(img, loc, angle, npoints, pattern_x, pattern_y, desc, dsize);
break;
case 3:
computeOrbDescriptor<3><<<grid, block, 0, stream>>>(img, loc, angle, npoints, pattern_x, pattern_y, desc, dsize);
break;
case 4:
computeOrbDescriptor<4><<<grid, block, 0, stream>>>(img, loc, angle, npoints, pattern_x, pattern_y, desc, dsize);
break;
}
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
////////////////////////////////////////////////////////////////////////////////////////////////////////
// mergeLocation
__global__ void mergeLocation(const short2* loc_, float* x, float* y, const int npoints, float scale)
{
const int ptidx = blockIdx.x * blockDim.x + threadIdx.x;
if (ptidx < npoints)
{
short2 loc = loc_[ptidx];
x[ptidx] = loc.x * scale;
y[ptidx] = loc.y * scale;
}
}
void mergeLocation_gpu(const short2* loc, float* x, float* y, int npoints, float scale, cudaStream_t stream)
{
dim3 block(256);
dim3 grid;
grid.x = divUp(npoints, block.x);
mergeLocation<<<grid, block, 0, stream>>>(loc, x, y, npoints, scale);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
}
}}}
#endif /* CUDA_DISABLER */

View File

@@ -0,0 +1,170 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "precomp.hpp"
using namespace cv;
using namespace cv::gpu;
#if !defined (HAVE_CUDA) || defined (CUDA_DISABLER)
cv::gpu::FAST_GPU::FAST_GPU(int, bool, double) { throw_no_cuda(); }
void cv::gpu::FAST_GPU::operator ()(const GpuMat&, const GpuMat&, GpuMat&) { throw_no_cuda(); }
void cv::gpu::FAST_GPU::operator ()(const GpuMat&, const GpuMat&, std::vector<KeyPoint>&) { throw_no_cuda(); }
void cv::gpu::FAST_GPU::downloadKeypoints(const GpuMat&, std::vector<KeyPoint>&) { throw_no_cuda(); }
void cv::gpu::FAST_GPU::convertKeypoints(const Mat&, std::vector<KeyPoint>&) { throw_no_cuda(); }
void cv::gpu::FAST_GPU::release() { throw_no_cuda(); }
int cv::gpu::FAST_GPU::calcKeyPointsLocation(const GpuMat&, const GpuMat&) { throw_no_cuda(); return 0; }
int cv::gpu::FAST_GPU::getKeyPoints(GpuMat&) { throw_no_cuda(); return 0; }
#else /* !defined (HAVE_CUDA) */
cv::gpu::FAST_GPU::FAST_GPU(int _threshold, bool _nonmaxSupression, double _keypointsRatio) :
nonmaxSupression(_nonmaxSupression), threshold(_threshold), keypointsRatio(_keypointsRatio), count_(0)
{
}
void cv::gpu::FAST_GPU::operator ()(const GpuMat& image, const GpuMat& mask, std::vector<KeyPoint>& keypoints)
{
if (image.empty())
return;
(*this)(image, mask, d_keypoints_);
downloadKeypoints(d_keypoints_, keypoints);
}
void cv::gpu::FAST_GPU::downloadKeypoints(const GpuMat& d_keypoints, std::vector<KeyPoint>& keypoints)
{
if (d_keypoints.empty())
return;
Mat h_keypoints(d_keypoints);
convertKeypoints(h_keypoints, keypoints);
}
void cv::gpu::FAST_GPU::convertKeypoints(const Mat& h_keypoints, std::vector<KeyPoint>& keypoints)
{
if (h_keypoints.empty())
return;
CV_Assert(h_keypoints.rows == ROWS_COUNT && h_keypoints.elemSize() == 4);
int npoints = h_keypoints.cols;
keypoints.resize(npoints);
const short2* loc_row = h_keypoints.ptr<short2>(LOCATION_ROW);
const float* response_row = h_keypoints.ptr<float>(RESPONSE_ROW);
for (int i = 0; i < npoints; ++i)
{
KeyPoint kp(loc_row[i].x, loc_row[i].y, static_cast<float>(FEATURE_SIZE), -1, response_row[i]);
keypoints[i] = kp;
}
}
void cv::gpu::FAST_GPU::operator ()(const GpuMat& img, const GpuMat& mask, GpuMat& keypoints)
{
calcKeyPointsLocation(img, mask);
keypoints.cols = getKeyPoints(keypoints);
}
namespace cv { namespace gpu { namespace cudev
{
namespace fast
{
int calcKeypoints_gpu(PtrStepSzb img, PtrStepSzb mask, short2* kpLoc, int maxKeypoints, PtrStepSzi score, int threshold);
int nonmaxSupression_gpu(const short2* kpLoc, int count, PtrStepSzi score, short2* loc, float* response);
}
}}}
int cv::gpu::FAST_GPU::calcKeyPointsLocation(const GpuMat& img, const GpuMat& mask)
{
using namespace cv::gpu::cudev::fast;
CV_Assert(img.type() == CV_8UC1);
CV_Assert(mask.empty() || (mask.type() == CV_8UC1 && mask.size() == img.size()));
int maxKeypoints = static_cast<int>(keypointsRatio * img.size().area());
ensureSizeIsEnough(1, maxKeypoints, CV_16SC2, kpLoc_);
if (nonmaxSupression)
{
ensureSizeIsEnough(img.size(), CV_32SC1, score_);
score_.setTo(Scalar::all(0));
}
count_ = calcKeypoints_gpu(img, mask, kpLoc_.ptr<short2>(), maxKeypoints, nonmaxSupression ? score_ : PtrStepSzi(), threshold);
count_ = std::min(count_, maxKeypoints);
return count_;
}
int cv::gpu::FAST_GPU::getKeyPoints(GpuMat& keypoints)
{
using namespace cv::gpu::cudev::fast;
if (count_ == 0)
return 0;
ensureSizeIsEnough(ROWS_COUNT, count_, CV_32FC1, keypoints);
if (nonmaxSupression)
return nonmaxSupression_gpu(kpLoc_.ptr<short2>(), count_, score_, keypoints.ptr<short2>(LOCATION_ROW), keypoints.ptr<float>(RESPONSE_ROW));
GpuMat locRow(1, count_, kpLoc_.type(), keypoints.ptr(0));
kpLoc_.colRange(0, count_).copyTo(locRow);
keypoints.row(1).setTo(Scalar::all(0));
return count_;
}
void cv::gpu::FAST_GPU::release()
{
kpLoc_.release();
score_.release();
d_keypoints_.release();
}
#endif /* !defined (HAVE_CUDA) */

View File

@@ -0,0 +1,775 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "precomp.hpp"
using namespace cv;
using namespace cv::gpu;
#if !defined (HAVE_CUDA) || defined (CUDA_DISABLER)
cv::gpu::ORB_GPU::ORB_GPU(int, float, int, int, int, int, int, int) : fastDetector_(20) { throw_no_cuda(); }
void cv::gpu::ORB_GPU::operator()(const GpuMat&, const GpuMat&, std::vector<KeyPoint>&) { throw_no_cuda(); }
void cv::gpu::ORB_GPU::operator()(const GpuMat&, const GpuMat&, GpuMat&) { throw_no_cuda(); }
void cv::gpu::ORB_GPU::operator()(const GpuMat&, const GpuMat&, std::vector<KeyPoint>&, GpuMat&) { throw_no_cuda(); }
void cv::gpu::ORB_GPU::operator()(const GpuMat&, const GpuMat&, GpuMat&, GpuMat&) { throw_no_cuda(); }
void cv::gpu::ORB_GPU::downloadKeyPoints(const GpuMat&, std::vector<KeyPoint>&) { throw_no_cuda(); }
void cv::gpu::ORB_GPU::convertKeyPoints(const Mat&, std::vector<KeyPoint>&) { throw_no_cuda(); }
void cv::gpu::ORB_GPU::release() { throw_no_cuda(); }
void cv::gpu::ORB_GPU::buildScalePyramids(const GpuMat&, const GpuMat&) { throw_no_cuda(); }
void cv::gpu::ORB_GPU::computeKeyPointsPyramid() { throw_no_cuda(); }
void cv::gpu::ORB_GPU::computeDescriptors(GpuMat&) { throw_no_cuda(); }
void cv::gpu::ORB_GPU::mergeKeyPoints(GpuMat&) { throw_no_cuda(); }
#else /* !defined (HAVE_CUDA) */
namespace cv { namespace gpu { namespace cudev
{
namespace orb
{
int cull_gpu(int* loc, float* response, int size, int n_points);
void HarrisResponses_gpu(PtrStepSzb img, const short2* loc, float* response, const int npoints, int blockSize, float harris_k, cudaStream_t stream);
void loadUMax(const int* u_max, int count);
void IC_Angle_gpu(PtrStepSzb image, const short2* loc, float* angle, int npoints, int half_k, cudaStream_t stream);
void computeOrbDescriptor_gpu(PtrStepb img, const short2* loc, const float* angle, const int npoints,
const int* pattern_x, const int* pattern_y, PtrStepb desc, int dsize, int WTA_K, cudaStream_t stream);
void mergeLocation_gpu(const short2* loc, float* x, float* y, int npoints, float scale, cudaStream_t stream);
}
}}}
namespace
{
const float HARRIS_K = 0.04f;
const int DESCRIPTOR_SIZE = 32;
const int bit_pattern_31_[256 * 4] =
{
8,-3, 9,5/*mean (0), correlation (0)*/,
4,2, 7,-12/*mean (1.12461e-05), correlation (0.0437584)*/,
-11,9, -8,2/*mean (3.37382e-05), correlation (0.0617409)*/,
7,-12, 12,-13/*mean (5.62303e-05), correlation (0.0636977)*/,
2,-13, 2,12/*mean (0.000134953), correlation (0.085099)*/,
1,-7, 1,6/*mean (0.000528565), correlation (0.0857175)*/,
-2,-10, -2,-4/*mean (0.0188821), correlation (0.0985774)*/,
-13,-13, -11,-8/*mean (0.0363135), correlation (0.0899616)*/,
-13,-3, -12,-9/*mean (0.121806), correlation (0.099849)*/,
10,4, 11,9/*mean (0.122065), correlation (0.093285)*/,
-13,-8, -8,-9/*mean (0.162787), correlation (0.0942748)*/,
-11,7, -9,12/*mean (0.21561), correlation (0.0974438)*/,
7,7, 12,6/*mean (0.160583), correlation (0.130064)*/,
-4,-5, -3,0/*mean (0.228171), correlation (0.132998)*/,
-13,2, -12,-3/*mean (0.00997526), correlation (0.145926)*/,
-9,0, -7,5/*mean (0.198234), correlation (0.143636)*/,
12,-6, 12,-1/*mean (0.0676226), correlation (0.16689)*/,
-3,6, -2,12/*mean (0.166847), correlation (0.171682)*/,
-6,-13, -4,-8/*mean (0.101215), correlation (0.179716)*/,
11,-13, 12,-8/*mean (0.200641), correlation (0.192279)*/,
4,7, 5,1/*mean (0.205106), correlation (0.186848)*/,
5,-3, 10,-3/*mean (0.234908), correlation (0.192319)*/,
3,-7, 6,12/*mean (0.0709964), correlation (0.210872)*/,
-8,-7, -6,-2/*mean (0.0939834), correlation (0.212589)*/,
-2,11, -1,-10/*mean (0.127778), correlation (0.20866)*/,
-13,12, -8,10/*mean (0.14783), correlation (0.206356)*/,
-7,3, -5,-3/*mean (0.182141), correlation (0.198942)*/,
-4,2, -3,7/*mean (0.188237), correlation (0.21384)*/,
-10,-12, -6,11/*mean (0.14865), correlation (0.23571)*/,
5,-12, 6,-7/*mean (0.222312), correlation (0.23324)*/,
5,-6, 7,-1/*mean (0.229082), correlation (0.23389)*/,
1,0, 4,-5/*mean (0.241577), correlation (0.215286)*/,
9,11, 11,-13/*mean (0.00338507), correlation (0.251373)*/,
4,7, 4,12/*mean (0.131005), correlation (0.257622)*/,
2,-1, 4,4/*mean (0.152755), correlation (0.255205)*/,
-4,-12, -2,7/*mean (0.182771), correlation (0.244867)*/,
-8,-5, -7,-10/*mean (0.186898), correlation (0.23901)*/,
4,11, 9,12/*mean (0.226226), correlation (0.258255)*/,
0,-8, 1,-13/*mean (0.0897886), correlation (0.274827)*/,
-13,-2, -8,2/*mean (0.148774), correlation (0.28065)*/,
-3,-2, -2,3/*mean (0.153048), correlation (0.283063)*/,
-6,9, -4,-9/*mean (0.169523), correlation (0.278248)*/,
8,12, 10,7/*mean (0.225337), correlation (0.282851)*/,
0,9, 1,3/*mean (0.226687), correlation (0.278734)*/,
7,-5, 11,-10/*mean (0.00693882), correlation (0.305161)*/,
-13,-6, -11,0/*mean (0.0227283), correlation (0.300181)*/,
10,7, 12,1/*mean (0.125517), correlation (0.31089)*/,
-6,-3, -6,12/*mean (0.131748), correlation (0.312779)*/,
10,-9, 12,-4/*mean (0.144827), correlation (0.292797)*/,
-13,8, -8,-12/*mean (0.149202), correlation (0.308918)*/,
-13,0, -8,-4/*mean (0.160909), correlation (0.310013)*/,
3,3, 7,8/*mean (0.177755), correlation (0.309394)*/,
5,7, 10,-7/*mean (0.212337), correlation (0.310315)*/,
-1,7, 1,-12/*mean (0.214429), correlation (0.311933)*/,
3,-10, 5,6/*mean (0.235807), correlation (0.313104)*/,
2,-4, 3,-10/*mean (0.00494827), correlation (0.344948)*/,
-13,0, -13,5/*mean (0.0549145), correlation (0.344675)*/,
-13,-7, -12,12/*mean (0.103385), correlation (0.342715)*/,
-13,3, -11,8/*mean (0.134222), correlation (0.322922)*/,
-7,12, -4,7/*mean (0.153284), correlation (0.337061)*/,
6,-10, 12,8/*mean (0.154881), correlation (0.329257)*/,
-9,-1, -7,-6/*mean (0.200967), correlation (0.33312)*/,
-2,-5, 0,12/*mean (0.201518), correlation (0.340635)*/,
-12,5, -7,5/*mean (0.207805), correlation (0.335631)*/,
3,-10, 8,-13/*mean (0.224438), correlation (0.34504)*/,
-7,-7, -4,5/*mean (0.239361), correlation (0.338053)*/,
-3,-2, -1,-7/*mean (0.240744), correlation (0.344322)*/,
2,9, 5,-11/*mean (0.242949), correlation (0.34145)*/,
-11,-13, -5,-13/*mean (0.244028), correlation (0.336861)*/,
-1,6, 0,-1/*mean (0.247571), correlation (0.343684)*/,
5,-3, 5,2/*mean (0.000697256), correlation (0.357265)*/,
-4,-13, -4,12/*mean (0.00213675), correlation (0.373827)*/,
-9,-6, -9,6/*mean (0.0126856), correlation (0.373938)*/,
-12,-10, -8,-4/*mean (0.0152497), correlation (0.364237)*/,
10,2, 12,-3/*mean (0.0299933), correlation (0.345292)*/,
7,12, 12,12/*mean (0.0307242), correlation (0.366299)*/,
-7,-13, -6,5/*mean (0.0534975), correlation (0.368357)*/,
-4,9, -3,4/*mean (0.099865), correlation (0.372276)*/,
7,-1, 12,2/*mean (0.117083), correlation (0.364529)*/,
-7,6, -5,1/*mean (0.126125), correlation (0.369606)*/,
-13,11, -12,5/*mean (0.130364), correlation (0.358502)*/,
-3,7, -2,-6/*mean (0.131691), correlation (0.375531)*/,
7,-8, 12,-7/*mean (0.160166), correlation (0.379508)*/,
-13,-7, -11,-12/*mean (0.167848), correlation (0.353343)*/,
1,-3, 12,12/*mean (0.183378), correlation (0.371916)*/,
2,-6, 3,0/*mean (0.228711), correlation (0.371761)*/,
-4,3, -2,-13/*mean (0.247211), correlation (0.364063)*/,
-1,-13, 1,9/*mean (0.249325), correlation (0.378139)*/,
7,1, 8,-6/*mean (0.000652272), correlation (0.411682)*/,
1,-1, 3,12/*mean (0.00248538), correlation (0.392988)*/,
9,1, 12,6/*mean (0.0206815), correlation (0.386106)*/,
-1,-9, -1,3/*mean (0.0364485), correlation (0.410752)*/,
-13,-13, -10,5/*mean (0.0376068), correlation (0.398374)*/,
7,7, 10,12/*mean (0.0424202), correlation (0.405663)*/,
12,-5, 12,9/*mean (0.0942645), correlation (0.410422)*/,
6,3, 7,11/*mean (0.1074), correlation (0.413224)*/,
5,-13, 6,10/*mean (0.109256), correlation (0.408646)*/,
2,-12, 2,3/*mean (0.131691), correlation (0.416076)*/,
3,8, 4,-6/*mean (0.165081), correlation (0.417569)*/,
2,6, 12,-13/*mean (0.171874), correlation (0.408471)*/,
9,-12, 10,3/*mean (0.175146), correlation (0.41296)*/,
-8,4, -7,9/*mean (0.183682), correlation (0.402956)*/,
-11,12, -4,-6/*mean (0.184672), correlation (0.416125)*/,
1,12, 2,-8/*mean (0.191487), correlation (0.386696)*/,
6,-9, 7,-4/*mean (0.192668), correlation (0.394771)*/,
2,3, 3,-2/*mean (0.200157), correlation (0.408303)*/,
6,3, 11,0/*mean (0.204588), correlation (0.411762)*/,
3,-3, 8,-8/*mean (0.205904), correlation (0.416294)*/,
7,8, 9,3/*mean (0.213237), correlation (0.409306)*/,
-11,-5, -6,-4/*mean (0.243444), correlation (0.395069)*/,
-10,11, -5,10/*mean (0.247672), correlation (0.413392)*/,
-5,-8, -3,12/*mean (0.24774), correlation (0.411416)*/,
-10,5, -9,0/*mean (0.00213675), correlation (0.454003)*/,
8,-1, 12,-6/*mean (0.0293635), correlation (0.455368)*/,
4,-6, 6,-11/*mean (0.0404971), correlation (0.457393)*/,
-10,12, -8,7/*mean (0.0481107), correlation (0.448364)*/,
4,-2, 6,7/*mean (0.050641), correlation (0.455019)*/,
-2,0, -2,12/*mean (0.0525978), correlation (0.44338)*/,
-5,-8, -5,2/*mean (0.0629667), correlation (0.457096)*/,
7,-6, 10,12/*mean (0.0653846), correlation (0.445623)*/,
-9,-13, -8,-8/*mean (0.0858749), correlation (0.449789)*/,
-5,-13, -5,-2/*mean (0.122402), correlation (0.450201)*/,
8,-8, 9,-13/*mean (0.125416), correlation (0.453224)*/,
-9,-11, -9,0/*mean (0.130128), correlation (0.458724)*/,
1,-8, 1,-2/*mean (0.132467), correlation (0.440133)*/,
7,-4, 9,1/*mean (0.132692), correlation (0.454)*/,
-2,1, -1,-4/*mean (0.135695), correlation (0.455739)*/,
11,-6, 12,-11/*mean (0.142904), correlation (0.446114)*/,
-12,-9, -6,4/*mean (0.146165), correlation (0.451473)*/,
3,7, 7,12/*mean (0.147627), correlation (0.456643)*/,
5,5, 10,8/*mean (0.152901), correlation (0.455036)*/,
0,-4, 2,8/*mean (0.167083), correlation (0.459315)*/,
-9,12, -5,-13/*mean (0.173234), correlation (0.454706)*/,
0,7, 2,12/*mean (0.18312), correlation (0.433855)*/,
-1,2, 1,7/*mean (0.185504), correlation (0.443838)*/,
5,11, 7,-9/*mean (0.185706), correlation (0.451123)*/,
3,5, 6,-8/*mean (0.188968), correlation (0.455808)*/,
-13,-4, -8,9/*mean (0.191667), correlation (0.459128)*/,
-5,9, -3,-3/*mean (0.193196), correlation (0.458364)*/,
-4,-7, -3,-12/*mean (0.196536), correlation (0.455782)*/,
6,5, 8,0/*mean (0.1972), correlation (0.450481)*/,
-7,6, -6,12/*mean (0.199438), correlation (0.458156)*/,
-13,6, -5,-2/*mean (0.211224), correlation (0.449548)*/,
1,-10, 3,10/*mean (0.211718), correlation (0.440606)*/,
4,1, 8,-4/*mean (0.213034), correlation (0.443177)*/,
-2,-2, 2,-13/*mean (0.234334), correlation (0.455304)*/,
2,-12, 12,12/*mean (0.235684), correlation (0.443436)*/,
-2,-13, 0,-6/*mean (0.237674), correlation (0.452525)*/,
4,1, 9,3/*mean (0.23962), correlation (0.444824)*/,
-6,-10, -3,-5/*mean (0.248459), correlation (0.439621)*/,
-3,-13, -1,1/*mean (0.249505), correlation (0.456666)*/,
7,5, 12,-11/*mean (0.00119208), correlation (0.495466)*/,
4,-2, 5,-7/*mean (0.00372245), correlation (0.484214)*/,
-13,9, -9,-5/*mean (0.00741116), correlation (0.499854)*/,
7,1, 8,6/*mean (0.0208952), correlation (0.499773)*/,
7,-8, 7,6/*mean (0.0220085), correlation (0.501609)*/,
-7,-4, -7,1/*mean (0.0233806), correlation (0.496568)*/,
-8,11, -7,-8/*mean (0.0236505), correlation (0.489719)*/,
-13,6, -12,-8/*mean (0.0268781), correlation (0.503487)*/,
2,4, 3,9/*mean (0.0323324), correlation (0.501938)*/,
10,-5, 12,3/*mean (0.0399235), correlation (0.494029)*/,
-6,-5, -6,7/*mean (0.0420153), correlation (0.486579)*/,
8,-3, 9,-8/*mean (0.0548021), correlation (0.484237)*/,
2,-12, 2,8/*mean (0.0616622), correlation (0.496642)*/,
-11,-2, -10,3/*mean (0.0627755), correlation (0.498563)*/,
-12,-13, -7,-9/*mean (0.0829622), correlation (0.495491)*/,
-11,0, -10,-5/*mean (0.0843342), correlation (0.487146)*/,
5,-3, 11,8/*mean (0.0929937), correlation (0.502315)*/,
-2,-13, -1,12/*mean (0.113327), correlation (0.48941)*/,
-1,-8, 0,9/*mean (0.132119), correlation (0.467268)*/,
-13,-11, -12,-5/*mean (0.136269), correlation (0.498771)*/,
-10,-2, -10,11/*mean (0.142173), correlation (0.498714)*/,
-3,9, -2,-13/*mean (0.144141), correlation (0.491973)*/,
2,-3, 3,2/*mean (0.14892), correlation (0.500782)*/,
-9,-13, -4,0/*mean (0.150371), correlation (0.498211)*/,
-4,6, -3,-10/*mean (0.152159), correlation (0.495547)*/,
-4,12, -2,-7/*mean (0.156152), correlation (0.496925)*/,
-6,-11, -4,9/*mean (0.15749), correlation (0.499222)*/,
6,-3, 6,11/*mean (0.159211), correlation (0.503821)*/,
-13,11, -5,5/*mean (0.162427), correlation (0.501907)*/,
11,11, 12,6/*mean (0.16652), correlation (0.497632)*/,
7,-5, 12,-2/*mean (0.169141), correlation (0.484474)*/,
-1,12, 0,7/*mean (0.169456), correlation (0.495339)*/,
-4,-8, -3,-2/*mean (0.171457), correlation (0.487251)*/,
-7,1, -6,7/*mean (0.175), correlation (0.500024)*/,
-13,-12, -8,-13/*mean (0.175866), correlation (0.497523)*/,
-7,-2, -6,-8/*mean (0.178273), correlation (0.501854)*/,
-8,5, -6,-9/*mean (0.181107), correlation (0.494888)*/,
-5,-1, -4,5/*mean (0.190227), correlation (0.482557)*/,
-13,7, -8,10/*mean (0.196739), correlation (0.496503)*/,
1,5, 5,-13/*mean (0.19973), correlation (0.499759)*/,
1,0, 10,-13/*mean (0.204465), correlation (0.49873)*/,
9,12, 10,-1/*mean (0.209334), correlation (0.49063)*/,
5,-8, 10,-9/*mean (0.211134), correlation (0.503011)*/,
-1,11, 1,-13/*mean (0.212), correlation (0.499414)*/,
-9,-3, -6,2/*mean (0.212168), correlation (0.480739)*/,
-1,-10, 1,12/*mean (0.212731), correlation (0.502523)*/,
-13,1, -8,-10/*mean (0.21327), correlation (0.489786)*/,
8,-11, 10,-6/*mean (0.214159), correlation (0.488246)*/,
2,-13, 3,-6/*mean (0.216993), correlation (0.50287)*/,
7,-13, 12,-9/*mean (0.223639), correlation (0.470502)*/,
-10,-10, -5,-7/*mean (0.224089), correlation (0.500852)*/,
-10,-8, -8,-13/*mean (0.228666), correlation (0.502629)*/,
4,-6, 8,5/*mean (0.22906), correlation (0.498305)*/,
3,12, 8,-13/*mean (0.233378), correlation (0.503825)*/,
-4,2, -3,-3/*mean (0.234323), correlation (0.476692)*/,
5,-13, 10,-12/*mean (0.236392), correlation (0.475462)*/,
4,-13, 5,-1/*mean (0.236842), correlation (0.504132)*/,
-9,9, -4,3/*mean (0.236977), correlation (0.497739)*/,
0,3, 3,-9/*mean (0.24314), correlation (0.499398)*/,
-12,1, -6,1/*mean (0.243297), correlation (0.489447)*/,
3,2, 4,-8/*mean (0.00155196), correlation (0.553496)*/,
-10,-10, -10,9/*mean (0.00239541), correlation (0.54297)*/,
8,-13, 12,12/*mean (0.0034413), correlation (0.544361)*/,
-8,-12, -6,-5/*mean (0.003565), correlation (0.551225)*/,
2,2, 3,7/*mean (0.00835583), correlation (0.55285)*/,
10,6, 11,-8/*mean (0.00885065), correlation (0.540913)*/,
6,8, 8,-12/*mean (0.0101552), correlation (0.551085)*/,
-7,10, -6,5/*mean (0.0102227), correlation (0.533635)*/,
-3,-9, -3,9/*mean (0.0110211), correlation (0.543121)*/,
-1,-13, -1,5/*mean (0.0113473), correlation (0.550173)*/,
-3,-7, -3,4/*mean (0.0140913), correlation (0.554774)*/,
-8,-2, -8,3/*mean (0.017049), correlation (0.55461)*/,
4,2, 12,12/*mean (0.01778), correlation (0.546921)*/,
2,-5, 3,11/*mean (0.0224022), correlation (0.549667)*/,
6,-9, 11,-13/*mean (0.029161), correlation (0.546295)*/,
3,-1, 7,12/*mean (0.0303081), correlation (0.548599)*/,
11,-1, 12,4/*mean (0.0355151), correlation (0.523943)*/,
-3,0, -3,6/*mean (0.0417904), correlation (0.543395)*/,
4,-11, 4,12/*mean (0.0487292), correlation (0.542818)*/,
2,-4, 2,1/*mean (0.0575124), correlation (0.554888)*/,
-10,-6, -8,1/*mean (0.0594242), correlation (0.544026)*/,
-13,7, -11,1/*mean (0.0597391), correlation (0.550524)*/,
-13,12, -11,-13/*mean (0.0608974), correlation (0.55383)*/,
6,0, 11,-13/*mean (0.065126), correlation (0.552006)*/,
0,-1, 1,4/*mean (0.074224), correlation (0.546372)*/,
-13,3, -9,-2/*mean (0.0808592), correlation (0.554875)*/,
-9,8, -6,-3/*mean (0.0883378), correlation (0.551178)*/,
-13,-6, -8,-2/*mean (0.0901035), correlation (0.548446)*/,
5,-9, 8,10/*mean (0.0949843), correlation (0.554694)*/,
2,7, 3,-9/*mean (0.0994152), correlation (0.550979)*/,
-1,-6, -1,-1/*mean (0.10045), correlation (0.552714)*/,
9,5, 11,-2/*mean (0.100686), correlation (0.552594)*/,
11,-3, 12,-8/*mean (0.101091), correlation (0.532394)*/,
3,0, 3,5/*mean (0.101147), correlation (0.525576)*/,
-1,4, 0,10/*mean (0.105263), correlation (0.531498)*/,
3,-6, 4,5/*mean (0.110785), correlation (0.540491)*/,
-13,0, -10,5/*mean (0.112798), correlation (0.536582)*/,
5,8, 12,11/*mean (0.114181), correlation (0.555793)*/,
8,9, 9,-6/*mean (0.117431), correlation (0.553763)*/,
7,-4, 8,-12/*mean (0.118522), correlation (0.553452)*/,
-10,4, -10,9/*mean (0.12094), correlation (0.554785)*/,
7,3, 12,4/*mean (0.122582), correlation (0.555825)*/,
9,-7, 10,-2/*mean (0.124978), correlation (0.549846)*/,
7,0, 12,-2/*mean (0.127002), correlation (0.537452)*/,
-1,-6, 0,-11/*mean (0.127148), correlation (0.547401)*/
};
void initializeOrbPattern(const Point* pattern0, Mat& pattern, int ntuples, int tupleSize, int poolSize)
{
RNG rng(0x12345678);
pattern.create(2, ntuples * tupleSize, CV_32SC1);
pattern.setTo(Scalar::all(0));
int* pattern_x_ptr = pattern.ptr<int>(0);
int* pattern_y_ptr = pattern.ptr<int>(1);
for (int i = 0; i < ntuples; i++)
{
for (int k = 0; k < tupleSize; k++)
{
for(;;)
{
int idx = rng.uniform(0, poolSize);
Point pt = pattern0[idx];
int k1;
for (k1 = 0; k1 < k; k1++)
if (pattern_x_ptr[tupleSize * i + k1] == pt.x && pattern_y_ptr[tupleSize * i + k1] == pt.y)
break;
if (k1 == k)
{
pattern_x_ptr[tupleSize * i + k] = pt.x;
pattern_y_ptr[tupleSize * i + k] = pt.y;
break;
}
}
}
}
}
void makeRandomPattern(int patchSize, Point* pattern, int npoints)
{
// we always start with a fixed seed,
// to make patterns the same on each run
RNG rng(0x34985739);
for (int i = 0; i < npoints; i++)
{
pattern[i].x = rng.uniform(-patchSize / 2, patchSize / 2 + 1);
pattern[i].y = rng.uniform(-patchSize / 2, patchSize / 2 + 1);
}
}
}
cv::gpu::ORB_GPU::ORB_GPU(int nFeatures, float scaleFactor, int nLevels, int edgeThreshold, int firstLevel, int WTA_K, int scoreType, int patchSize) :
nFeatures_(nFeatures), scaleFactor_(scaleFactor), nLevels_(nLevels), edgeThreshold_(edgeThreshold), firstLevel_(firstLevel), WTA_K_(WTA_K),
scoreType_(scoreType), patchSize_(patchSize),
fastDetector_(DEFAULT_FAST_THRESHOLD)
{
CV_Assert(patchSize_ >= 2);
// fill the extractors and descriptors for the corresponding scales
float factor = 1.0f / scaleFactor_;
float n_desired_features_per_scale = nFeatures_ * (1.0f - factor) / (1.0f - std::pow(factor, nLevels_));
n_features_per_level_.resize(nLevels_);
size_t sum_n_features = 0;
for (int level = 0; level < nLevels_ - 1; ++level)
{
n_features_per_level_[level] = cvRound(n_desired_features_per_scale);
sum_n_features += n_features_per_level_[level];
n_desired_features_per_scale *= factor;
}
n_features_per_level_[nLevels_ - 1] = nFeatures - sum_n_features;
// pre-compute the end of a row in a circular patch
int half_patch_size = patchSize_ / 2;
std::vector<int> u_max(half_patch_size + 2);
for (int v = 0; v <= half_patch_size * std::sqrt(2.f) / 2 + 1; ++v)
u_max[v] = cvRound(std::sqrt(static_cast<float>(half_patch_size * half_patch_size - v * v)));
// Make sure we are symmetric
for (int v = half_patch_size, v_0 = 0; v >= half_patch_size * std::sqrt(2.f) / 2; --v)
{
while (u_max[v_0] == u_max[v_0 + 1])
++v_0;
u_max[v] = v_0;
++v_0;
}
CV_Assert(u_max.size() < 32);
cv::gpu::cudev::orb::loadUMax(&u_max[0], static_cast<int>(u_max.size()));
// Calc pattern
const int npoints = 512;
Point pattern_buf[npoints];
const Point* pattern0 = (const Point*)bit_pattern_31_;
if (patchSize_ != 31)
{
pattern0 = pattern_buf;
makeRandomPattern(patchSize_, pattern_buf, npoints);
}
CV_Assert(WTA_K_ == 2 || WTA_K_ == 3 || WTA_K_ == 4);
Mat h_pattern;
if (WTA_K_ == 2)
{
h_pattern.create(2, npoints, CV_32SC1);
int* pattern_x_ptr = h_pattern.ptr<int>(0);
int* pattern_y_ptr = h_pattern.ptr<int>(1);
for (int i = 0; i < npoints; ++i)
{
pattern_x_ptr[i] = pattern0[i].x;
pattern_y_ptr[i] = pattern0[i].y;
}
}
else
{
int ntuples = descriptorSize() * 4;
initializeOrbPattern(pattern0, h_pattern, ntuples, WTA_K_, npoints);
}
pattern_.upload(h_pattern);
blurFilter = createGaussianFilter_GPU(CV_8UC1, Size(7, 7), 2, 2, BORDER_REFLECT_101);
blurForDescriptor = false;
}
namespace
{
inline float getScale(float scaleFactor, int firstLevel, int level)
{
return pow(scaleFactor, level - firstLevel);
}
}
void cv::gpu::ORB_GPU::buildScalePyramids(const GpuMat& image, const GpuMat& mask)
{
CV_Assert(image.type() == CV_8UC1);
CV_Assert(mask.empty() || (mask.type() == CV_8UC1 && mask.size() == image.size()));
imagePyr_.resize(nLevels_);
maskPyr_.resize(nLevels_);
for (int level = 0; level < nLevels_; ++level)
{
float scale = 1.0f / getScale(scaleFactor_, firstLevel_, level);
Size sz(cvRound(image.cols * scale), cvRound(image.rows * scale));
ensureSizeIsEnough(sz, image.type(), imagePyr_[level]);
ensureSizeIsEnough(sz, CV_8UC1, maskPyr_[level]);
maskPyr_[level].setTo(Scalar::all(255));
// Compute the resized image
if (level != firstLevel_)
{
if (level < firstLevel_)
{
gpu::resize(image, imagePyr_[level], sz, 0, 0, INTER_LINEAR);
if (!mask.empty())
gpu::resize(mask, maskPyr_[level], sz, 0, 0, INTER_LINEAR);
}
else
{
gpu::resize(imagePyr_[level - 1], imagePyr_[level], sz, 0, 0, INTER_LINEAR);
if (!mask.empty())
{
gpu::resize(maskPyr_[level - 1], maskPyr_[level], sz, 0, 0, INTER_LINEAR);
gpu::threshold(maskPyr_[level], maskPyr_[level], 254, 0, THRESH_TOZERO);
}
}
}
else
{
image.copyTo(imagePyr_[level]);
if (!mask.empty())
mask.copyTo(maskPyr_[level]);
}
// Filter keypoints by image border
ensureSizeIsEnough(sz, CV_8UC1, buf_);
buf_.setTo(Scalar::all(0));
Rect inner(edgeThreshold_, edgeThreshold_, sz.width - 2 * edgeThreshold_, sz.height - 2 * edgeThreshold_);
buf_(inner).setTo(Scalar::all(255));
gpu::bitwise_and(maskPyr_[level], buf_, maskPyr_[level]);
}
}
namespace
{
//takes keypoints and culls them by the response
void cull(GpuMat& keypoints, int& count, int n_points)
{
using namespace cv::gpu::cudev::orb;
//this is only necessary if the keypoints size is greater than the number of desired points.
if (count > n_points)
{
if (n_points == 0)
{
keypoints.release();
return;
}
count = cull_gpu(keypoints.ptr<int>(FAST_GPU::LOCATION_ROW), keypoints.ptr<float>(FAST_GPU::RESPONSE_ROW), count, n_points);
}
}
}
void cv::gpu::ORB_GPU::computeKeyPointsPyramid()
{
using namespace cv::gpu::cudev::orb;
int half_patch_size = patchSize_ / 2;
keyPointsPyr_.resize(nLevels_);
keyPointsCount_.resize(nLevels_);
for (int level = 0; level < nLevels_; ++level)
{
keyPointsCount_[level] = fastDetector_.calcKeyPointsLocation(imagePyr_[level], maskPyr_[level]);
if (keyPointsCount_[level] == 0)
continue;
ensureSizeIsEnough(3, keyPointsCount_[level], CV_32FC1, keyPointsPyr_[level]);
GpuMat fastKpRange = keyPointsPyr_[level].rowRange(0, 2);
keyPointsCount_[level] = fastDetector_.getKeyPoints(fastKpRange);
if (keyPointsCount_[level] == 0)
continue;
int n_features = static_cast<int>(n_features_per_level_[level]);
if (scoreType_ == ORB::HARRIS_SCORE)
{
// Keep more points than necessary as FAST does not give amazing corners
cull(keyPointsPyr_[level], keyPointsCount_[level], 2 * n_features);
// Compute the Harris cornerness (better scoring than FAST)
HarrisResponses_gpu(imagePyr_[level], keyPointsPyr_[level].ptr<short2>(0), keyPointsPyr_[level].ptr<float>(1), keyPointsCount_[level], 7, HARRIS_K, 0);
}
//cull to the final desired level, using the new Harris scores or the original FAST scores.
cull(keyPointsPyr_[level], keyPointsCount_[level], n_features);
// Compute orientation
IC_Angle_gpu(imagePyr_[level], keyPointsPyr_[level].ptr<short2>(0), keyPointsPyr_[level].ptr<float>(2), keyPointsCount_[level], half_patch_size, 0);
}
}
void cv::gpu::ORB_GPU::computeDescriptors(GpuMat& descriptors)
{
using namespace cv::gpu::cudev::orb;
int nAllkeypoints = 0;
for (int level = 0; level < nLevels_; ++level)
nAllkeypoints += keyPointsCount_[level];
if (nAllkeypoints == 0)
{
descriptors.release();
return;
}
ensureSizeIsEnough(nAllkeypoints, descriptorSize(), CV_8UC1, descriptors);
int offset = 0;
for (int level = 0; level < nLevels_; ++level)
{
if (keyPointsCount_[level] == 0)
continue;
GpuMat descRange = descriptors.rowRange(offset, offset + keyPointsCount_[level]);
if (blurForDescriptor)
{
// preprocess the resized image
ensureSizeIsEnough(imagePyr_[level].size(), imagePyr_[level].type(), buf_);
blurFilter->apply(imagePyr_[level], buf_, Rect(0, 0, imagePyr_[level].cols, imagePyr_[level].rows));
}
computeOrbDescriptor_gpu(blurForDescriptor ? buf_ : imagePyr_[level], keyPointsPyr_[level].ptr<short2>(0), keyPointsPyr_[level].ptr<float>(2),
keyPointsCount_[level], pattern_.ptr<int>(0), pattern_.ptr<int>(1), descRange, descriptorSize(), WTA_K_, 0);
offset += keyPointsCount_[level];
}
}
void cv::gpu::ORB_GPU::mergeKeyPoints(GpuMat& keypoints)
{
using namespace cv::gpu::cudev::orb;
int nAllkeypoints = 0;
for (int level = 0; level < nLevels_; ++level)
nAllkeypoints += keyPointsCount_[level];
if (nAllkeypoints == 0)
{
keypoints.release();
return;
}
ensureSizeIsEnough(ROWS_COUNT, nAllkeypoints, CV_32FC1, keypoints);
int offset = 0;
for (int level = 0; level < nLevels_; ++level)
{
if (keyPointsCount_[level] == 0)
continue;
float sf = getScale(scaleFactor_, firstLevel_, level);
GpuMat keyPointsRange = keypoints.colRange(offset, offset + keyPointsCount_[level]);
float locScale = level != firstLevel_ ? sf : 1.0f;
mergeLocation_gpu(keyPointsPyr_[level].ptr<short2>(0), keyPointsRange.ptr<float>(0), keyPointsRange.ptr<float>(1), keyPointsCount_[level], locScale, 0);
GpuMat range = keyPointsRange.rowRange(2, 4);
keyPointsPyr_[level](Range(1, 3), Range(0, keyPointsCount_[level])).copyTo(range);
keyPointsRange.row(4).setTo(Scalar::all(level));
keyPointsRange.row(5).setTo(Scalar::all(patchSize_ * sf));
offset += keyPointsCount_[level];
}
}
void cv::gpu::ORB_GPU::downloadKeyPoints(const GpuMat &d_keypoints, std::vector<KeyPoint>& keypoints)
{
if (d_keypoints.empty())
{
keypoints.clear();
return;
}
Mat h_keypoints(d_keypoints);
convertKeyPoints(h_keypoints, keypoints);
}
void cv::gpu::ORB_GPU::convertKeyPoints(const Mat &d_keypoints, std::vector<KeyPoint>& keypoints)
{
if (d_keypoints.empty())
{
keypoints.clear();
return;
}
CV_Assert(d_keypoints.type() == CV_32FC1 && d_keypoints.rows == ROWS_COUNT);
const float* x_ptr = d_keypoints.ptr<float>(X_ROW);
const float* y_ptr = d_keypoints.ptr<float>(Y_ROW);
const float* response_ptr = d_keypoints.ptr<float>(RESPONSE_ROW);
const float* angle_ptr = d_keypoints.ptr<float>(ANGLE_ROW);
const float* octave_ptr = d_keypoints.ptr<float>(OCTAVE_ROW);
const float* size_ptr = d_keypoints.ptr<float>(SIZE_ROW);
keypoints.resize(d_keypoints.cols);
for (int i = 0; i < d_keypoints.cols; ++i)
{
KeyPoint kp;
kp.pt.x = x_ptr[i];
kp.pt.y = y_ptr[i];
kp.response = response_ptr[i];
kp.angle = angle_ptr[i];
kp.octave = static_cast<int>(octave_ptr[i]);
kp.size = size_ptr[i];
keypoints[i] = kp;
}
}
void cv::gpu::ORB_GPU::operator()(const GpuMat& image, const GpuMat& mask, GpuMat& keypoints)
{
buildScalePyramids(image, mask);
computeKeyPointsPyramid();
mergeKeyPoints(keypoints);
}
void cv::gpu::ORB_GPU::operator()(const GpuMat& image, const GpuMat& mask, GpuMat& keypoints, GpuMat& descriptors)
{
buildScalePyramids(image, mask);
computeKeyPointsPyramid();
computeDescriptors(descriptors);
mergeKeyPoints(keypoints);
}
void cv::gpu::ORB_GPU::operator()(const GpuMat& image, const GpuMat& mask, std::vector<KeyPoint>& keypoints)
{
(*this)(image, mask, d_keypoints_);
downloadKeyPoints(d_keypoints_, keypoints);
}
void cv::gpu::ORB_GPU::operator()(const GpuMat& image, const GpuMat& mask, std::vector<KeyPoint>& keypoints, GpuMat& descriptors)
{
(*this)(image, mask, d_keypoints_, descriptors);
downloadKeyPoints(d_keypoints_, keypoints);
}
void cv::gpu::ORB_GPU::release()
{
imagePyr_.clear();
maskPyr_.clear();
buf_.release();
keyPointsPyr_.clear();
fastDetector_.release();
d_keypoints_.release();
}
#endif /* !defined (HAVE_CUDA) */

View File

@@ -0,0 +1,43 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "precomp.hpp"

View File

@@ -0,0 +1,57 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#ifndef __OPENCV_PRECOMP_H__
#define __OPENCV_PRECOMP_H__
#include <algorithm>
#include <functional>
#include <iterator>
#include "opencv2/gpufeatures2d.hpp"
#include "opencv2/gpuarithm.hpp"
#include "opencv2/gpuwarping.hpp"
#include "opencv2/features2d.hpp"
#include "opencv2/core/gpu_private.hpp"
#endif /* __OPENCV_PRECOMP_H__ */