optimizations:

- new reduce implementation (with kepler optimizations)
- saturate_cast via asm command
- video SIMD instructions in element operations
- float arithmetics instead of double
- new deviceSupports function
This commit is contained in:
Vladislav Vinogradov
2013-01-23 14:43:36 +04:00
parent ae6266e101
commit 281d036fcf
45 changed files with 9379 additions and 8846 deletions

View File

@@ -68,11 +68,16 @@ void cv::gpu::polarToCart(const GpuMat&, const GpuMat&, GpuMat&, GpuMat&, bool,
void cv::gpu::gemm(const GpuMat& src1, const GpuMat& src2, double alpha, const GpuMat& src3, double beta, GpuMat& dst, int flags, Stream& stream)
{
#ifndef HAVE_CUBLAS
(void)src1; (void)src2; (void)alpha; (void)src3; (void)beta; (void)dst; (void)flags; (void)stream;
(void)src1;
(void)src2;
(void)alpha;
(void)src3;
(void)beta;
(void)dst;
(void)flags;
(void)stream;
CV_Error(CV_StsNotImplemented, "The library was build without CUBLAS");
#else
// CUBLAS works with column-major matrices
CV_Assert(src1.type() == CV_32FC1 || src1.type() == CV_32FC2 || src1.type() == CV_64FC1 || src1.type() == CV_64FC2);
@@ -80,7 +85,7 @@ void cv::gpu::gemm(const GpuMat& src1, const GpuMat& src2, double alpha, const G
if (src1.depth() == CV_64F)
{
if (!TargetArchs::builtWith(NATIVE_DOUBLE) || !DeviceInfo().supports(NATIVE_DOUBLE))
if (!deviceSupports(NATIVE_DOUBLE))
CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double");
}
@@ -188,7 +193,6 @@ void cv::gpu::gemm(const GpuMat& src1, const GpuMat& src2, double alpha, const G
}
cublasSafeCall( cublasDestroy_v2(handle) );
#endif
}
@@ -227,7 +231,7 @@ void cv::gpu::transpose(const GpuMat& src, GpuMat& dst, Stream& s)
}
else // if (src.elemSize() == 8)
{
if (!TargetArchs::builtWith(NATIVE_DOUBLE) || !DeviceInfo().supports(NATIVE_DOUBLE))
if (!deviceSupports(NATIVE_DOUBLE))
CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double");
NppStStreamHandler h(stream);

View File

@@ -88,71 +88,71 @@ namespace cv { namespace gpu { namespace device
{
template <typename T> void matchL1_gpu(const PtrStepSzb& query, const PtrStepSzb& train, const PtrStepSzb& mask,
const PtrStepSzi& trainIdx, const PtrStepSzf& distance,
int cc, cudaStream_t stream);
cudaStream_t stream);
template <typename T> void matchL2_gpu(const PtrStepSzb& query, const PtrStepSzb& train, const PtrStepSzb& mask,
const PtrStepSzi& trainIdx, const PtrStepSzf& distance,
int cc, cudaStream_t stream);
cudaStream_t stream);
template <typename T> void matchHamming_gpu(const PtrStepSzb& query, const PtrStepSzb& train, const PtrStepSzb& mask,
const PtrStepSzi& trainIdx, const PtrStepSzf& distance,
int cc, cudaStream_t stream);
cudaStream_t stream);
template <typename T> void matchL1_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks,
const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance,
int cc, cudaStream_t stream);
cudaStream_t stream);
template <typename T> void matchL2_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks,
const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance,
int cc, cudaStream_t stream);
cudaStream_t stream);
template <typename T> void matchHamming_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks,
const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance,
int cc, cudaStream_t stream);
cudaStream_t stream);
}
namespace bf_knnmatch
{
template <typename T> void matchL1_gpu(const PtrStepSzb& query, const PtrStepSzb& train, int k, const PtrStepSzb& mask,
const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist,
int cc, cudaStream_t stream);
cudaStream_t stream);
template <typename T> void matchL2_gpu(const PtrStepSzb& query, const PtrStepSzb& train, int k, const PtrStepSzb& mask,
const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist,
int cc, cudaStream_t stream);
cudaStream_t stream);
template <typename T> void matchHamming_gpu(const PtrStepSzb& query, const PtrStepSzb& train, int k, const PtrStepSzb& mask,
const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist,
int cc, cudaStream_t stream);
cudaStream_t stream);
template <typename T> void match2L1_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks,
const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance,
int cc, cudaStream_t stream);
cudaStream_t stream);
template <typename T> void match2L2_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks,
const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance,
int cc, cudaStream_t stream);
cudaStream_t stream);
template <typename T> void match2Hamming_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks,
const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance,
int cc, cudaStream_t stream);
cudaStream_t stream);
}
namespace bf_radius_match
{
template <typename T> void matchL1_gpu(const PtrStepSzb& query, const PtrStepSzb& train, float maxDistance, const PtrStepSzb& mask,
const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches,
int cc, cudaStream_t stream);
cudaStream_t stream);
template <typename T> void matchL2_gpu(const PtrStepSzb& query, const PtrStepSzb& train, float maxDistance, const PtrStepSzb& mask,
const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches,
int cc, cudaStream_t stream);
cudaStream_t stream);
template <typename T> void matchHamming_gpu(const PtrStepSzb& query, const PtrStepSzb& train, float maxDistance, const PtrStepSzb& mask,
const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches,
int cc, cudaStream_t stream);
cudaStream_t stream);
template <typename T> void matchL1_gpu(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks,
const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches,
int cc, cudaStream_t stream);
cudaStream_t stream);
template <typename T> void matchL2_gpu(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks,
const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches,
int cc, cudaStream_t stream);
cudaStream_t stream);
template <typename T> void matchHamming_gpu(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks,
const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches,
int cc, cudaStream_t stream);
cudaStream_t stream);
}
}}}
@@ -198,11 +198,11 @@ void cv::gpu::BruteForceMatcher_GPU_base::matchSingle(const GpuMat& query, const
if (query.empty() || train.empty())
return;
using namespace ::cv::gpu::device::bf_match;
using namespace cv::gpu::device::bf_match;
typedef void (*caller_t)(const PtrStepSzb& query, const PtrStepSzb& train, const PtrStepSzb& mask,
const PtrStepSzi& trainIdx, const PtrStepSzf& distance,
int cc, cudaStream_t stream);
cudaStream_t stream);
static const caller_t callers[3][6] =
{
@@ -234,10 +234,7 @@ void cv::gpu::BruteForceMatcher_GPU_base::matchSingle(const GpuMat& query, const
caller_t func = callers[distType][query.depth()];
CV_Assert(func != 0);
DeviceInfo info;
int cc = info.majorVersion() * 10 + info.minorVersion();
func(query, train, mask, trainIdx, distance, cc, StreamAccessor::getStream(stream));
func(query, train, mask, trainIdx, distance, StreamAccessor::getStream(stream));
}
void cv::gpu::BruteForceMatcher_GPU_base::matchDownload(const GpuMat& trainIdx, const GpuMat& distance, vector<DMatch>& matches)
@@ -340,11 +337,11 @@ void cv::gpu::BruteForceMatcher_GPU_base::matchCollection(const GpuMat& query, c
if (query.empty() || trainCollection.empty())
return;
using namespace ::cv::gpu::device::bf_match;
using namespace cv::gpu::device::bf_match;
typedef void (*caller_t)(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks,
const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance,
int cc, cudaStream_t stream);
cudaStream_t stream);
static const caller_t callers[3][6] =
{
@@ -376,10 +373,7 @@ void cv::gpu::BruteForceMatcher_GPU_base::matchCollection(const GpuMat& query, c
caller_t func = callers[distType][query.depth()];
CV_Assert(func != 0);
DeviceInfo info;
int cc = info.majorVersion() * 10 + info.minorVersion();
func(query, trainCollection, masks, trainIdx, imgIdx, distance, cc, StreamAccessor::getStream(stream));
func(query, trainCollection, masks, trainIdx, imgIdx, distance, StreamAccessor::getStream(stream));
}
void cv::gpu::BruteForceMatcher_GPU_base::matchDownload(const GpuMat& trainIdx, const GpuMat& imgIdx, const GpuMat& distance, vector<DMatch>& matches)
@@ -451,11 +445,11 @@ void cv::gpu::BruteForceMatcher_GPU_base::knnMatchSingle(const GpuMat& query, co
if (query.empty() || train.empty())
return;
using namespace ::cv::gpu::device::bf_knnmatch;
using namespace cv::gpu::device::bf_knnmatch;
typedef void (*caller_t)(const PtrStepSzb& query, const PtrStepSzb& train, int k, const PtrStepSzb& mask,
const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist,
int cc, cudaStream_t stream);
cudaStream_t stream);
static const caller_t callers[3][6] =
{
@@ -502,10 +496,7 @@ void cv::gpu::BruteForceMatcher_GPU_base::knnMatchSingle(const GpuMat& query, co
caller_t func = callers[distType][query.depth()];
CV_Assert(func != 0);
DeviceInfo info;
int cc = info.majorVersion() * 10 + info.minorVersion();
func(query, train, k, mask, trainIdx, distance, allDist, cc, StreamAccessor::getStream(stream));
func(query, train, k, mask, trainIdx, distance, allDist, StreamAccessor::getStream(stream));
}
void cv::gpu::BruteForceMatcher_GPU_base::knnMatchDownload(const GpuMat& trainIdx, const GpuMat& distance,
@@ -580,11 +571,11 @@ void cv::gpu::BruteForceMatcher_GPU_base::knnMatch2Collection(const GpuMat& quer
if (query.empty() || trainCollection.empty())
return;
using namespace ::cv::gpu::device::bf_knnmatch;
using namespace cv::gpu::device::bf_knnmatch;
typedef void (*caller_t)(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks,
const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance,
int cc, cudaStream_t stream);
cudaStream_t stream);
static const caller_t callers[3][6] =
{
@@ -621,10 +612,7 @@ void cv::gpu::BruteForceMatcher_GPU_base::knnMatch2Collection(const GpuMat& quer
caller_t func = callers[distType][query.depth()];
CV_Assert(func != 0);
DeviceInfo info;
int cc = info.majorVersion() * 10 + info.minorVersion();
func(query, trainCollection, maskCollection, trainIdx, imgIdx, distance, cc, StreamAccessor::getStream(stream));
func(query, trainCollection, maskCollection, trainIdx, imgIdx, distance, StreamAccessor::getStream(stream));
}
void cv::gpu::BruteForceMatcher_GPU_base::knnMatch2Download(const GpuMat& trainIdx, const GpuMat& imgIdx, const GpuMat& distance,
@@ -765,7 +753,7 @@ void cv::gpu::BruteForceMatcher_GPU_base::radiusMatchSingle(const GpuMat& query,
typedef void (*caller_t)(const PtrStepSzb& query, const PtrStepSzb& train, float maxDistance, const PtrStepSzb& mask,
const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches,
int cc, cudaStream_t stream);
cudaStream_t stream);
static const caller_t callers[3][6] =
{
@@ -786,12 +774,6 @@ void cv::gpu::BruteForceMatcher_GPU_base::radiusMatchSingle(const GpuMat& query,
}
};
DeviceInfo info;
int cc = info.majorVersion() * 10 + info.minorVersion();
if (!TargetArchs::builtWith(GLOBAL_ATOMICS) || !DeviceInfo().supports(GLOBAL_ATOMICS))
CV_Error(CV_StsNotImplemented, "The device doesn't support global atomics");
const int nQuery = query.rows;
const int nTrain = train.rows;
@@ -814,7 +796,7 @@ void cv::gpu::BruteForceMatcher_GPU_base::radiusMatchSingle(const GpuMat& query,
caller_t func = callers[distType][query.depth()];
CV_Assert(func != 0);
func(query, train, maxDistance, mask, trainIdx, distance, nMatches, cc, StreamAccessor::getStream(stream));
func(query, train, maxDistance, mask, trainIdx, distance, nMatches, StreamAccessor::getStream(stream));
}
void cv::gpu::BruteForceMatcher_GPU_base::radiusMatchDownload(const GpuMat& trainIdx, const GpuMat& distance, const GpuMat& nMatches,
@@ -897,7 +879,7 @@ void cv::gpu::BruteForceMatcher_GPU_base::radiusMatchCollection(const GpuMat& qu
typedef void (*caller_t)(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks,
const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches,
int cc, cudaStream_t stream);
cudaStream_t stream);
static const caller_t callers[3][6] =
{
@@ -918,12 +900,6 @@ void cv::gpu::BruteForceMatcher_GPU_base::radiusMatchCollection(const GpuMat& qu
}
};
DeviceInfo info;
int cc = info.majorVersion() * 10 + info.minorVersion();
if (!TargetArchs::builtWith(GLOBAL_ATOMICS) || !DeviceInfo().supports(GLOBAL_ATOMICS))
CV_Error(CV_StsNotImplemented, "The device doesn't support global atomics");
const int nQuery = query.rows;
CV_Assert(query.channels() == 1 && query.depth() < CV_64F);
@@ -949,7 +925,7 @@ void cv::gpu::BruteForceMatcher_GPU_base::radiusMatchCollection(const GpuMat& qu
vector<PtrStepSzb> masks_(masks.begin(), masks.end());
func(query, &trains_[0], static_cast<int>(trains_.size()), maxDistance, masks_.size() == 0 ? 0 : &masks_[0],
trainIdx, imgIdx, distance, nMatches, cc, StreamAccessor::getStream(stream));
trainIdx, imgIdx, distance, nMatches, StreamAccessor::getStream(stream));
}
void cv::gpu::BruteForceMatcher_GPU_base::radiusMatchDownload(const GpuMat& trainIdx, const GpuMat& imgIdx, const GpuMat& distance, const GpuMat& nMatches,

View File

@@ -42,10 +42,13 @@
#if !defined CUDA_DISABLER
#include "internal_shared.hpp"
#include "opencv2/gpu/device/common.hpp"
#include "opencv2/gpu/device/utility.hpp"
#include "opencv2/gpu/device/reduce.hpp"
#include "opencv2/gpu/device/limits.hpp"
#include "opencv2/gpu/device/vec_distance.hpp"
#include "opencv2/gpu/device/datamov_utils.hpp"
#include "opencv2/gpu/device/warp_shuffle.hpp"
namespace cv { namespace gpu { namespace device
{
@@ -59,6 +62,45 @@ namespace cv { namespace gpu { namespace device
int& bestTrainIdx1, int& bestTrainIdx2,
float* s_distance, int* s_trainIdx)
{
#if __CUDA_ARCH__ >= 300
(void) s_distance;
(void) s_trainIdx;
float d1, d2;
int i1, i2;
#pragma unroll
for (int i = BLOCK_SIZE / 2; i >= 1; i /= 2)
{
d1 = shfl_down(bestDistance1, i, BLOCK_SIZE);
d2 = shfl_down(bestDistance2, i, BLOCK_SIZE);
i1 = shfl_down(bestTrainIdx1, i, BLOCK_SIZE);
i2 = shfl_down(bestTrainIdx2, i, BLOCK_SIZE);
if (bestDistance1 < d1)
{
if (d1 < bestDistance2)
{
bestDistance2 = d1;
bestTrainIdx2 = i1;
}
}
else
{
bestDistance2 = bestDistance1;
bestTrainIdx2 = bestTrainIdx1;
bestDistance1 = d1;
bestTrainIdx1 = i1;
if (d2 < bestDistance2)
{
bestDistance2 = d2;
bestTrainIdx2 = i2;
}
}
}
#else
float myBestDistance1 = numeric_limits<float>::max();
float myBestDistance2 = numeric_limits<float>::max();
int myBestTrainIdx1 = -1;
@@ -122,6 +164,7 @@ namespace cv { namespace gpu { namespace device
bestTrainIdx1 = myBestTrainIdx1;
bestTrainIdx2 = myBestTrainIdx2;
#endif
}
template <int BLOCK_SIZE>
@@ -130,6 +173,53 @@ namespace cv { namespace gpu { namespace device
int& bestImgIdx1, int& bestImgIdx2,
float* s_distance, int* s_trainIdx, int* s_imgIdx)
{
#if __CUDA_ARCH__ >= 300
(void) s_distance;
(void) s_trainIdx;
(void) s_imgIdx;
float d1, d2;
int i1, i2;
int j1, j2;
#pragma unroll
for (int i = BLOCK_SIZE / 2; i >= 1; i /= 2)
{
d1 = shfl_down(bestDistance1, i, BLOCK_SIZE);
d2 = shfl_down(bestDistance2, i, BLOCK_SIZE);
i1 = shfl_down(bestTrainIdx1, i, BLOCK_SIZE);
i2 = shfl_down(bestTrainIdx2, i, BLOCK_SIZE);
j1 = shfl_down(bestImgIdx1, i, BLOCK_SIZE);
j2 = shfl_down(bestImgIdx2, i, BLOCK_SIZE);
if (bestDistance1 < d1)
{
if (d1 < bestDistance2)
{
bestDistance2 = d1;
bestTrainIdx2 = i1;
bestImgIdx2 = j1;
}
}
else
{
bestDistance2 = bestDistance1;
bestTrainIdx2 = bestTrainIdx1;
bestImgIdx2 = bestImgIdx1;
bestDistance1 = d1;
bestTrainIdx1 = i1;
bestImgIdx1 = j1;
if (d2 < bestDistance2)
{
bestDistance2 = d2;
bestTrainIdx2 = i2;
bestImgIdx2 = j2;
}
}
}
#else
float myBestDistance1 = numeric_limits<float>::max();
float myBestDistance2 = numeric_limits<float>::max();
int myBestTrainIdx1 = -1;
@@ -205,6 +295,7 @@ namespace cv { namespace gpu { namespace device
bestImgIdx1 = myBestImgIdx1;
bestImgIdx2 = myBestImgIdx2;
#endif
}
///////////////////////////////////////////////////////////////////////////////
@@ -748,9 +839,8 @@ namespace cv { namespace gpu { namespace device
template <typename Dist, typename T, typename Mask>
void match2Dispatcher(const PtrStepSz<T>& query, const PtrStepSz<T>& train, const Mask& mask,
const PtrStepSzb& trainIdx, const PtrStepSzb& distance,
int cc, cudaStream_t stream)
cudaStream_t stream)
{
(void)cc;
if (query.cols <= 64)
{
matchUnrolledCached<16, 64, Dist>(query, train, mask, static_cast< PtrStepSz<int2> >(trainIdx), static_cast< PtrStepSz<float2> > (distance), stream);
@@ -780,9 +870,8 @@ namespace cv { namespace gpu { namespace device
template <typename Dist, typename T, typename Mask>
void match2Dispatcher(const PtrStepSz<T>& query, const PtrStepSz<T>* trains, int n, const Mask& mask,
const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance,
int cc, cudaStream_t stream)
cudaStream_t stream)
{
(void)cc;
if (query.cols <= 64)
{
matchUnrolledCached<16, 64, Dist>(query, trains, n, mask, static_cast< PtrStepSz<int2> >(trainIdx), static_cast< PtrStepSz<int2> >(imgIdx), static_cast< PtrStepSz<float2> > (distance), stream);
@@ -945,9 +1034,8 @@ namespace cv { namespace gpu { namespace device
template <typename Dist, typename T, typename Mask>
void calcDistanceDispatcher(const PtrStepSz<T>& query, const PtrStepSz<T>& train, const Mask& mask,
const PtrStepSzf& allDist,
int cc, cudaStream_t stream)
cudaStream_t stream)
{
(void)cc;
if (query.cols <= 64)
{
calcDistanceUnrolled<16, 64, Dist>(query, train, mask, allDist, stream);
@@ -1005,7 +1093,7 @@ namespace cv { namespace gpu { namespace device
s_trainIdx[threadIdx.x] = bestIdx;
__syncthreads();
reducePredVal<BLOCK_SIZE>(s_dist, dist, s_trainIdx, bestIdx, threadIdx.x, less<volatile float>());
reduceKeyVal<BLOCK_SIZE>(s_dist, dist, s_trainIdx, bestIdx, threadIdx.x, less<float>());
if (threadIdx.x == 0)
{
@@ -1034,7 +1122,7 @@ namespace cv { namespace gpu { namespace device
cudaSafeCall( cudaDeviceSynchronize() );
}
void findKnnMatchDispatcher(int k, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, int cc, cudaStream_t stream)
void findKnnMatchDispatcher(int k, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream)
{
findKnnMatch<256>(k, static_cast<PtrStepSzi>(trainIdx), static_cast<PtrStepSzf>(distance), allDist, stream);
}
@@ -1045,16 +1133,16 @@ namespace cv { namespace gpu { namespace device
template <typename Dist, typename T, typename Mask>
void matchDispatcher(const PtrStepSz<T>& query, const PtrStepSz<T>& train, int k, const Mask& mask,
const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist,
int cc, cudaStream_t stream)
cudaStream_t stream)
{
if (k == 2)
{
match2Dispatcher<Dist>(query, train, mask, trainIdx, distance, cc, stream);
match2Dispatcher<Dist>(query, train, mask, trainIdx, distance, stream);
}
else
{
calcDistanceDispatcher<Dist>(query, train, mask, allDist, cc, stream);
findKnnMatchDispatcher(k, trainIdx, distance, allDist, cc, stream);
calcDistanceDispatcher<Dist>(query, train, mask, allDist, stream);
findKnnMatchDispatcher(k, trainIdx, distance, allDist, stream);
}
}
@@ -1063,105 +1151,105 @@ namespace cv { namespace gpu { namespace device
template <typename T> void matchL1_gpu(const PtrStepSzb& query, const PtrStepSzb& train, int k, const PtrStepSzb& mask,
const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist,
int cc, cudaStream_t stream)
cudaStream_t stream)
{
if (mask.data)
matchDispatcher< L1Dist<T> >(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), k, SingleMask(mask), trainIdx, distance, allDist, cc, stream);
matchDispatcher< L1Dist<T> >(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), k, SingleMask(mask), trainIdx, distance, allDist, stream);
else
matchDispatcher< L1Dist<T> >(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), k, WithOutMask(), trainIdx, distance, allDist, cc, stream);
matchDispatcher< L1Dist<T> >(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), k, WithOutMask(), trainIdx, distance, allDist, stream);
}
template void matchL1_gpu<uchar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, int cc, cudaStream_t stream);
//template void matchL1_gpu<schar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, int cc, cudaStream_t stream);
template void matchL1_gpu<ushort>(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, int cc, cudaStream_t stream);
template void matchL1_gpu<short >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, int cc, cudaStream_t stream);
template void matchL1_gpu<int >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, int cc, cudaStream_t stream);
template void matchL1_gpu<float >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, int cc, cudaStream_t stream);
template void matchL1_gpu<uchar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream);
//template void matchL1_gpu<schar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream);
template void matchL1_gpu<ushort>(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream);
template void matchL1_gpu<short >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream);
template void matchL1_gpu<int >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream);
template void matchL1_gpu<float >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream);
template <typename T> void matchL2_gpu(const PtrStepSzb& query, const PtrStepSzb& train, int k, const PtrStepSzb& mask,
const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist,
int cc, cudaStream_t stream)
cudaStream_t stream)
{
if (mask.data)
matchDispatcher<L2Dist>(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), k, SingleMask(mask), trainIdx, distance, allDist, cc, stream);
matchDispatcher<L2Dist>(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), k, SingleMask(mask), trainIdx, distance, allDist, stream);
else
matchDispatcher<L2Dist>(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), k, WithOutMask(), trainIdx, distance, allDist, cc, stream);
matchDispatcher<L2Dist>(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), k, WithOutMask(), trainIdx, distance, allDist, stream);
}
//template void matchL2_gpu<uchar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, int cc, cudaStream_t stream);
//template void matchL2_gpu<schar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, int cc, cudaStream_t stream);
//template void matchL2_gpu<ushort>(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, int cc, cudaStream_t stream);
//template void matchL2_gpu<short >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, int cc, cudaStream_t stream);
//template void matchL2_gpu<int >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, int cc, cudaStream_t stream);
template void matchL2_gpu<float >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, int cc, cudaStream_t stream);
//template void matchL2_gpu<uchar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream);
//template void matchL2_gpu<schar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream);
//template void matchL2_gpu<ushort>(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream);
//template void matchL2_gpu<short >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream);
//template void matchL2_gpu<int >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream);
template void matchL2_gpu<float >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream);
template <typename T> void matchHamming_gpu(const PtrStepSzb& query, const PtrStepSzb& train, int k, const PtrStepSzb& mask,
const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist,
int cc, cudaStream_t stream)
cudaStream_t stream)
{
if (mask.data)
matchDispatcher<HammingDist>(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), k, SingleMask(mask), trainIdx, distance, allDist, cc, stream);
matchDispatcher<HammingDist>(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), k, SingleMask(mask), trainIdx, distance, allDist, stream);
else
matchDispatcher<HammingDist>(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), k, WithOutMask(), trainIdx, distance, allDist, cc, stream);
matchDispatcher<HammingDist>(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), k, WithOutMask(), trainIdx, distance, allDist, stream);
}
template void matchHamming_gpu<uchar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, int cc, cudaStream_t stream);
//template void matchHamming_gpu<schar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, int cc, cudaStream_t stream);
template void matchHamming_gpu<ushort>(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, int cc, cudaStream_t stream);
//template void matchHamming_gpu<short >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, int cc, cudaStream_t stream);
template void matchHamming_gpu<int >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, int cc, cudaStream_t stream);
template void matchHamming_gpu<uchar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream);
//template void matchHamming_gpu<schar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream);
template void matchHamming_gpu<ushort>(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream);
//template void matchHamming_gpu<short >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream);
template void matchHamming_gpu<int >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream);
template <typename T> void match2L1_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks,
const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance,
int cc, cudaStream_t stream)
cudaStream_t stream)
{
if (masks.data)
match2Dispatcher< L1Dist<T> >(static_cast< PtrStepSz<T> >(query), (const PtrStepSz<T>*)trains.ptr(), trains.cols, MaskCollection(masks.data), trainIdx, imgIdx, distance, cc, stream);
match2Dispatcher< L1Dist<T> >(static_cast< PtrStepSz<T> >(query), (const PtrStepSz<T>*)trains.ptr(), trains.cols, MaskCollection(masks.data), trainIdx, imgIdx, distance, stream);
else
match2Dispatcher< L1Dist<T> >(static_cast< PtrStepSz<T> >(query), (const PtrStepSz<T>*)trains.ptr(), trains.cols, WithOutMask(), trainIdx, imgIdx, distance, cc, stream);
match2Dispatcher< L1Dist<T> >(static_cast< PtrStepSz<T> >(query), (const PtrStepSz<T>*)trains.ptr(), trains.cols, WithOutMask(), trainIdx, imgIdx, distance, stream);
}
template void match2L1_gpu<uchar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, int cc, cudaStream_t stream);
//template void match2L1_gpu<schar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, int cc, cudaStream_t stream);
template void match2L1_gpu<ushort>(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, int cc, cudaStream_t stream);
template void match2L1_gpu<short >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, int cc, cudaStream_t stream);
template void match2L1_gpu<int >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, int cc, cudaStream_t stream);
template void match2L1_gpu<float >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, int cc, cudaStream_t stream);
template void match2L1_gpu<uchar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream);
//template void match2L1_gpu<schar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream);
template void match2L1_gpu<ushort>(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream);
template void match2L1_gpu<short >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream);
template void match2L1_gpu<int >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream);
template void match2L1_gpu<float >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream);
template <typename T> void match2L2_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks,
const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance,
int cc, cudaStream_t stream)
cudaStream_t stream)
{
if (masks.data)
match2Dispatcher<L2Dist>(static_cast< PtrStepSz<T> >(query), (const PtrStepSz<T>*)trains.ptr(), trains.cols, MaskCollection(masks.data), trainIdx, imgIdx, distance, cc, stream);
match2Dispatcher<L2Dist>(static_cast< PtrStepSz<T> >(query), (const PtrStepSz<T>*)trains.ptr(), trains.cols, MaskCollection(masks.data), trainIdx, imgIdx, distance, stream);
else
match2Dispatcher<L2Dist>(static_cast< PtrStepSz<T> >(query), (const PtrStepSz<T>*)trains.ptr(), trains.cols, WithOutMask(), trainIdx, imgIdx, distance, cc, stream);
match2Dispatcher<L2Dist>(static_cast< PtrStepSz<T> >(query), (const PtrStepSz<T>*)trains.ptr(), trains.cols, WithOutMask(), trainIdx, imgIdx, distance, stream);
}
//template void match2L2_gpu<uchar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, int cc, cudaStream_t stream);
//template void match2L2_gpu<schar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, int cc, cudaStream_t stream);
//template void match2L2_gpu<ushort>(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, int cc, cudaStream_t stream);
//template void match2L2_gpu<short >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, int cc, cudaStream_t stream);
//template void match2L2_gpu<int >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzb& distance, int cc, cudaStream_t stream);
template void match2L2_gpu<float >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, int cc, cudaStream_t stream);
//template void match2L2_gpu<uchar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream);
//template void match2L2_gpu<schar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream);
//template void match2L2_gpu<ushort>(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream);
//template void match2L2_gpu<short >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream);
//template void match2L2_gpu<int >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzb& distance, cudaStream_t stream);
template void match2L2_gpu<float >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream);
template <typename T> void match2Hamming_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks,
const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance,
int cc, cudaStream_t stream)
cudaStream_t stream)
{
if (masks.data)
match2Dispatcher<HammingDist>(static_cast< PtrStepSz<T> >(query), (const PtrStepSz<T>*)trains.ptr(), trains.cols, MaskCollection(masks.data), trainIdx, imgIdx, distance, cc, stream);
match2Dispatcher<HammingDist>(static_cast< PtrStepSz<T> >(query), (const PtrStepSz<T>*)trains.ptr(), trains.cols, MaskCollection(masks.data), trainIdx, imgIdx, distance, stream);
else
match2Dispatcher<HammingDist>(static_cast< PtrStepSz<T> >(query), (const PtrStepSz<T>*)trains.ptr(), trains.cols, WithOutMask(), trainIdx, imgIdx, distance, cc, stream);
match2Dispatcher<HammingDist>(static_cast< PtrStepSz<T> >(query), (const PtrStepSz<T>*)trains.ptr(), trains.cols, WithOutMask(), trainIdx, imgIdx, distance, stream);
}
template void match2Hamming_gpu<uchar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, int cc, cudaStream_t stream);
//template void match2Hamming_gpu<schar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, int cc, cudaStream_t stream);
template void match2Hamming_gpu<ushort>(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, int cc, cudaStream_t stream);
//template void match2Hamming_gpu<short >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, int cc, cudaStream_t stream);
template void match2Hamming_gpu<int >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, int cc, cudaStream_t stream);
template void match2Hamming_gpu<uchar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream);
//template void match2Hamming_gpu<schar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream);
template void match2Hamming_gpu<ushort>(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream);
//template void match2Hamming_gpu<short >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream);
template void match2Hamming_gpu<int >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream);
} // namespace bf_knnmatch
}}} // namespace cv { namespace gpu { namespace device {
#endif /* CUDA_DISABLER */
#endif /* CUDA_DISABLER */

View File

@@ -42,7 +42,9 @@
#if !defined CUDA_DISABLER
#include "internal_shared.hpp"
#include "opencv2/gpu/device/common.hpp"
#include "opencv2/gpu/device/utility.hpp"
#include "opencv2/gpu/device/reduce.hpp"
#include "opencv2/gpu/device/limits.hpp"
#include "opencv2/gpu/device/vec_distance.hpp"
#include "opencv2/gpu/device/datamov_utils.hpp"
@@ -60,12 +62,7 @@ namespace cv { namespace gpu { namespace device
s_distance += threadIdx.y * BLOCK_SIZE;
s_trainIdx += threadIdx.y * BLOCK_SIZE;
s_distance[threadIdx.x] = bestDistance;
s_trainIdx[threadIdx.x] = bestTrainIdx;
__syncthreads();
reducePredVal<BLOCK_SIZE>(s_distance, bestDistance, s_trainIdx, bestTrainIdx, threadIdx.x, less<volatile float>());
reduceKeyVal<BLOCK_SIZE>(s_distance, bestDistance, s_trainIdx, bestTrainIdx, threadIdx.x, less<float>());
}
template <int BLOCK_SIZE>
@@ -75,13 +72,7 @@ namespace cv { namespace gpu { namespace device
s_trainIdx += threadIdx.y * BLOCK_SIZE;
s_imgIdx += threadIdx.y * BLOCK_SIZE;
s_distance[threadIdx.x] = bestDistance;
s_trainIdx[threadIdx.x] = bestTrainIdx;
s_imgIdx [threadIdx.x] = bestImgIdx;
__syncthreads();
reducePredVal2<BLOCK_SIZE>(s_distance, bestDistance, s_trainIdx, bestTrainIdx, s_imgIdx, bestImgIdx, threadIdx.x, less<volatile float>());
reduceKeyVal<BLOCK_SIZE>(s_distance, bestDistance, smem_tuple(s_trainIdx, s_imgIdx), thrust::tie(bestTrainIdx, bestImgIdx), threadIdx.x, less<float>());
}
///////////////////////////////////////////////////////////////////////////////
@@ -567,9 +558,8 @@ namespace cv { namespace gpu { namespace device
template <typename Dist, typename T, typename Mask>
void matchDispatcher(const PtrStepSz<T>& query, const PtrStepSz<T>& train, const Mask& mask,
const PtrStepSzi& trainIdx, const PtrStepSzf& distance,
int cc, cudaStream_t stream)
cudaStream_t stream)
{
(void)cc;
if (query.cols <= 64)
{
matchUnrolledCached<16, 64, Dist>(query, train, mask, trainIdx, distance, stream);
@@ -599,9 +589,8 @@ namespace cv { namespace gpu { namespace device
template <typename Dist, typename T, typename Mask>
void matchDispatcher(const PtrStepSz<T>& query, const PtrStepSz<T>* trains, int n, const Mask& mask,
const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance,
int cc, cudaStream_t stream)
cudaStream_t stream)
{
(void)cc;
if (query.cols <= 64)
{
matchUnrolledCached<16, 64, Dist>(query, trains, n, mask, trainIdx, imgIdx, distance, stream);
@@ -633,153 +622,153 @@ namespace cv { namespace gpu { namespace device
template <typename T> void matchL1_gpu(const PtrStepSzb& query, const PtrStepSzb& train, const PtrStepSzb& mask,
const PtrStepSzi& trainIdx, const PtrStepSzf& distance,
int cc, cudaStream_t stream)
cudaStream_t stream)
{
if (mask.data)
{
matchDispatcher< L1Dist<T> >(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), SingleMask(mask),
trainIdx, distance,
cc, stream);
stream);
}
else
{
matchDispatcher< L1Dist<T> >(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), WithOutMask(),
trainIdx, distance,
cc, stream);
stream);
}
}
template void matchL1_gpu<uchar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream);
//template void matchL1_gpu<schar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream);
template void matchL1_gpu<ushort>(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream);
template void matchL1_gpu<short >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream);
template void matchL1_gpu<int >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream);
template void matchL1_gpu<float >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream);
template void matchL1_gpu<uchar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
//template void matchL1_gpu<schar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
template void matchL1_gpu<ushort>(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
template void matchL1_gpu<short >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
template void matchL1_gpu<int >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
template void matchL1_gpu<float >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
template <typename T> void matchL2_gpu(const PtrStepSzb& query, const PtrStepSzb& train, const PtrStepSzb& mask,
const PtrStepSzi& trainIdx, const PtrStepSzf& distance,
int cc, cudaStream_t stream)
cudaStream_t stream)
{
if (mask.data)
{
matchDispatcher<L2Dist>(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), SingleMask(mask),
trainIdx, distance,
cc, stream);
stream);
}
else
{
matchDispatcher<L2Dist>(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), WithOutMask(),
trainIdx, distance,
cc, stream);
stream);
}
}
//template void matchL2_gpu<uchar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream);
//template void matchL2_gpu<schar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream);
//template void matchL2_gpu<ushort>(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream);
//template void matchL2_gpu<short >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream);
//template void matchL2_gpu<int >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream);
template void matchL2_gpu<float >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream);
//template void matchL2_gpu<uchar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
//template void matchL2_gpu<schar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
//template void matchL2_gpu<ushort>(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
//template void matchL2_gpu<short >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
//template void matchL2_gpu<int >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
template void matchL2_gpu<float >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
template <typename T> void matchHamming_gpu(const PtrStepSzb& query, const PtrStepSzb& train, const PtrStepSzb& mask,
const PtrStepSzi& trainIdx, const PtrStepSzf& distance,
int cc, cudaStream_t stream)
cudaStream_t stream)
{
if (mask.data)
{
matchDispatcher<HammingDist>(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), SingleMask(mask),
trainIdx, distance,
cc, stream);
stream);
}
else
{
matchDispatcher<HammingDist>(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), WithOutMask(),
trainIdx, distance,
cc, stream);
stream);
}
}
template void matchHamming_gpu<uchar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream);
//template void matchHamming_gpu<schar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream);
template void matchHamming_gpu<ushort>(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream);
//template void matchHamming_gpu<short >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream);
template void matchHamming_gpu<int >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream);
template void matchHamming_gpu<uchar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
//template void matchHamming_gpu<schar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
template void matchHamming_gpu<ushort>(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
//template void matchHamming_gpu<short >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
template void matchHamming_gpu<int >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
template <typename T> void matchL1_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks,
const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance,
int cc, cudaStream_t stream)
cudaStream_t stream)
{
if (masks.data)
{
matchDispatcher< L1Dist<T> >(static_cast< PtrStepSz<T> >(query), (const PtrStepSz<T>*)trains.ptr(), trains.cols, MaskCollection(masks.data),
trainIdx, imgIdx, distance,
cc, stream);
stream);
}
else
{
matchDispatcher< L1Dist<T> >(static_cast< PtrStepSz<T> >(query), (const PtrStepSz<T>*)trains.ptr(), trains.cols, WithOutMask(),
trainIdx, imgIdx, distance,
cc, stream);
stream);
}
}
template void matchL1_gpu<uchar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream);
//template void matchL1_gpu<schar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream);
template void matchL1_gpu<ushort>(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream);
template void matchL1_gpu<short >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream);
template void matchL1_gpu<int >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream);
template void matchL1_gpu<float >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream);
template void matchL1_gpu<uchar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
//template void matchL1_gpu<schar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
template void matchL1_gpu<ushort>(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
template void matchL1_gpu<short >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
template void matchL1_gpu<int >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
template void matchL1_gpu<float >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
template <typename T> void matchL2_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks,
const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance,
int cc, cudaStream_t stream)
cudaStream_t stream)
{
if (masks.data)
{
matchDispatcher<L2Dist>(static_cast< PtrStepSz<T> >(query), (const PtrStepSz<T>*)trains.ptr(), trains.cols, MaskCollection(masks.data),
trainIdx, imgIdx, distance,
cc, stream);
stream);
}
else
{
matchDispatcher<L2Dist>(static_cast< PtrStepSz<T> >(query), (const PtrStepSz<T>*)trains.ptr(), trains.cols, WithOutMask(),
trainIdx, imgIdx, distance,
cc, stream);
stream);
}
}
//template void matchL2_gpu<uchar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream);
//template void matchL2_gpu<schar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream);
//template void matchL2_gpu<ushort>(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream);
//template void matchL2_gpu<short >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream);
//template void matchL2_gpu<int >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream);
template void matchL2_gpu<float >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& maskCollection, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream);
//template void matchL2_gpu<uchar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
//template void matchL2_gpu<schar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
//template void matchL2_gpu<ushort>(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
//template void matchL2_gpu<short >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
//template void matchL2_gpu<int >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
template void matchL2_gpu<float >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& maskCollection, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
template <typename T> void matchHamming_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks,
const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance,
int cc, cudaStream_t stream)
cudaStream_t stream)
{
if (masks.data)
{
matchDispatcher<HammingDist>(static_cast< PtrStepSz<T> >(query), (const PtrStepSz<T>*)trains.ptr(), trains.cols, MaskCollection(masks.data),
trainIdx, imgIdx, distance,
cc, stream);
stream);
}
else
{
matchDispatcher<HammingDist>(static_cast< PtrStepSz<T> >(query), (const PtrStepSz<T>*)trains.ptr(), trains.cols, WithOutMask(),
trainIdx, imgIdx, distance,
cc, stream);
stream);
}
}
template void matchHamming_gpu<uchar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream);
//template void matchHamming_gpu<schar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream);
template void matchHamming_gpu<ushort>(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream);
//template void matchHamming_gpu<short >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream);
template void matchHamming_gpu<int >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream);
template void matchHamming_gpu<uchar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
//template void matchHamming_gpu<schar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
template void matchHamming_gpu<ushort>(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
//template void matchHamming_gpu<short >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
template void matchHamming_gpu<int >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
} // namespace bf_match
}}} // namespace cv { namespace gpu { namespace device {
#endif /* CUDA_DISABLER */
#endif /* CUDA_DISABLER */

View File

@@ -42,7 +42,8 @@
#if !defined CUDA_DISABLER
#include "internal_shared.hpp"
#include "opencv2/gpu/device/common.hpp"
#include "opencv2/gpu/device/utility.hpp"
#include "opencv2/gpu/device/limits.hpp"
#include "opencv2/gpu/device/vec_distance.hpp"
#include "opencv2/gpu/device/datamov_utils.hpp"
@@ -58,8 +59,6 @@ namespace cv { namespace gpu { namespace device
__global__ void matchUnrolled(const PtrStepSz<T> query, int imgIdx, const PtrStepSz<T> train, float maxDistance, const Mask mask,
PtrStepi bestTrainIdx, PtrStepi bestImgIdx, PtrStepf bestDistance, unsigned int* nMatches, int maxCount)
{
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 110)
extern __shared__ int smem[];
const int queryIdx = blockIdx.y * BLOCK_SIZE + threadIdx.y;
@@ -110,8 +109,6 @@ namespace cv { namespace gpu { namespace device
bestDistance.ptr(queryIdx)[ind] = distVal;
}
}
#endif
}
template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
@@ -170,8 +167,6 @@ namespace cv { namespace gpu { namespace device
__global__ void match(const PtrStepSz<T> query, int imgIdx, const PtrStepSz<T> train, float maxDistance, const Mask mask,
PtrStepi bestTrainIdx, PtrStepi bestImgIdx, PtrStepf bestDistance, unsigned int* nMatches, int maxCount)
{
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 110)
extern __shared__ int smem[];
const int queryIdx = blockIdx.y * BLOCK_SIZE + threadIdx.y;
@@ -221,8 +216,6 @@ namespace cv { namespace gpu { namespace device
bestDistance.ptr(queryIdx)[ind] = distVal;
}
}
#endif
}
template <int BLOCK_SIZE, typename Dist, typename T, typename Mask>
@@ -281,9 +274,8 @@ namespace cv { namespace gpu { namespace device
template <typename Dist, typename T, typename Mask>
void matchDispatcher(const PtrStepSz<T>& query, const PtrStepSz<T>& train, float maxDistance, const Mask& mask,
const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches,
int cc, cudaStream_t stream)
cudaStream_t stream)
{
(void)cc;
if (query.cols <= 64)
{
matchUnrolled<16, 64, Dist>(query, train, maxDistance, mask, trainIdx, distance, nMatches, stream);
@@ -313,9 +305,8 @@ namespace cv { namespace gpu { namespace device
template <typename Dist, typename T>
void matchDispatcher(const PtrStepSz<T>& query, const PtrStepSz<T>* trains, int n, float maxDistance, const PtrStepSzb* masks,
const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches,
int cc, cudaStream_t stream)
cudaStream_t stream)
{
(void)cc;
if (query.cols <= 64)
{
matchUnrolled<16, 64, Dist>(query, trains, n, maxDistance, masks, trainIdx, imgIdx, distance, nMatches, stream);
@@ -347,126 +338,126 @@ namespace cv { namespace gpu { namespace device
template <typename T> void matchL1_gpu(const PtrStepSzb& query, const PtrStepSzb& train, float maxDistance, const PtrStepSzb& mask,
const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches,
int cc, cudaStream_t stream)
cudaStream_t stream)
{
if (mask.data)
{
matchDispatcher< L1Dist<T> >(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), maxDistance, SingleMask(mask),
trainIdx, distance, nMatches,
cc, stream);
stream);
}
else
{
matchDispatcher< L1Dist<T> >(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), maxDistance, WithOutMask(),
trainIdx, distance, nMatches,
cc, stream);
stream);
}
}
template void matchL1_gpu<uchar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);
//template void matchL1_gpu<schar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);
template void matchL1_gpu<ushort>(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);
template void matchL1_gpu<short >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);
template void matchL1_gpu<int >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);
template void matchL1_gpu<float >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);
template void matchL1_gpu<uchar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
//template void matchL1_gpu<schar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
template void matchL1_gpu<ushort>(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
template void matchL1_gpu<short >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
template void matchL1_gpu<int >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
template void matchL1_gpu<float >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
template <typename T> void matchL2_gpu(const PtrStepSzb& query, const PtrStepSzb& train, float maxDistance, const PtrStepSzb& mask,
const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches,
int cc, cudaStream_t stream)
cudaStream_t stream)
{
if (mask.data)
{
matchDispatcher<L2Dist>(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), maxDistance, SingleMask(mask),
trainIdx, distance, nMatches,
cc, stream);
stream);
}
else
{
matchDispatcher<L2Dist>(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), maxDistance, WithOutMask(),
trainIdx, distance, nMatches,
cc, stream);
stream);
}
}
//template void matchL2_gpu<uchar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);
//template void matchL2_gpu<schar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);
//template void matchL2_gpu<ushort>(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);
//template void matchL2_gpu<short >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);
//template void matchL2_gpu<int >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);
template void matchL2_gpu<float >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);
//template void matchL2_gpu<uchar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
//template void matchL2_gpu<schar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
//template void matchL2_gpu<ushort>(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
//template void matchL2_gpu<short >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
//template void matchL2_gpu<int >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
template void matchL2_gpu<float >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
template <typename T> void matchHamming_gpu(const PtrStepSzb& query, const PtrStepSzb& train, float maxDistance, const PtrStepSzb& mask,
const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches,
int cc, cudaStream_t stream)
cudaStream_t stream)
{
if (mask.data)
{
matchDispatcher<HammingDist>(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), maxDistance, SingleMask(mask),
trainIdx, distance, nMatches,
cc, stream);
stream);
}
else
{
matchDispatcher<HammingDist>(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), maxDistance, WithOutMask(),
trainIdx, distance, nMatches,
cc, stream);
stream);
}
}
template void matchHamming_gpu<uchar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);
//template void matchHamming_gpu<schar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);
template void matchHamming_gpu<ushort>(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);
//template void matchHamming_gpu<short >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);
template void matchHamming_gpu<int >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);
template void matchHamming_gpu<uchar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
//template void matchHamming_gpu<schar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
template void matchHamming_gpu<ushort>(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
//template void matchHamming_gpu<short >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
template void matchHamming_gpu<int >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
template <typename T> void matchL1_gpu(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks,
const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches,
int cc, cudaStream_t stream)
cudaStream_t stream)
{
matchDispatcher< L1Dist<T> >(static_cast< PtrStepSz<T> >(query), (const PtrStepSz<T>*)trains, n, maxDistance, masks,
trainIdx, imgIdx, distance, nMatches,
cc, stream);
stream);
}
template void matchL1_gpu<uchar >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);
//template void matchL1_gpu<schar >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);
template void matchL1_gpu<ushort>(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);
template void matchL1_gpu<short >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);
template void matchL1_gpu<int >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);
template void matchL1_gpu<float >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);
template void matchL1_gpu<uchar >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
//template void matchL1_gpu<schar >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
template void matchL1_gpu<ushort>(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
template void matchL1_gpu<short >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
template void matchL1_gpu<int >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
template void matchL1_gpu<float >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
template <typename T> void matchL2_gpu(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks,
const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches,
int cc, cudaStream_t stream)
cudaStream_t stream)
{
matchDispatcher<L2Dist>(static_cast< PtrStepSz<T> >(query), (const PtrStepSz<T>*)trains, n, maxDistance, masks,
trainIdx, imgIdx, distance, nMatches,
cc, stream);
stream);
}
//template void matchL2_gpu<uchar >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);
//template void matchL2_gpu<schar >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);
//template void matchL2_gpu<ushort>(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);
//template void matchL2_gpu<short >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);
//template void matchL2_gpu<int >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);
template void matchL2_gpu<float >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);
//template void matchL2_gpu<uchar >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
//template void matchL2_gpu<schar >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
//template void matchL2_gpu<ushort>(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
//template void matchL2_gpu<short >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
//template void matchL2_gpu<int >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
template void matchL2_gpu<float >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
template <typename T> void matchHamming_gpu(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks,
const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches,
int cc, cudaStream_t stream)
cudaStream_t stream)
{
matchDispatcher<HammingDist>(static_cast< PtrStepSz<T> >(query), (const PtrStepSz<T>*)trains, n, maxDistance, masks,
trainIdx, imgIdx, distance, nMatches,
cc, stream);
stream);
}
template void matchHamming_gpu<uchar >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);
//template void matchHamming_gpu<schar >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);
template void matchHamming_gpu<ushort>(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);
//template void matchHamming_gpu<short >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);
template void matchHamming_gpu<int >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);
template void matchHamming_gpu<uchar >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
//template void matchHamming_gpu<schar >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
template void matchHamming_gpu<ushort>(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
//template void matchHamming_gpu<short >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
template void matchHamming_gpu<int >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
} // namespace bf_radius_match
}}} // namespace cv { namespace gpu { namespace device
#endif /* CUDA_DISABLER */
#endif /* CUDA_DISABLER */

View File

@@ -42,9 +42,10 @@
#if !defined CUDA_DISABLER
#include "internal_shared.hpp"
#include "opencv2/gpu/device/common.hpp"
#include "opencv2/gpu/device/transform.hpp"
#include "opencv2/gpu/device/functional.hpp"
#include "opencv2/gpu/device/reduce.hpp"
namespace cv { namespace gpu { namespace device
{
@@ -66,6 +67,8 @@ namespace cv { namespace gpu { namespace device
crot1.x * p.x + crot1.y * p.y + crot1.z * p.z + ctransl.y,
crot2.x * p.x + crot2.y * p.y + crot2.z * p.z + ctransl.z);
}
__device__ __forceinline__ TransformOp() {}
__device__ __forceinline__ TransformOp(const TransformOp&) {}
};
void call(const PtrStepSz<float3> src, const float* rot,
@@ -103,6 +106,8 @@ namespace cv { namespace gpu { namespace device
(cproj0.x * t.x + cproj0.y * t.y) / t.z + cproj0.z,
(cproj1.x * t.x + cproj1.y * t.y) / t.z + cproj1.z);
}
__device__ __forceinline__ ProjectOp() {}
__device__ __forceinline__ ProjectOp(const ProjectOp&) {}
};
void call(const PtrStepSz<float3> src, const float* rot,
@@ -134,6 +139,7 @@ namespace cv { namespace gpu { namespace device
return x * x;
}
template <int BLOCK_SIZE>
__global__ void computeHypothesisScoresKernel(
const int num_points, const float3* object, const float2* image,
const float dist_threshold, int* g_num_inliers)
@@ -156,19 +162,11 @@ namespace cv { namespace gpu { namespace device
++num_inliers;
}
extern __shared__ float s_num_inliers[];
s_num_inliers[threadIdx.x] = num_inliers;
__syncthreads();
for (int step = blockDim.x / 2; step > 0; step >>= 1)
{
if (threadIdx.x < step)
s_num_inliers[threadIdx.x] += s_num_inliers[threadIdx.x + step];
__syncthreads();
}
__shared__ int s_num_inliers[BLOCK_SIZE];
reduce<BLOCK_SIZE>(s_num_inliers, num_inliers, threadIdx.x, plus<int>());
if (threadIdx.x == 0)
g_num_inliers[blockIdx.x] = s_num_inliers[0];
g_num_inliers[blockIdx.x] = num_inliers;
}
void computeHypothesisScores(
@@ -181,9 +179,8 @@ namespace cv { namespace gpu { namespace device
dim3 threads(256);
dim3 grid(num_hypotheses);
int smem_size = threads.x * sizeof(float);
computeHypothesisScoresKernel<<<grid, threads, smem_size>>>(
computeHypothesisScoresKernel<256><<<grid, threads>>>(
num_points, object, image, dist_threshold, hypothesis_scores);
cudaSafeCall( cudaGetLastError() );
@@ -193,4 +190,4 @@ namespace cv { namespace gpu { namespace device
}}} // namespace cv { namespace gpu { namespace device
#endif /* CUDA_DISABLER */
#endif /* CUDA_DISABLER */

View File

@@ -43,459 +43,451 @@
#if !defined CUDA_DISABLER
#include <utility>
#include <algorithm>
#include "internal_shared.hpp"
#include "opencv2/gpu/device/common.hpp"
#include "opencv2/gpu/device/emulation.hpp"
#include "opencv2/gpu/device/transform.hpp"
#include "opencv2/gpu/device/functional.hpp"
#include "opencv2/gpu/device/utility.hpp"
using namespace cv::gpu;
using namespace cv::gpu::device;
namespace canny
{
struct L1 : binary_function<int, int, float>
{
__device__ __forceinline__ float operator ()(int x, int y) const
{
return ::abs(x) + ::abs(y);
}
__device__ __forceinline__ L1() {}
__device__ __forceinline__ L1(const L1&) {}
};
struct L2 : binary_function<int, int, float>
{
__device__ __forceinline__ float operator ()(int x, int y) const
{
return ::sqrtf(x * x + y * y);
}
__device__ __forceinline__ L2() {}
__device__ __forceinline__ L2(const L2&) {}
};
}
namespace cv { namespace gpu { namespace device
{
namespace canny
template <> struct TransformFunctorTraits<canny::L1> : DefaultTransformFunctorTraits<canny::L1>
{
__global__ void calcSobelRowPass(const PtrStepb src, PtrStepi dx_buf, PtrStepi dy_buf, int rows, int cols)
enum { smart_shift = 4 };
};
template <> struct TransformFunctorTraits<canny::L2> : DefaultTransformFunctorTraits<canny::L2>
{
enum { smart_shift = 4 };
};
}}}
namespace canny
{
texture<uchar, cudaTextureType2D, cudaReadModeElementType> tex_src(false, cudaFilterModePoint, cudaAddressModeClamp);
struct SrcTex
{
const int xoff;
const int yoff;
__host__ SrcTex(int _xoff, int _yoff) : xoff(_xoff), yoff(_yoff) {}
__device__ __forceinline__ int operator ()(int y, int x) const
{
__shared__ int smem[16][18];
return tex2D(tex_src, x + xoff, y + yoff);
}
};
const int j = blockIdx.x * blockDim.x + threadIdx.x;
const int i = blockIdx.y * blockDim.y + threadIdx.y;
template <class Norm> __global__
void calcMagnitudeKernel(const SrcTex src, PtrStepi dx, PtrStepi dy, PtrStepSzf mag, const Norm norm)
{
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
if (i < rows)
{
smem[threadIdx.y][threadIdx.x + 1] = src.ptr(i)[j];
if (threadIdx.x == 0)
{
smem[threadIdx.y][0] = src.ptr(i)[::max(j - 1, 0)];
smem[threadIdx.y][17] = src.ptr(i)[::min(j + 16, cols - 1)];
}
__syncthreads();
if (y >= mag.rows || x >= mag.cols)
return;
if (j < cols)
{
dx_buf.ptr(i)[j] = -smem[threadIdx.y][threadIdx.x] + smem[threadIdx.y][threadIdx.x + 2];
dy_buf.ptr(i)[j] = smem[threadIdx.y][threadIdx.x] + 2 * smem[threadIdx.y][threadIdx.x + 1] + smem[threadIdx.y][threadIdx.x + 2];
}
}
int dxVal = (src(y - 1, x + 1) + 2 * src(y, x + 1) + src(y + 1, x + 1)) - (src(y - 1, x - 1) + 2 * src(y, x - 1) + src(y + 1, x - 1));
int dyVal = (src(y + 1, x - 1) + 2 * src(y + 1, x) + src(y + 1, x + 1)) - (src(y - 1, x - 1) + 2 * src(y - 1, x) + src(y - 1, x + 1));
dx(y, x) = dxVal;
dy(y, x) = dyVal;
mag(y, x) = norm(dxVal, dyVal);
}
void calcMagnitude(PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzi dx, PtrStepSzi dy, PtrStepSzf mag, bool L2Grad)
{
const dim3 block(16, 16);
const dim3 grid(divUp(mag.cols, block.x), divUp(mag.rows, block.y));
bindTexture(&tex_src, srcWhole);
SrcTex src(xoff, yoff);
if (L2Grad)
{
L2 norm;
calcMagnitudeKernel<<<grid, block>>>(src, dx, dy, mag, norm);
}
else
{
L1 norm;
calcMagnitudeKernel<<<grid, block>>>(src, dx, dy, mag, norm);
}
void calcSobelRowPass_gpu(PtrStepb src, PtrStepi dx_buf, PtrStepi dy_buf, int rows, int cols)
cudaSafeCall( cudaGetLastError() );
cudaSafeCall(cudaThreadSynchronize());
}
void calcMagnitude(PtrStepSzi dx, PtrStepSzi dy, PtrStepSzf mag, bool L2Grad)
{
if (L2Grad)
{
dim3 block(16, 16, 1);
dim3 grid(divUp(cols, block.x), divUp(rows, block.y), 1);
calcSobelRowPass<<<grid, block>>>(src, dx_buf, dy_buf, rows, cols);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaDeviceSynchronize() );
L2 norm;
transform(dx, dy, mag, norm, WithOutMask(), 0);
}
struct L1
else
{
static __device__ __forceinline__ float calc(int x, int y)
{
return ::abs(x) + ::abs(y);
}
};
struct L2
{
static __device__ __forceinline__ float calc(int x, int y)
{
return ::sqrtf(x * x + y * y);
}
};
template <typename Norm> __global__ void calcMagnitude(const PtrStepi dx_buf, const PtrStepi dy_buf,
PtrStepi dx, PtrStepi dy, PtrStepf mag, int rows, int cols)
{
__shared__ int sdx[18][16];
__shared__ int sdy[18][16];
const int j = blockIdx.x * blockDim.x + threadIdx.x;
const int i = blockIdx.y * blockDim.y + threadIdx.y;
if (j < cols)
{
sdx[threadIdx.y + 1][threadIdx.x] = dx_buf.ptr(i)[j];
sdy[threadIdx.y + 1][threadIdx.x] = dy_buf.ptr(i)[j];
if (threadIdx.y == 0)
{
sdx[0][threadIdx.x] = dx_buf.ptr(::max(i - 1, 0))[j];
sdx[17][threadIdx.x] = dx_buf.ptr(::min(i + 16, rows - 1))[j];
sdy[0][threadIdx.x] = dy_buf.ptr(::max(i - 1, 0))[j];
sdy[17][threadIdx.x] = dy_buf.ptr(::min(i + 16, rows - 1))[j];
}
__syncthreads();
if (i < rows)
{
int x = sdx[threadIdx.y][threadIdx.x] + 2 * sdx[threadIdx.y + 1][threadIdx.x] + sdx[threadIdx.y + 2][threadIdx.x];
int y = -sdy[threadIdx.y][threadIdx.x] + sdy[threadIdx.y + 2][threadIdx.x];
dx.ptr(i)[j] = x;
dy.ptr(i)[j] = y;
mag.ptr(i + 1)[j + 1] = Norm::calc(x, y);
}
}
L1 norm;
transform(dx, dy, mag, norm, WithOutMask(), 0);
}
}
}
void calcMagnitude_gpu(PtrStepi dx_buf, PtrStepi dy_buf, PtrStepi dx, PtrStepi dy, PtrStepf mag, int rows, int cols, bool L2Grad)
//////////////////////////////////////////////////////////////////////////////////////////
namespace canny
{
texture<float, cudaTextureType2D, cudaReadModeElementType> tex_mag(false, cudaFilterModePoint, cudaAddressModeClamp);
__global__ void calcMapKernel(const PtrStepSzi dx, const PtrStepi dy, PtrStepi map, const float low_thresh, const float high_thresh)
{
const int CANNY_SHIFT = 15;
const int TG22 = (int)(0.4142135623730950488016887242097*(1<<CANNY_SHIFT) + 0.5);
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
if (x == 0 || x >= dx.cols - 1 || y == 0 || y >= dx.rows - 1)
return;
int dxVal = dx(y, x);
int dyVal = dy(y, x);
const int s = (dxVal ^ dyVal) < 0 ? -1 : 1;
const float m = tex2D(tex_mag, x, y);
dxVal = ::abs(dxVal);
dyVal = ::abs(dyVal);
// 0 - the pixel can not belong to an edge
// 1 - the pixel might belong to an edge
// 2 - the pixel does belong to an edge
int edge_type = 0;
if (m > low_thresh)
{
dim3 block(16, 16, 1);
dim3 grid(divUp(cols, block.x), divUp(rows, block.y), 1);
const int tg22x = dxVal * TG22;
const int tg67x = tg22x + ((dxVal + dxVal) << CANNY_SHIFT);
if (L2Grad)
calcMagnitude<L2><<<grid, block>>>(dx_buf, dy_buf, dx, dy, mag, rows, cols);
dyVal <<= CANNY_SHIFT;
if (dyVal < tg22x)
{
if (m > tex2D(tex_mag, x - 1, y) && m >= tex2D(tex_mag, x + 1, y))
edge_type = 1 + (int)(m > high_thresh);
}
else if(dyVal > tg67x)
{
if (m > tex2D(tex_mag, x, y - 1) && m >= tex2D(tex_mag, x, y + 1))
edge_type = 1 + (int)(m > high_thresh);
}
else
calcMagnitude<L1><<<grid, block>>>(dx_buf, dy_buf, dx, dy, mag, rows, cols);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall(cudaThreadSynchronize());
{
if (m > tex2D(tex_mag, x - s, y - 1) && m >= tex2D(tex_mag, x + s, y + 1))
edge_type = 1 + (int)(m > high_thresh);
}
}
template <typename Norm> __global__ void calcMagnitude(PtrStepi dx, PtrStepi dy, PtrStepf mag, int rows, int cols)
{
const int j = blockIdx.x * blockDim.x + threadIdx.x;
const int i = blockIdx.y * blockDim.y + threadIdx.y;
map(y, x) = edge_type;
}
if (i < rows && j < cols)
mag.ptr(i + 1)[j + 1] = Norm::calc(dx.ptr(i)[j], dy.ptr(i)[j]);
void calcMap(PtrStepSzi dx, PtrStepSzi dy, PtrStepSzf mag, PtrStepSzi map, float low_thresh, float high_thresh)
{
const dim3 block(16, 16);
const dim3 grid(divUp(dx.cols, block.x), divUp(dx.rows, block.y));
bindTexture(&tex_mag, mag);
calcMapKernel<<<grid, block>>>(dx, dy, map, low_thresh, high_thresh);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaDeviceSynchronize() );
}
}
//////////////////////////////////////////////////////////////////////////////////////////
namespace canny
{
__device__ int counter = 0;
__global__ void edgesHysteresisLocalKernel(PtrStepSzi map, ushort2* st)
{
__shared__ volatile int smem[18][18];
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
smem[threadIdx.y + 1][threadIdx.x + 1] = x < map.cols && y < map.rows ? map(y, x) : 0;
if (threadIdx.y == 0)
smem[0][threadIdx.x + 1] = y > 0 ? map(y - 1, x) : 0;
if (threadIdx.y == blockDim.y - 1)
smem[blockDim.y + 1][threadIdx.x + 1] = y + 1 < map.rows ? map(y + 1, x) : 0;
if (threadIdx.x == 0)
smem[threadIdx.y + 1][0] = x > 0 ? map(y, x - 1) : 0;
if (threadIdx.x == blockDim.x - 1)
smem[threadIdx.y + 1][blockDim.x + 1] = x + 1 < map.cols ? map(y, x + 1) : 0;
if (threadIdx.x == 0 && threadIdx.y == 0)
smem[0][0] = y > 0 && x > 0 ? map(y - 1, x - 1) : 0;
if (threadIdx.x == blockDim.x - 1 && threadIdx.y == 0)
smem[0][blockDim.x + 1] = y > 0 && x + 1 < map.cols ? map(y - 1, x + 1) : 0;
if (threadIdx.x == 0 && threadIdx.y == blockDim.y - 1)
smem[blockDim.y + 1][0] = y + 1 < map.rows && x > 0 ? map(y + 1, x - 1) : 0;
if (threadIdx.x == blockDim.x - 1 && threadIdx.y == blockDim.y - 1)
smem[blockDim.y + 1][blockDim.x + 1] = y + 1 < map.rows && x + 1 < map.cols ? map(y + 1, x + 1) : 0;
__syncthreads();
if (x >= map.cols || y >= map.rows)
return;
int n;
#pragma unroll
for (int k = 0; k < 16; ++k)
{
n = 0;
if (smem[threadIdx.y + 1][threadIdx.x + 1] == 1)
{
n += smem[threadIdx.y ][threadIdx.x ] == 2;
n += smem[threadIdx.y ][threadIdx.x + 1] == 2;
n += smem[threadIdx.y ][threadIdx.x + 2] == 2;
n += smem[threadIdx.y + 1][threadIdx.x ] == 2;
n += smem[threadIdx.y + 1][threadIdx.x + 2] == 2;
n += smem[threadIdx.y + 2][threadIdx.x ] == 2;
n += smem[threadIdx.y + 2][threadIdx.x + 1] == 2;
n += smem[threadIdx.y + 2][threadIdx.x + 2] == 2;
}
if (n > 0)
smem[threadIdx.y + 1][threadIdx.x + 1] = 2;
}
void calcMagnitude_gpu(PtrStepi dx, PtrStepi dy, PtrStepf mag, int rows, int cols, bool L2Grad)
const int e = smem[threadIdx.y + 1][threadIdx.x + 1];
map(y, x) = e;
n = 0;
if (e == 2)
{
dim3 block(16, 16, 1);
dim3 grid(divUp(cols, block.x), divUp(rows, block.y), 1);
n += smem[threadIdx.y ][threadIdx.x ] == 1;
n += smem[threadIdx.y ][threadIdx.x + 1] == 1;
n += smem[threadIdx.y ][threadIdx.x + 2] == 1;
if (L2Grad)
calcMagnitude<L2><<<grid, block>>>(dx, dy, mag, rows, cols);
else
calcMagnitude<L1><<<grid, block>>>(dx, dy, mag, rows, cols);
n += smem[threadIdx.y + 1][threadIdx.x ] == 1;
n += smem[threadIdx.y + 1][threadIdx.x + 2] == 1;
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaDeviceSynchronize() );
n += smem[threadIdx.y + 2][threadIdx.x ] == 1;
n += smem[threadIdx.y + 2][threadIdx.x + 1] == 1;
n += smem[threadIdx.y + 2][threadIdx.x + 2] == 1;
}
//////////////////////////////////////////////////////////////////////////////////////////
#define CANNY_SHIFT 15
#define TG22 (int)(0.4142135623730950488016887242097*(1<<CANNY_SHIFT) + 0.5)
__global__ void calcMap(const PtrStepi dx, const PtrStepi dy, const PtrStepf mag, PtrStepi map, int rows, int cols, float low_thresh, float high_thresh)
if (n > 0)
{
__shared__ float smem[18][18];
const int ind = ::atomicAdd(&counter, 1);
st[ind] = make_ushort2(x, y);
}
}
const int j = blockIdx.x * 16 + threadIdx.x;
const int i = blockIdx.y * 16 + threadIdx.y;
void edgesHysteresisLocal(PtrStepSzi map, ushort2* st1)
{
void* counter_ptr;
cudaSafeCall( cudaGetSymbolAddress(&counter_ptr, counter) );
const int tid = threadIdx.y * 16 + threadIdx.x;
const int lx = tid % 18;
const int ly = tid / 18;
cudaSafeCall( cudaMemset(counter_ptr, 0, sizeof(int)) );
if (ly < 14)
smem[ly][lx] = mag.ptr(blockIdx.y * 16 + ly)[blockIdx.x * 16 + lx];
const dim3 block(16, 16);
const dim3 grid(divUp(map.cols, block.x), divUp(map.rows, block.y));
if (ly < 4 && blockIdx.y * 16 + ly + 14 <= rows && blockIdx.x * 16 + lx <= cols)
smem[ly + 14][lx] = mag.ptr(blockIdx.y * 16 + ly + 14)[blockIdx.x * 16 + lx];
edgesHysteresisLocalKernel<<<grid, block>>>(map, st1);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaDeviceSynchronize() );
}
}
//////////////////////////////////////////////////////////////////////////////////////////
namespace canny
{
__constant__ int c_dx[8] = {-1, 0, 1, -1, 1, -1, 0, 1};
__constant__ int c_dy[8] = {-1, -1, -1, 0, 0, 1, 1, 1};
__global__ void edgesHysteresisGlobalKernel(PtrStepSzi map, ushort2* st1, ushort2* st2, const int count)
{
const int stack_size = 512;
__shared__ int s_counter;
__shared__ int s_ind;
__shared__ ushort2 s_st[stack_size];
if (threadIdx.x == 0)
s_counter = 0;
__syncthreads();
int ind = blockIdx.y * gridDim.x + blockIdx.x;
if (ind >= count)
return;
ushort2 pos = st1[ind];
if (threadIdx.x < 8)
{
pos.x += c_dx[threadIdx.x];
pos.y += c_dy[threadIdx.x];
if (pos.x > 0 && pos.x < map.cols && pos.y > 0 && pos.y < map.rows && map(pos.y, pos.x) == 1)
{
map(pos.y, pos.x) = 2;
ind = Emulation::smem::atomicAdd(&s_counter, 1);
s_st[ind] = pos;
}
}
__syncthreads();
while (s_counter > 0 && s_counter <= stack_size - blockDim.x)
{
const int subTaskIdx = threadIdx.x >> 3;
const int portion = ::min(s_counter, blockDim.x >> 3);
if (subTaskIdx < portion)
pos = s_st[s_counter - 1 - subTaskIdx];
__syncthreads();
if (i < rows && j < cols)
{
int x = dx.ptr(i)[j];
int y = dy.ptr(i)[j];
const int s = (x ^ y) < 0 ? -1 : 1;
const float m = smem[threadIdx.y + 1][threadIdx.x + 1];
x = ::abs(x);
y = ::abs(y);
// 0 - the pixel can not belong to an edge
// 1 - the pixel might belong to an edge
// 2 - the pixel does belong to an edge
int edge_type = 0;
if (m > low_thresh)
{
const int tg22x = x * TG22;
const int tg67x = tg22x + ((x + x) << CANNY_SHIFT);
y <<= CANNY_SHIFT;
if (y < tg22x)
{
if (m > smem[threadIdx.y + 1][threadIdx.x] && m >= smem[threadIdx.y + 1][threadIdx.x + 2])
edge_type = 1 + (int)(m > high_thresh);
}
else if( y > tg67x )
{
if (m > smem[threadIdx.y][threadIdx.x + 1] && m >= smem[threadIdx.y + 2][threadIdx.x + 1])
edge_type = 1 + (int)(m > high_thresh);
}
else
{
if (m > smem[threadIdx.y][threadIdx.x + 1 - s] && m > smem[threadIdx.y + 2][threadIdx.x + 1 + s])
edge_type = 1 + (int)(m > high_thresh);
}
}
map.ptr(i + 1)[j + 1] = edge_type;
}
}
#undef CANNY_SHIFT
#undef TG22
void calcMap_gpu(PtrStepi dx, PtrStepi dy, PtrStepf mag, PtrStepi map, int rows, int cols, float low_thresh, float high_thresh)
{
dim3 block(16, 16, 1);
dim3 grid(divUp(cols, block.x), divUp(rows, block.y), 1);
calcMap<<<grid, block>>>(dx, dy, mag, map, rows, cols, low_thresh, high_thresh);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaDeviceSynchronize() );
}
//////////////////////////////////////////////////////////////////////////////////////////
__device__ unsigned int counter = 0;
__global__ void edgesHysteresisLocal(PtrStepi map, ushort2* st, int rows, int cols)
{
#if defined (__CUDA_ARCH__) && (__CUDA_ARCH__ >= 120)
__shared__ int smem[18][18];
const int j = blockIdx.x * 16 + threadIdx.x;
const int i = blockIdx.y * 16 + threadIdx.y;
const int tid = threadIdx.y * 16 + threadIdx.x;
const int lx = tid % 18;
const int ly = tid / 18;
if (ly < 14)
smem[ly][lx] = map.ptr(blockIdx.y * 16 + ly)[blockIdx.x * 16 + lx];
if (ly < 4 && blockIdx.y * 16 + ly + 14 <= rows && blockIdx.x * 16 + lx <= cols)
smem[ly + 14][lx] = map.ptr(blockIdx.y * 16 + ly + 14)[blockIdx.x * 16 + lx];
__syncthreads();
if (i < rows && j < cols)
{
int n;
#pragma unroll
for (int k = 0; k < 16; ++k)
{
n = 0;
if (smem[threadIdx.y + 1][threadIdx.x + 1] == 1)
{
n += smem[threadIdx.y ][threadIdx.x ] == 2;
n += smem[threadIdx.y ][threadIdx.x + 1] == 2;
n += smem[threadIdx.y ][threadIdx.x + 2] == 2;
n += smem[threadIdx.y + 1][threadIdx.x ] == 2;
n += smem[threadIdx.y + 1][threadIdx.x + 2] == 2;
n += smem[threadIdx.y + 2][threadIdx.x ] == 2;
n += smem[threadIdx.y + 2][threadIdx.x + 1] == 2;
n += smem[threadIdx.y + 2][threadIdx.x + 2] == 2;
}
if (n > 0)
smem[threadIdx.y + 1][threadIdx.x + 1] = 2;
}
const int e = smem[threadIdx.y + 1][threadIdx.x + 1];
map.ptr(i + 1)[j + 1] = e;
n = 0;
if (e == 2)
{
n += smem[threadIdx.y ][threadIdx.x ] == 1;
n += smem[threadIdx.y ][threadIdx.x + 1] == 1;
n += smem[threadIdx.y ][threadIdx.x + 2] == 1;
n += smem[threadIdx.y + 1][threadIdx.x ] == 1;
n += smem[threadIdx.y + 1][threadIdx.x + 2] == 1;
n += smem[threadIdx.y + 2][threadIdx.x ] == 1;
n += smem[threadIdx.y + 2][threadIdx.x + 1] == 1;
n += smem[threadIdx.y + 2][threadIdx.x + 2] == 1;
}
if (n > 0)
{
const unsigned int ind = atomicInc(&counter, (unsigned int)(-1));
st[ind] = make_ushort2(j + 1, i + 1);
}
}
#endif
}
void edgesHysteresisLocal_gpu(PtrStepi map, ushort2* st1, int rows, int cols)
{
void* counter_ptr;
cudaSafeCall( cudaGetSymbolAddress(&counter_ptr, counter) );
cudaSafeCall( cudaMemset(counter_ptr, 0, sizeof(unsigned int)) );
dim3 block(16, 16, 1);
dim3 grid(divUp(cols, block.x), divUp(rows, block.y), 1);
edgesHysteresisLocal<<<grid, block>>>(map, st1, rows, cols);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaDeviceSynchronize() );
}
__constant__ int c_dx[8] = {-1, 0, 1, -1, 1, -1, 0, 1};
__constant__ int c_dy[8] = {-1, -1, -1, 0, 0, 1, 1, 1};
__global__ void edgesHysteresisGlobal(PtrStepi map, ushort2* st1, ushort2* st2, int rows, int cols, int count)
{
#if defined (__CUDA_ARCH__) && __CUDA_ARCH__ >= 120
const int stack_size = 512;
__shared__ unsigned int s_counter;
__shared__ unsigned int s_ind;
__shared__ ushort2 s_st[stack_size];
if (threadIdx.x == 0)
s_counter = 0;
s_counter -= portion;
__syncthreads();
int ind = blockIdx.y * gridDim.x + blockIdx.x;
if (ind < count)
if (subTaskIdx < portion)
{
ushort2 pos = st1[ind];
pos.x += c_dx[threadIdx.x & 7];
pos.y += c_dy[threadIdx.x & 7];
if (pos.x > 0 && pos.x <= cols && pos.y > 0 && pos.y <= rows)
if (pos.x > 0 && pos.x < map.cols && pos.y > 0 && pos.y < map.rows && map(pos.y, pos.x) == 1)
{
if (threadIdx.x < 8)
{
pos.x += c_dx[threadIdx.x];
pos.y += c_dy[threadIdx.x];
map(pos.y, pos.x) = 2;
if (map.ptr(pos.y)[pos.x] == 1)
{
map.ptr(pos.y)[pos.x] = 2;
ind = Emulation::smem::atomicAdd(&s_counter, 1);
ind = atomicInc(&s_counter, (unsigned int)(-1));
s_st[ind] = pos;
}
}
__syncthreads();
while (s_counter > 0 && s_counter <= stack_size - blockDim.x)
{
const int subTaskIdx = threadIdx.x >> 3;
const int portion = ::min(s_counter, blockDim.x >> 3);
pos.x = pos.y = 0;
if (subTaskIdx < portion)
pos = s_st[s_counter - 1 - subTaskIdx];
__syncthreads();
if (threadIdx.x == 0)
s_counter -= portion;
__syncthreads();
if (pos.x > 0 && pos.x <= cols && pos.y > 0 && pos.y <= rows)
{
pos.x += c_dx[threadIdx.x & 7];
pos.y += c_dy[threadIdx.x & 7];
if (map.ptr(pos.y)[pos.x] == 1)
{
map.ptr(pos.y)[pos.x] = 2;
ind = atomicInc(&s_counter, (unsigned int)(-1));
s_st[ind] = pos;
}
}
__syncthreads();
}
if (s_counter > 0)
{
if (threadIdx.x == 0)
{
ind = atomicAdd(&counter, s_counter);
s_ind = ind - s_counter;
}
__syncthreads();
ind = s_ind;
for (int i = threadIdx.x; i < s_counter; i += blockDim.x)
{
st2[ind + i] = s_st[i];
}
}
s_st[ind] = pos;
}
}
#endif
__syncthreads();
}
void edgesHysteresisGlobal_gpu(PtrStepi map, ushort2* st1, ushort2* st2, int rows, int cols)
if (s_counter > 0)
{
void* counter_ptr;
cudaSafeCall( cudaGetSymbolAddress(&counter_ptr, counter) );
unsigned int count;
cudaSafeCall( cudaMemcpy(&count, counter_ptr, sizeof(unsigned int), cudaMemcpyDeviceToHost) );
while (count > 0)
if (threadIdx.x == 0)
{
cudaSafeCall( cudaMemset(counter_ptr, 0, sizeof(unsigned int)) );
dim3 block(128, 1, 1);
dim3 grid(std::min(count, 65535u), divUp(count, 65535), 1);
edgesHysteresisGlobal<<<grid, block>>>(map, st1, st2, rows, cols, count);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaDeviceSynchronize() );
cudaSafeCall( cudaMemcpy(&count, counter_ptr, sizeof(unsigned int), cudaMemcpyDeviceToHost) );
std::swap(st1, st2);
ind = ::atomicAdd(&counter, s_counter);
s_ind = ind - s_counter;
}
__syncthreads();
ind = s_ind;
for (int i = threadIdx.x; i < s_counter; i += blockDim.x)
st2[ind + i] = s_st[i];
}
}
__global__ void getEdges(PtrStepi map, PtrStepb dst, int rows, int cols)
void edgesHysteresisGlobal(PtrStepSzi map, ushort2* st1, ushort2* st2)
{
void* counter_ptr;
cudaSafeCall( cudaGetSymbolAddress(&counter_ptr, canny::counter) );
int count;
cudaSafeCall( cudaMemcpy(&count, counter_ptr, sizeof(int), cudaMemcpyDeviceToHost) );
while (count > 0)
{
const int j = blockIdx.x * 16 + threadIdx.x;
const int i = blockIdx.y * 16 + threadIdx.y;
cudaSafeCall( cudaMemset(counter_ptr, 0, sizeof(int)) );
if (i < rows && j < cols)
dst.ptr(i)[j] = (uchar)(-(map.ptr(i + 1)[j + 1] >> 1));
}
const dim3 block(128);
const dim3 grid(::min(count, 65535u), divUp(count, 65535), 1);
void getEdges_gpu(PtrStepi map, PtrStepb dst, int rows, int cols)
{
dim3 block(16, 16, 1);
dim3 grid(divUp(cols, block.x), divUp(rows, block.y), 1);
getEdges<<<grid, block>>>(map, dst, rows, cols);
edgesHysteresisGlobalKernel<<<grid, block>>>(map, st1, st2, count);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaDeviceSynchronize() );
cudaSafeCall( cudaMemcpy(&count, counter_ptr, sizeof(int), cudaMemcpyDeviceToHost) );
std::swap(st1, st2);
}
} // namespace canny
}}} // namespace cv { namespace gpu { namespace device
}
}
//////////////////////////////////////////////////////////////////////////////////////////
#endif /* CUDA_DISABLER */
namespace canny
{
struct GetEdges : unary_function<int, uchar>
{
__device__ __forceinline__ uchar operator ()(int e) const
{
return (uchar)(-(e >> 1));
}
__device__ __forceinline__ GetEdges() {}
__device__ __forceinline__ GetEdges(const GetEdges&) {}
};
}
namespace cv { namespace gpu { namespace device
{
template <> struct TransformFunctorTraits<canny::GetEdges> : DefaultTransformFunctorTraits<canny::GetEdges>
{
enum { smart_shift = 4 };
};
}}}
namespace canny
{
void getEdges(PtrStepSzi map, PtrStepSzb dst)
{
transform(map, dst, GetEdges(), WithOutMask(), 0);
}
}
#endif /* CUDA_DISABLER */

File diff suppressed because it is too large Load Diff

View File

@@ -46,6 +46,8 @@
#include "opencv2/gpu/device/vec_math.hpp"
#include "opencv2/gpu/device/limits.hpp"
#include "opencv2/gpu/device/utility.hpp"
#include "opencv2/gpu/device/reduce.hpp"
#include "opencv2/gpu/device/functional.hpp"
#include "fgd_bgfg_common.hpp"
using namespace cv::gpu;
@@ -181,57 +183,8 @@ namespace bgfg
__shared__ unsigned int data1[MERGE_THREADBLOCK_SIZE];
__shared__ unsigned int data2[MERGE_THREADBLOCK_SIZE];
data0[threadIdx.x] = sum0;
data1[threadIdx.x] = sum1;
data2[threadIdx.x] = sum2;
__syncthreads();
if (threadIdx.x < 128)
{
data0[threadIdx.x] = sum0 += data0[threadIdx.x + 128];
data1[threadIdx.x] = sum1 += data1[threadIdx.x + 128];
data2[threadIdx.x] = sum2 += data2[threadIdx.x + 128];
}
__syncthreads();
if (threadIdx.x < 64)
{
data0[threadIdx.x] = sum0 += data0[threadIdx.x + 64];
data1[threadIdx.x] = sum1 += data1[threadIdx.x + 64];
data2[threadIdx.x] = sum2 += data2[threadIdx.x + 64];
}
__syncthreads();
if (threadIdx.x < 32)
{
volatile unsigned int* vdata0 = data0;
volatile unsigned int* vdata1 = data1;
volatile unsigned int* vdata2 = data2;
vdata0[threadIdx.x] = sum0 += vdata0[threadIdx.x + 32];
vdata1[threadIdx.x] = sum1 += vdata1[threadIdx.x + 32];
vdata2[threadIdx.x] = sum2 += vdata2[threadIdx.x + 32];
vdata0[threadIdx.x] = sum0 += vdata0[threadIdx.x + 16];
vdata1[threadIdx.x] = sum1 += vdata1[threadIdx.x + 16];
vdata2[threadIdx.x] = sum2 += vdata2[threadIdx.x + 16];
vdata0[threadIdx.x] = sum0 += vdata0[threadIdx.x + 8];
vdata1[threadIdx.x] = sum1 += vdata1[threadIdx.x + 8];
vdata2[threadIdx.x] = sum2 += vdata2[threadIdx.x + 8];
vdata0[threadIdx.x] = sum0 += vdata0[threadIdx.x + 4];
vdata1[threadIdx.x] = sum1 += vdata1[threadIdx.x + 4];
vdata2[threadIdx.x] = sum2 += vdata2[threadIdx.x + 4];
vdata0[threadIdx.x] = sum0 += vdata0[threadIdx.x + 2];
vdata1[threadIdx.x] = sum1 += vdata1[threadIdx.x + 2];
vdata2[threadIdx.x] = sum2 += vdata2[threadIdx.x + 2];
vdata0[threadIdx.x] = sum0 += vdata0[threadIdx.x + 1];
vdata1[threadIdx.x] = sum1 += vdata1[threadIdx.x + 1];
vdata2[threadIdx.x] = sum2 += vdata2[threadIdx.x + 1];
}
plus<unsigned int> op;
reduce<MERGE_THREADBLOCK_SIZE>(smem_tuple(data0, data1, data2), thrust::tie(sum0, sum1, sum2), threadIdx.x, thrust::make_tuple(op, op, op));
if(threadIdx.x == 0)
{
@@ -245,9 +198,9 @@ namespace bgfg
void calcDiffHistogram_gpu(PtrStepSzb prevFrame, PtrStepSzb curFrame,
unsigned int* hist0, unsigned int* hist1, unsigned int* hist2,
unsigned int* partialBuf0, unsigned int* partialBuf1, unsigned int* partialBuf2,
int cc, cudaStream_t stream)
bool cc20, cudaStream_t stream)
{
const int HISTOGRAM_WARP_COUNT = cc < 20 ? 4 : 6;
const int HISTOGRAM_WARP_COUNT = cc20 ? 6 : 4;
const int HISTOGRAM_THREADBLOCK_SIZE = HISTOGRAM_WARP_COUNT * WARP_SIZE;
calcPartialHistogram<PT, CT><<<PARTIAL_HISTOGRAM_COUNT, HISTOGRAM_THREADBLOCK_SIZE, 0, stream>>>(
@@ -261,10 +214,10 @@ namespace bgfg
cudaSafeCall( cudaDeviceSynchronize() );
}
template void calcDiffHistogram_gpu<uchar3, uchar3>(PtrStepSzb prevFrame, PtrStepSzb curFrame, unsigned int* hist0, unsigned int* hist1, unsigned int* hist2, unsigned int* partialBuf0, unsigned int* partialBuf1, unsigned int* partialBuf2, int cc, cudaStream_t stream);
template void calcDiffHistogram_gpu<uchar3, uchar4>(PtrStepSzb prevFrame, PtrStepSzb curFrame, unsigned int* hist0, unsigned int* hist1, unsigned int* hist2, unsigned int* partialBuf0, unsigned int* partialBuf1, unsigned int* partialBuf2, int cc, cudaStream_t stream);
template void calcDiffHistogram_gpu<uchar4, uchar3>(PtrStepSzb prevFrame, PtrStepSzb curFrame, unsigned int* hist0, unsigned int* hist1, unsigned int* hist2, unsigned int* partialBuf0, unsigned int* partialBuf1, unsigned int* partialBuf2, int cc, cudaStream_t stream);
template void calcDiffHistogram_gpu<uchar4, uchar4>(PtrStepSzb prevFrame, PtrStepSzb curFrame, unsigned int* hist0, unsigned int* hist1, unsigned int* hist2, unsigned int* partialBuf0, unsigned int* partialBuf1, unsigned int* partialBuf2, int cc, cudaStream_t stream);
template void calcDiffHistogram_gpu<uchar3, uchar3>(PtrStepSzb prevFrame, PtrStepSzb curFrame, unsigned int* hist0, unsigned int* hist1, unsigned int* hist2, unsigned int* partialBuf0, unsigned int* partialBuf1, unsigned int* partialBuf2, bool cc20, cudaStream_t stream);
template void calcDiffHistogram_gpu<uchar3, uchar4>(PtrStepSzb prevFrame, PtrStepSzb curFrame, unsigned int* hist0, unsigned int* hist1, unsigned int* hist2, unsigned int* partialBuf0, unsigned int* partialBuf1, unsigned int* partialBuf2, bool cc20, cudaStream_t stream);
template void calcDiffHistogram_gpu<uchar4, uchar3>(PtrStepSzb prevFrame, PtrStepSzb curFrame, unsigned int* hist0, unsigned int* hist1, unsigned int* hist2, unsigned int* partialBuf0, unsigned int* partialBuf1, unsigned int* partialBuf2, bool cc20, cudaStream_t stream);
template void calcDiffHistogram_gpu<uchar4, uchar4>(PtrStepSzb prevFrame, PtrStepSzb curFrame, unsigned int* hist0, unsigned int* hist1, unsigned int* hist2, unsigned int* partialBuf0, unsigned int* partialBuf1, unsigned int* partialBuf2, bool cc20, cudaStream_t stream);
/////////////////////////////////////////////////////////////////////////
// calcDiffThreshMask
@@ -845,4 +798,4 @@ namespace bgfg
template void updateBackgroundModel_gpu<uchar4, uchar4, uchar4>(PtrStepSzb prevFrame, PtrStepSzb curFrame, PtrStepSzb Ftd, PtrStepSzb Fbd, PtrStepSzb foreground, PtrStepSzb background, int deltaC, int deltaCC, float alpha1, float alpha2, float alpha3, int N1c, int N1cc, int N2c, int N2cc, float T, cudaStream_t stream);
}
#endif /* CUDA_DISABLER */
#endif /* CUDA_DISABLER */

View File

@@ -125,7 +125,7 @@ namespace bgfg
void calcDiffHistogram_gpu(cv::gpu::PtrStepSzb prevFrame, cv::gpu::PtrStepSzb curFrame,
unsigned int* hist0, unsigned int* hist1, unsigned int* hist2,
unsigned int* partialBuf0, unsigned int* partialBuf1, unsigned int* partialBuf2,
int cc, cudaStream_t stream);
bool cc20, cudaStream_t stream);
template <typename PT, typename CT>
void calcDiffThreshMask_gpu(cv::gpu::PtrStepSzb prevFrame, cv::gpu::PtrStepSzb curFrame, uchar3 bestThres, cv::gpu::PtrStepSzb changeMask, cudaStream_t stream);

View File

@@ -43,12 +43,10 @@
#if !defined CUDA_DISABLER
#include "thrust/device_ptr.h"
#include "thrust/remove.h"
#include "thrust/functional.h"
#include "internal_shared.hpp"
using namespace thrust;
#include <thrust/device_ptr.h>
#include <thrust/remove.h>
#include <thrust/functional.h>
#include "opencv2/gpu/device/common.hpp"
namespace cv { namespace gpu { namespace device { namespace globmotion {
@@ -61,10 +59,10 @@ int compactPoints(int N, float *points0, float *points1, const uchar *mask)
thrust::device_ptr<float2> dpoints1((float2*)points1);
thrust::device_ptr<const uchar> dmask(mask);
return thrust::remove_if(thrust::make_zip_iterator(thrust::make_tuple(dpoints0, dpoints1)),
return (int)(thrust::remove_if(thrust::make_zip_iterator(thrust::make_tuple(dpoints0, dpoints1)),
thrust::make_zip_iterator(thrust::make_tuple(dpoints0 + N, dpoints1 + N)),
dmask, thrust::not1(thrust::identity<uchar>()))
- make_zip_iterator(make_tuple(dpoints0, dpoints1));
- thrust::make_zip_iterator(make_tuple(dpoints0, dpoints1)));
}
@@ -117,4 +115,4 @@ void calcWobbleSuppressionMaps(
}}}}
#endif /* CUDA_DISABLER */
#endif /* CUDA_DISABLER */

View File

@@ -43,182 +43,112 @@
#if !defined CUDA_DISABLER
#include "internal_shared.hpp"
#include "opencv2/gpu/device/utility.hpp"
#include "opencv2/gpu/device/saturate_cast.hpp"
#include "opencv2/gpu/device/common.hpp"
#include "opencv2/gpu/device/functional.hpp"
#include "opencv2/gpu/device/emulation.hpp"
#include "opencv2/gpu/device/transform.hpp"
using namespace cv::gpu;
using namespace cv::gpu::device;
namespace hist
{
__global__ void histogram256Kernel(const uchar* src, int cols, int rows, size_t step, int* hist)
{
__shared__ int shist[256];
const int y = blockIdx.x * blockDim.y + threadIdx.y;
const int tid = threadIdx.y * blockDim.x + threadIdx.x;
shist[tid] = 0;
__syncthreads();
if (y < rows)
{
const unsigned int* rowPtr = (const unsigned int*) (src + y * step);
const int cols_4 = cols / 4;
for (int x = threadIdx.x; x < cols_4; x += blockDim.x)
{
unsigned int data = rowPtr[x];
Emulation::smem::atomicAdd(&shist[(data >> 0) & 0xFFU], 1);
Emulation::smem::atomicAdd(&shist[(data >> 8) & 0xFFU], 1);
Emulation::smem::atomicAdd(&shist[(data >> 16) & 0xFFU], 1);
Emulation::smem::atomicAdd(&shist[(data >> 24) & 0xFFU], 1);
}
if (cols % 4 != 0 && threadIdx.x == 0)
{
for (int x = cols_4 * 4; x < cols; ++x)
{
unsigned int data = ((const uchar*)rowPtr)[x];
Emulation::smem::atomicAdd(&shist[data], 1);
}
}
}
__syncthreads();
const int histVal = shist[tid];
if (histVal > 0)
::atomicAdd(hist + tid, histVal);
}
void histogram256(PtrStepSzb src, int* hist, cudaStream_t stream)
{
const dim3 block(32, 8);
const dim3 grid(divUp(src.rows, block.y));
histogram256Kernel<<<grid, block, 0, stream>>>(src.data, src.cols, src.rows, src.step, hist);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
}
/////////////////////////////////////////////////////////////////////////
namespace hist
{
__constant__ int c_lut[256];
struct EqualizeHist : unary_function<uchar, uchar>
{
float scale;
__host__ EqualizeHist(float _scale) : scale(_scale) {}
__device__ __forceinline__ uchar operator ()(uchar val) const
{
const int lut = c_lut[val];
return __float2int_rn(scale * lut);
}
};
}
namespace cv { namespace gpu { namespace device
{
#define UINT_BITS 32U
//Warps == subhistograms per threadblock
#define WARP_COUNT 6
//Threadblock size
#define HISTOGRAM256_THREADBLOCK_SIZE (WARP_COUNT * OPENCV_GPU_WARP_SIZE)
#define HISTOGRAM256_BIN_COUNT 256
//Shared memory per threadblock
#define HISTOGRAM256_THREADBLOCK_MEMORY (WARP_COUNT * HISTOGRAM256_BIN_COUNT)
#define PARTIAL_HISTOGRAM256_COUNT 240
#define MERGE_THREADBLOCK_SIZE 256
#define USE_SMEM_ATOMICS (defined (__CUDA_ARCH__) && (__CUDA_ARCH__ >= 120))
namespace hist
template <> struct TransformFunctorTraits<hist::EqualizeHist> : DefaultTransformFunctorTraits<hist::EqualizeHist>
{
#if (!USE_SMEM_ATOMICS)
#define TAG_MASK ( (1U << (UINT_BITS - OPENCV_GPU_LOG_WARP_SIZE)) - 1U )
__forceinline__ __device__ void addByte(volatile uint* s_WarpHist, uint data, uint threadTag)
{
uint count;
do
{
count = s_WarpHist[data] & TAG_MASK;
count = threadTag | (count + 1);
s_WarpHist[data] = count;
} while (s_WarpHist[data] != count);
}
#else
#define TAG_MASK 0xFFFFFFFFU
__forceinline__ __device__ void addByte(uint* s_WarpHist, uint data, uint threadTag)
{
atomicAdd(s_WarpHist + data, 1);
}
#endif
__forceinline__ __device__ void addWord(uint* s_WarpHist, uint data, uint tag, uint pos_x, uint cols)
{
uint x = pos_x << 2;
if (x + 0 < cols) addByte(s_WarpHist, (data >> 0) & 0xFFU, tag);
if (x + 1 < cols) addByte(s_WarpHist, (data >> 8) & 0xFFU, tag);
if (x + 2 < cols) addByte(s_WarpHist, (data >> 16) & 0xFFU, tag);
if (x + 3 < cols) addByte(s_WarpHist, (data >> 24) & 0xFFU, tag);
}
__global__ void histogram256(const PtrStep<uint> d_Data, uint* d_PartialHistograms, uint dataCount, uint cols)
{
//Per-warp subhistogram storage
__shared__ uint s_Hist[HISTOGRAM256_THREADBLOCK_MEMORY];
uint* s_WarpHist= s_Hist + (threadIdx.x >> OPENCV_GPU_LOG_WARP_SIZE) * HISTOGRAM256_BIN_COUNT;
//Clear shared memory storage for current threadblock before processing
#pragma unroll
for (uint i = 0; i < (HISTOGRAM256_THREADBLOCK_MEMORY / HISTOGRAM256_THREADBLOCK_SIZE); i++)
s_Hist[threadIdx.x + i * HISTOGRAM256_THREADBLOCK_SIZE] = 0;
//Cycle through the entire data set, update subhistograms for each warp
const uint tag = threadIdx.x << (UINT_BITS - OPENCV_GPU_LOG_WARP_SIZE);
__syncthreads();
const uint colsui = d_Data.step / sizeof(uint);
for(uint pos = blockIdx.x * blockDim.x + threadIdx.x; pos < dataCount; pos += blockDim.x * gridDim.x)
{
uint pos_y = pos / colsui;
uint pos_x = pos % colsui;
uint data = d_Data.ptr(pos_y)[pos_x];
addWord(s_WarpHist, data, tag, pos_x, cols);
}
//Merge per-warp histograms into per-block and write to global memory
__syncthreads();
for(uint bin = threadIdx.x; bin < HISTOGRAM256_BIN_COUNT; bin += HISTOGRAM256_THREADBLOCK_SIZE)
{
uint sum = 0;
for (uint i = 0; i < WARP_COUNT; i++)
sum += s_Hist[bin + i * HISTOGRAM256_BIN_COUNT] & TAG_MASK;
d_PartialHistograms[blockIdx.x * HISTOGRAM256_BIN_COUNT + bin] = sum;
}
}
////////////////////////////////////////////////////////////////////////////////
// Merge histogram256() output
// Run one threadblock per bin; each threadblock adds up the same bin counter
// from every partial histogram. Reads are uncoalesced, but mergeHistogram256
// takes only a fraction of total processing time
////////////////////////////////////////////////////////////////////////////////
__global__ void mergeHistogram256(const uint* d_PartialHistograms, int* d_Histogram)
{
uint sum = 0;
#pragma unroll
for (uint i = threadIdx.x; i < PARTIAL_HISTOGRAM256_COUNT; i += MERGE_THREADBLOCK_SIZE)
sum += d_PartialHistograms[blockIdx.x + i * HISTOGRAM256_BIN_COUNT];
__shared__ uint data[MERGE_THREADBLOCK_SIZE];
data[threadIdx.x] = sum;
for (uint stride = MERGE_THREADBLOCK_SIZE / 2; stride > 0; stride >>= 1)
{
__syncthreads();
if(threadIdx.x < stride)
data[threadIdx.x] += data[threadIdx.x + stride];
}
if(threadIdx.x == 0)
d_Histogram[blockIdx.x] = saturate_cast<int>(data[0]);
}
void histogram256_gpu(PtrStepSzb src, int* hist, uint* buf, cudaStream_t stream)
{
histogram256<<<PARTIAL_HISTOGRAM256_COUNT, HISTOGRAM256_THREADBLOCK_SIZE, 0, stream>>>(
PtrStepSz<uint>(src),
buf,
static_cast<uint>(src.rows * src.step / sizeof(uint)),
src.cols);
cudaSafeCall( cudaGetLastError() );
mergeHistogram256<<<HISTOGRAM256_BIN_COUNT, MERGE_THREADBLOCK_SIZE, 0, stream>>>(buf, hist);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
__constant__ int c_lut[256];
__global__ void equalizeHist(const PtrStepSzb src, PtrStepb dst)
{
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
if (x < src.cols && y < src.rows)
{
const uchar val = src.ptr(y)[x];
const int lut = c_lut[val];
dst.ptr(y)[x] = __float2int_rn(255.0f / (src.cols * src.rows) * lut);
}
}
void equalizeHist_gpu(PtrStepSzb src, PtrStepSzb dst, const int* lut, cudaStream_t stream)
{
dim3 block(16, 16);
dim3 grid(divUp(src.cols, block.x), divUp(src.rows, block.y));
enum { smart_shift = 4 };
};
}}}
namespace hist
{
void equalizeHist(PtrStepSzb src, PtrStepSzb dst, const int* lut, cudaStream_t stream)
{
if (stream == 0)
cudaSafeCall( cudaMemcpyToSymbol(c_lut, lut, 256 * sizeof(int), 0, cudaMemcpyDeviceToDevice) );
else
cudaSafeCall( cudaMemcpyToSymbolAsync(c_lut, lut, 256 * sizeof(int), 0, cudaMemcpyDeviceToDevice, stream) );
equalizeHist<<<grid, block, 0, stream>>>(src, dst);
cudaSafeCall( cudaGetLastError() );
const float scale = 255.0f / (src.cols * src.rows);
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
} // namespace hist
}}} // namespace cv { namespace gpu { namespace device
transform(src, dst, EqualizeHist(scale), WithOutMask(), stream);
}
}
#endif /* CUDA_DISABLER */
#endif /* CUDA_DISABLER */

View File

@@ -42,7 +42,10 @@
#if !defined CUDA_DISABLER
#include "internal_shared.hpp"
#include "opencv2/gpu/device/common.hpp"
#include "opencv2/gpu/device/reduce.hpp"
#include "opencv2/gpu/device/functional.hpp"
#include "opencv2/gpu/device/warp_shuffle.hpp"
namespace cv { namespace gpu { namespace device
{
@@ -226,29 +229,32 @@ namespace cv { namespace gpu { namespace device
template<int size>
__device__ float reduce_smem(volatile float* smem)
__device__ float reduce_smem(float* smem, float val)
{
unsigned int tid = threadIdx.x;
float sum = smem[tid];
float sum = val;
if (size >= 512) { if (tid < 256) smem[tid] = sum = sum + smem[tid + 256]; __syncthreads(); }
if (size >= 256) { if (tid < 128) smem[tid] = sum = sum + smem[tid + 128]; __syncthreads(); }
if (size >= 128) { if (tid < 64) smem[tid] = sum = sum + smem[tid + 64]; __syncthreads(); }
reduce<size>(smem, sum, tid, plus<float>());
if (tid < 32)
if (size == 32)
{
if (size >= 64) smem[tid] = sum = sum + smem[tid + 32];
if (size >= 32) smem[tid] = sum = sum + smem[tid + 16];
if (size >= 16) smem[tid] = sum = sum + smem[tid + 8];
if (size >= 8) smem[tid] = sum = sum + smem[tid + 4];
if (size >= 4) smem[tid] = sum = sum + smem[tid + 2];
if (size >= 2) smem[tid] = sum = sum + smem[tid + 1];
#if __CUDA_ARCH__ >= 300
return shfl(sum, 0);
#else
return smem[0];
#endif
}
else
{
#if __CUDA_ARCH__ >= 300
if (threadIdx.x == 0)
smem[0] = sum;
#endif
__syncthreads();
sum = smem[0];
__syncthreads();
return sum;
return smem[0];
}
}
@@ -272,19 +278,13 @@ namespace cv { namespace gpu { namespace device
if (threadIdx.x < block_hist_size)
elem = hist[0];
squares[threadIdx.x] = elem * elem;
__syncthreads();
float sum = reduce_smem<nthreads>(squares);
float sum = reduce_smem<nthreads>(squares, elem * elem);
float scale = 1.0f / (::sqrtf(sum) + 0.1f * block_hist_size);
elem = ::min(elem * scale, threshold);
__syncthreads();
squares[threadIdx.x] = elem * elem;
sum = reduce_smem<nthreads>(squares, elem * elem);
__syncthreads();
sum = reduce_smem<nthreads>(squares);
scale = 1.0f / (::sqrtf(sum) + 1e-3f);
if (threadIdx.x < block_hist_size)
@@ -330,65 +330,36 @@ namespace cv { namespace gpu { namespace device
// return confidence values not just positive location
template <int nthreads, // Number of threads per one histogram block
int nblocks> // Number of histogram block processed by single GPU thread block
int nblocks> // Number of histogram block processed by single GPU thread block
__global__ void compute_confidence_hists_kernel_many_blocks(const int img_win_width, const int img_block_width,
const int win_block_stride_x, const int win_block_stride_y,
const float* block_hists, const float* coefs,
float free_coef, float threshold, float* confidences)
{
const int win_x = threadIdx.z;
if (blockIdx.x * blockDim.z + win_x >= img_win_width)
return;
const int win_x = threadIdx.z;
if (blockIdx.x * blockDim.z + win_x >= img_win_width)
return;
const float* hist = block_hists + (blockIdx.y * win_block_stride_y * img_block_width +
blockIdx.x * win_block_stride_x * blockDim.z + win_x) *
cblock_hist_size;
const float* hist = block_hists + (blockIdx.y * win_block_stride_y * img_block_width +
blockIdx.x * win_block_stride_x * blockDim.z + win_x) *
cblock_hist_size;
float product = 0.f;
for (int i = threadIdx.x; i < cdescr_size; i += nthreads)
{
int offset_y = i / cdescr_width;
int offset_x = i - offset_y * cdescr_width;
product += coefs[i] * hist[offset_y * img_block_width * cblock_hist_size + offset_x];
}
float product = 0.f;
for (int i = threadIdx.x; i < cdescr_size; i += nthreads)
{
int offset_y = i / cdescr_width;
int offset_x = i - offset_y * cdescr_width;
product += coefs[i] * hist[offset_y * img_block_width * cblock_hist_size + offset_x];
}
__shared__ float products[nthreads * nblocks];
__shared__ float products[nthreads * nblocks];
const int tid = threadIdx.z * nthreads + threadIdx.x;
products[tid] = product;
const int tid = threadIdx.z * nthreads + threadIdx.x;
__syncthreads();
reduce<nthreads>(products, product, tid, plus<float>());
if (nthreads >= 512)
{
if (threadIdx.x < 256) products[tid] = product = product + products[tid + 256];
__syncthreads();
}
if (nthreads >= 256)
{
if (threadIdx.x < 128) products[tid] = product = product + products[tid + 128];
__syncthreads();
}
if (nthreads >= 128)
{
if (threadIdx.x < 64) products[tid] = product = product + products[tid + 64];
__syncthreads();
}
if (threadIdx.x < 32)
{
volatile float* smem = products;
if (nthreads >= 64) smem[tid] = product = product + smem[tid + 32];
if (nthreads >= 32) smem[tid] = product = product + smem[tid + 16];
if (nthreads >= 16) smem[tid] = product = product + smem[tid + 8];
if (nthreads >= 8) smem[tid] = product = product + smem[tid + 4];
if (nthreads >= 4) smem[tid] = product = product + smem[tid + 2];
if (nthreads >= 2) smem[tid] = product = product + smem[tid + 1];
}
if (threadIdx.x == 0)
confidences[blockIdx.y * img_win_width + blockIdx.x * blockDim.z + win_x]
= (float)(product + free_coef);
if (threadIdx.x == 0)
confidences[blockIdx.y * img_win_width + blockIdx.x * blockDim.z + win_x] = product + free_coef;
}
@@ -396,32 +367,32 @@ namespace cv { namespace gpu { namespace device
int win_stride_y, int win_stride_x, int height, int width, float* block_hists,
float* coefs, float free_coef, float threshold, float *confidences)
{
const int nthreads = 256;
const int nblocks = 1;
const int nthreads = 256;
const int nblocks = 1;
int win_block_stride_x = win_stride_x / block_stride_x;
int win_block_stride_y = win_stride_y / block_stride_y;
int img_win_width = (width - win_width + win_stride_x) / win_stride_x;
int img_win_height = (height - win_height + win_stride_y) / win_stride_y;
int win_block_stride_x = win_stride_x / block_stride_x;
int win_block_stride_y = win_stride_y / block_stride_y;
int img_win_width = (width - win_width + win_stride_x) / win_stride_x;
int img_win_height = (height - win_height + win_stride_y) / win_stride_y;
dim3 threads(nthreads, 1, nblocks);
dim3 grid(divUp(img_win_width, nblocks), img_win_height);
dim3 threads(nthreads, 1, nblocks);
dim3 grid(divUp(img_win_width, nblocks), img_win_height);
cudaSafeCall(cudaFuncSetCacheConfig(compute_confidence_hists_kernel_many_blocks<nthreads, nblocks>,
cudaFuncCachePreferL1));
cudaSafeCall(cudaFuncSetCacheConfig(compute_confidence_hists_kernel_many_blocks<nthreads, nblocks>,
cudaFuncCachePreferL1));
int img_block_width = (width - CELLS_PER_BLOCK_X * CELL_WIDTH + block_stride_x) /
block_stride_x;
compute_confidence_hists_kernel_many_blocks<nthreads, nblocks><<<grid, threads>>>(
img_win_width, img_block_width, win_block_stride_x, win_block_stride_y,
block_hists, coefs, free_coef, threshold, confidences);
cudaSafeCall(cudaThreadSynchronize());
int img_block_width = (width - CELLS_PER_BLOCK_X * CELL_WIDTH + block_stride_x) /
block_stride_x;
compute_confidence_hists_kernel_many_blocks<nthreads, nblocks><<<grid, threads>>>(
img_win_width, img_block_width, win_block_stride_x, win_block_stride_y,
block_hists, coefs, free_coef, threshold, confidences);
cudaSafeCall(cudaThreadSynchronize());
}
template <int nthreads, // Number of threads per one histogram block
int nblocks> // Number of histogram block processed by single GPU thread block
int nblocks> // Number of histogram block processed by single GPU thread block
__global__ void classify_hists_kernel_many_blocks(const int img_win_width, const int img_block_width,
const int win_block_stride_x, const int win_block_stride_y,
const float* block_hists, const float* coefs,
@@ -446,36 +417,8 @@ namespace cv { namespace gpu { namespace device
__shared__ float products[nthreads * nblocks];
const int tid = threadIdx.z * nthreads + threadIdx.x;
products[tid] = product;
__syncthreads();
if (nthreads >= 512)
{
if (threadIdx.x < 256) products[tid] = product = product + products[tid + 256];
__syncthreads();
}
if (nthreads >= 256)
{
if (threadIdx.x < 128) products[tid] = product = product + products[tid + 128];
__syncthreads();
}
if (nthreads >= 128)
{
if (threadIdx.x < 64) products[tid] = product = product + products[tid + 64];
__syncthreads();
}
if (threadIdx.x < 32)
{
volatile float* smem = products;
if (nthreads >= 64) smem[tid] = product = product + smem[tid + 32];
if (nthreads >= 32) smem[tid] = product = product + smem[tid + 16];
if (nthreads >= 16) smem[tid] = product = product + smem[tid + 8];
if (nthreads >= 8) smem[tid] = product = product + smem[tid + 4];
if (nthreads >= 4) smem[tid] = product = product + smem[tid + 2];
if (nthreads >= 2) smem[tid] = product = product + smem[tid + 1];
}
reduce<nthreads>(products, product, tid, plus<float>());
if (threadIdx.x == 0)
labels[blockIdx.y * img_win_width + blockIdx.x * blockDim.z + win_x] = (product + free_coef >= threshold);
@@ -868,4 +811,4 @@ namespace cv { namespace gpu { namespace device
}}} // namespace cv { namespace gpu { namespace device
#endif /* CUDA_DISABLER */
#endif /* CUDA_DISABLER */

File diff suppressed because it is too large Load Diff

View File

@@ -43,11 +43,11 @@
#if !defined CUDA_DISABLER
#include "internal_shared.hpp"
#include "opencv2/gpu/device/common.hpp"
#include "opencv2/gpu/device/vec_traits.hpp"
#include "opencv2/gpu/device/vec_math.hpp"
#include "opencv2/gpu/device/block.hpp"
#include "opencv2/gpu/device/functional.hpp"
#include "opencv2/gpu/device/reduce.hpp"
#include "opencv2/gpu/device/border_interpolate.hpp"
using namespace cv::gpu;
@@ -184,6 +184,85 @@ namespace cv { namespace gpu { namespace device
{
namespace imgproc
{
template <int cn> struct Unroll;
template <> struct Unroll<1>
{
template <int BLOCK_SIZE>
static __device__ __forceinline__ thrust::tuple<volatile float*, volatile float*> smem_tuple(float* smem)
{
return cv::gpu::device::smem_tuple(smem, smem + BLOCK_SIZE);
}
static __device__ __forceinline__ thrust::tuple<float&, float&> tie(float& val1, float& val2)
{
return thrust::tie(val1, val2);
}
static __device__ __forceinline__ const thrust::tuple<plus<float>, plus<float> > op()
{
plus<float> op;
return thrust::make_tuple(op, op);
}
};
template <> struct Unroll<2>
{
template <int BLOCK_SIZE>
static __device__ __forceinline__ thrust::tuple<volatile float*, volatile float*, volatile float*> smem_tuple(float* smem)
{
return cv::gpu::device::smem_tuple(smem, smem + BLOCK_SIZE, smem + 2 * BLOCK_SIZE);
}
static __device__ __forceinline__ thrust::tuple<float&, float&, float&> tie(float& val1, float2& val2)
{
return thrust::tie(val1, val2.x, val2.y);
}
static __device__ __forceinline__ const thrust::tuple<plus<float>, plus<float>, plus<float> > op()
{
plus<float> op;
return thrust::make_tuple(op, op, op);
}
};
template <> struct Unroll<3>
{
template <int BLOCK_SIZE>
static __device__ __forceinline__ thrust::tuple<volatile float*, volatile float*, volatile float*, volatile float*> smem_tuple(float* smem)
{
return cv::gpu::device::smem_tuple(smem, smem + BLOCK_SIZE, smem + 2 * BLOCK_SIZE, smem + 3 * BLOCK_SIZE);
}
static __device__ __forceinline__ thrust::tuple<float&, float&, float&, float&> tie(float& val1, float3& val2)
{
return thrust::tie(val1, val2.x, val2.y, val2.z);
}
static __device__ __forceinline__ const thrust::tuple<plus<float>, plus<float>, plus<float>, plus<float> > op()
{
plus<float> op;
return thrust::make_tuple(op, op, op, op);
}
};
template <> struct Unroll<4>
{
template <int BLOCK_SIZE>
static __device__ __forceinline__ thrust::tuple<volatile float*, volatile float*, volatile float*, volatile float*, volatile float*> smem_tuple(float* smem)
{
return cv::gpu::device::smem_tuple(smem, smem + BLOCK_SIZE, smem + 2 * BLOCK_SIZE, smem + 3 * BLOCK_SIZE, smem + 4 * BLOCK_SIZE);
}
static __device__ __forceinline__ thrust::tuple<float&, float&, float&, float&, float&> tie(float& val1, float4& val2)
{
return thrust::tie(val1, val2.x, val2.y, val2.z, val2.w);
}
static __device__ __forceinline__ const thrust::tuple<plus<float>, plus<float>, plus<float>, plus<float>, plus<float> > op()
{
plus<float> op;
return thrust::make_tuple(op, op, op, op, op);
}
};
__device__ __forceinline__ int calcDist(const uchar& a, const uchar& b) { return (a-b)*(a-b); }
__device__ __forceinline__ int calcDist(const uchar2& a, const uchar2& b) { return (a.x-b.x)*(a.x-b.x) + (a.y-b.y)*(a.y-b.y); }
__device__ __forceinline__ int calcDist(const uchar3& a, const uchar3& b) { return (a.x-b.x)*(a.x-b.x) + (a.y-b.y)*(a.y-b.y) + (a.z-b.z)*(a.z-b.z); }
@@ -340,30 +419,15 @@ namespace cv { namespace gpu { namespace device
sum = sum + weight * saturate_cast<sum_type>(src(sy + y, sx + x));
}
volatile __shared__ float cta_buffer[CTA_SIZE];
__shared__ float cta_buffer[CTA_SIZE * (VecTraits<T>::cn + 1)];
int tid = threadIdx.x;
reduce<CTA_SIZE>(Unroll<VecTraits<T>::cn>::template smem_tuple<CTA_SIZE>(cta_buffer),
Unroll<VecTraits<T>::cn>::tie(weights_sum, sum),
threadIdx.x,
Unroll<VecTraits<T>::cn>::op());
cta_buffer[tid] = weights_sum;
__syncthreads();
Block::reduce<CTA_SIZE>(cta_buffer, plus());
weights_sum = cta_buffer[0];
__syncthreads();
for(int n = 0; n < VecTraits<T>::cn; ++n)
{
cta_buffer[tid] = reinterpret_cast<float*>(&sum)[n];
__syncthreads();
Block::reduce<CTA_SIZE>(cta_buffer, plus());
reinterpret_cast<float*>(&sum)[n] = cta_buffer[0];
__syncthreads();
}
if (tid == 0)
dst = saturate_cast<T>(sum/weights_sum);
if (threadIdx.x == 0)
dst = saturate_cast<T>(sum / weights_sum);
}
__device__ __forceinline__ void operator()(PtrStepSz<T>& dst) const
@@ -503,4 +567,4 @@ namespace cv { namespace gpu { namespace device
}}}
#endif /* CUDA_DISABLER */
#endif /* CUDA_DISABLER */

View File

@@ -47,10 +47,11 @@
#if !defined CUDA_DISABLER
#include <thrust/device_ptr.h>
#include <thrust/sort.h>
#include "opencv2/gpu/device/common.hpp"
#include "opencv2/gpu/device/utility.hpp"
#include "opencv2/gpu/device/reduce.hpp"
#include "opencv2/gpu/device/functional.hpp"
namespace cv { namespace gpu { namespace device
@@ -75,9 +76,9 @@ namespace cv { namespace gpu { namespace device
__global__ void HarrisResponses(const PtrStepb img, const short2* loc_, float* response, const int npoints, const int blockSize, const float harris_k)
{
__shared__ int smem[8 * 32];
volatile int* srow = smem + threadIdx.y * blockDim.x;
__shared__ int smem0[8 * 32];
__shared__ int smem1[8 * 32];
__shared__ int smem2[8 * 32];
const int ptidx = blockIdx.x * blockDim.y + threadIdx.y;
@@ -109,9 +110,12 @@ namespace cv { namespace gpu { namespace device
c += Ix * Iy;
}
reduce<32>(srow, a, threadIdx.x, plus<volatile int>());
reduce<32>(srow, b, threadIdx.x, plus<volatile int>());
reduce<32>(srow, c, threadIdx.x, plus<volatile int>());
int* srow0 = smem0 + threadIdx.y * blockDim.x;
int* srow1 = smem1 + threadIdx.y * blockDim.x;
int* srow2 = smem2 + threadIdx.y * blockDim.x;
plus<int> op;
reduce<32>(smem_tuple(srow0, srow1, srow2), thrust::tie(a, b, c), threadIdx.x, thrust::make_tuple(op, op, op));
if (threadIdx.x == 0)
{
@@ -151,9 +155,13 @@ namespace cv { namespace gpu { namespace device
__global__ void IC_Angle(const PtrStepb image, const short2* loc_, float* angle, const int npoints, const int half_k)
{
__shared__ int smem[8 * 32];
__shared__ int smem0[8 * 32];
__shared__ int smem1[8 * 32];
volatile int* srow = smem + threadIdx.y * blockDim.x;
int* srow0 = smem0 + threadIdx.y * blockDim.x;
int* srow1 = smem1 + threadIdx.y * blockDim.x;
plus<int> op;
const int ptidx = blockIdx.x * blockDim.y + threadIdx.y;
@@ -167,7 +175,7 @@ namespace cv { namespace gpu { namespace device
for (int u = threadIdx.x - half_k; u <= half_k; u += blockDim.x)
m_10 += u * image(loc.y, loc.x + u);
reduce<32>(srow, m_10, threadIdx.x, plus<volatile int>());
reduce<32>(srow0, m_10, threadIdx.x, op);
for (int v = 1; v <= half_k; ++v)
{
@@ -185,8 +193,7 @@ namespace cv { namespace gpu { namespace device
m_sum += u * (val_plus + val_minus);
}
reduce<32>(srow, v_sum, threadIdx.x, plus<volatile int>());
reduce<32>(srow, m_sum, threadIdx.x, plus<volatile int>());
reduce<32>(smem_tuple(srow0, srow1), thrust::tie(v_sum, m_sum), threadIdx.x, thrust::make_tuple(op, op));
m_10 += m_sum;
m_01 += v * v_sum;
@@ -419,4 +426,4 @@ namespace cv { namespace gpu { namespace device
}
}}}
#endif /* CUDA_DISABLER */
#endif /* CUDA_DISABLER */

File diff suppressed because it is too large Load Diff

View File

@@ -69,7 +69,7 @@ namespace cv { namespace gpu { namespace device
template <template <typename> class Filter, template <typename> class B, typename T> struct RemapDispatcherStream
{
static void call(PtrStepSz<T> src, PtrStepSzf mapx, PtrStepSzf mapy, PtrStepSz<T> dst, const float* borderValue, cudaStream_t stream, int)
static void call(PtrStepSz<T> src, PtrStepSzf mapx, PtrStepSzf mapy, PtrStepSz<T> dst, const float* borderValue, cudaStream_t stream, bool)
{
typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type work_type;
@@ -87,7 +87,7 @@ namespace cv { namespace gpu { namespace device
template <template <typename> class Filter, template <typename> class B, typename T> struct RemapDispatcherNonStream
{
static void call(PtrStepSz<T> src, PtrStepSz<T> srcWhole, int xoff, int yoff, PtrStepSzf mapx, PtrStepSzf mapy, PtrStepSz<T> dst, const float* borderValue, int)
static void call(PtrStepSz<T> src, PtrStepSz<T> srcWhole, int xoff, int yoff, PtrStepSzf mapx, PtrStepSzf mapy, PtrStepSz<T> dst, const float* borderValue, bool)
{
(void)srcWhole;
(void)xoff;
@@ -124,10 +124,10 @@ namespace cv { namespace gpu { namespace device
template <template <typename> class Filter, template <typename> class B> struct RemapDispatcherNonStream<Filter, B, type> \
{ \
static void call(PtrStepSz< type > src, PtrStepSz< type > srcWhole, int xoff, int yoff, PtrStepSzf mapx, PtrStepSzf mapy, \
PtrStepSz< type > dst, const float* borderValue, int cc) \
PtrStepSz< type > dst, const float* borderValue, bool cc20) \
{ \
typedef typename TypeVec<float, VecTraits< type >::cn>::vec_type work_type; \
dim3 block(32, cc >= 20 ? 8 : 4); \
dim3 block(32, cc20 ? 8 : 4); \
dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y)); \
bindTexture(&tex_remap_ ## type , srcWhole); \
tex_remap_ ## type ##_reader texSrc(xoff, yoff); \
@@ -142,7 +142,7 @@ namespace cv { namespace gpu { namespace device
template <template <typename> class Filter> struct RemapDispatcherNonStream<Filter, BrdReplicate, type> \
{ \
static void call(PtrStepSz< type > src, PtrStepSz< type > srcWhole, int xoff, int yoff, PtrStepSzf mapx, PtrStepSzf mapy, \
PtrStepSz< type > dst, const float*, int) \
PtrStepSz< type > dst, const float*, bool) \
{ \
dim3 block(32, 8); \
dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y)); \
@@ -194,20 +194,20 @@ namespace cv { namespace gpu { namespace device
template <template <typename> class Filter, template <typename> class B, typename T> struct RemapDispatcher
{
static void call(PtrStepSz<T> src, PtrStepSz<T> srcWhole, int xoff, int yoff, PtrStepSzf mapx, PtrStepSzf mapy,
PtrStepSz<T> dst, const float* borderValue, cudaStream_t stream, int cc)
PtrStepSz<T> dst, const float* borderValue, cudaStream_t stream, bool cc20)
{
if (stream == 0)
RemapDispatcherNonStream<Filter, B, T>::call(src, srcWhole, xoff, yoff, mapx, mapy, dst, borderValue, cc);
RemapDispatcherNonStream<Filter, B, T>::call(src, srcWhole, xoff, yoff, mapx, mapy, dst, borderValue, cc20);
else
RemapDispatcherStream<Filter, B, T>::call(src, mapx, mapy, dst, borderValue, stream, cc);
RemapDispatcherStream<Filter, B, T>::call(src, mapx, mapy, dst, borderValue, stream, cc20);
}
};
template <typename T> void remap_gpu(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap,
PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc)
PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20)
{
typedef void (*caller_t)(PtrStepSz<T> src, PtrStepSz<T> srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap,
PtrStepSz<T> dst, const float* borderValue, cudaStream_t stream, int cc);
PtrStepSz<T> dst, const float* borderValue, cudaStream_t stream, bool cc20);
static const caller_t callers[3][5] =
{
@@ -235,40 +235,40 @@ namespace cv { namespace gpu { namespace device
};
callers[interpolation][borderMode](static_cast< PtrStepSz<T> >(src), static_cast< PtrStepSz<T> >(srcWhole), xoff, yoff, xmap, ymap,
static_cast< PtrStepSz<T> >(dst), borderValue, stream, cc);
static_cast< PtrStepSz<T> >(dst), borderValue, stream, cc20);
}
template void remap_gpu<uchar >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
//template void remap_gpu<uchar2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
template void remap_gpu<uchar3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
template void remap_gpu<uchar4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
template void remap_gpu<uchar >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void remap_gpu<uchar2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
template void remap_gpu<uchar3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
template void remap_gpu<uchar4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void remap_gpu<schar>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
//template void remap_gpu<char2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
//template void remap_gpu<char3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
//template void remap_gpu<char4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
//template void remap_gpu<schar>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void remap_gpu<char2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void remap_gpu<char3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void remap_gpu<char4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
template void remap_gpu<ushort >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
//template void remap_gpu<ushort2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
template void remap_gpu<ushort3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
template void remap_gpu<ushort4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
template void remap_gpu<ushort >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void remap_gpu<ushort2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
template void remap_gpu<ushort3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
template void remap_gpu<ushort4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
template void remap_gpu<short >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
//template void remap_gpu<short2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
template void remap_gpu<short3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
template void remap_gpu<short4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
template void remap_gpu<short >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void remap_gpu<short2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
template void remap_gpu<short3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
template void remap_gpu<short4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void remap_gpu<int >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
//template void remap_gpu<int2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
//template void remap_gpu<int3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
//template void remap_gpu<int4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
//template void remap_gpu<int >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void remap_gpu<int2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void remap_gpu<int3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void remap_gpu<int4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
template void remap_gpu<float >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
//template void remap_gpu<float2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
template void remap_gpu<float3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
template void remap_gpu<float4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
template void remap_gpu<float >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void remap_gpu<float2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
template void remap_gpu<float3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
template void remap_gpu<float4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
} // namespace imgproc
}}} // namespace cv { namespace gpu { namespace device
#endif /* CUDA_DISABLER */
#endif /* CUDA_DISABLER */

View File

@@ -42,9 +42,11 @@
#if !defined CUDA_DISABLER
#include "internal_shared.hpp"
#include "opencv2/gpu/device/common.hpp"
#include "opencv2/gpu/device/saturate_cast.hpp"
#include "opencv2/gpu/device/limits.hpp"
#include "opencv2/gpu/device/reduce.hpp"
#include "opencv2/gpu/device/functional.hpp"
namespace cv { namespace gpu { namespace device
{
@@ -297,28 +299,13 @@ namespace cv { namespace gpu { namespace device
}
extern __shared__ float smem[];
float* dline = smem + winsz * threadIdx.z;
dline[tid] = val;
__syncthreads();
if (winsz >= 256) { if (tid < 128) { dline[tid] += dline[tid + 128]; } __syncthreads(); }
if (winsz >= 128) { if (tid < 64) { dline[tid] += dline[tid + 64]; } __syncthreads(); }
volatile float* vdline = smem + winsz * threadIdx.z;
if (winsz >= 64) if (tid < 32) vdline[tid] += vdline[tid + 32];
if (winsz >= 32) if (tid < 16) vdline[tid] += vdline[tid + 16];
if (winsz >= 16) if (tid < 8) vdline[tid] += vdline[tid + 8];
if (winsz >= 8) if (tid < 4) vdline[tid] += vdline[tid + 4];
if (winsz >= 4) if (tid < 2) vdline[tid] += vdline[tid + 2];
if (winsz >= 2) if (tid < 1) vdline[tid] += vdline[tid + 1];
reduce<winsz>(smem + winsz * threadIdx.z, val, tid, plus<float>());
T* data_cost = (T*)ctemp + y_out * cmsg_step + x_out;
if (tid == 0)
data_cost[cdisp_step1 * d] = saturate_cast<T>(dline[0]);
data_cost[cdisp_step1 * d] = saturate_cast<T>(val);
}
}
@@ -496,26 +483,11 @@ namespace cv { namespace gpu { namespace device
}
extern __shared__ float smem[];
float* dline = smem + winsz * threadIdx.z;
dline[tid] = val;
__syncthreads();
if (winsz >= 256) { if (tid < 128) { dline[tid] += dline[tid + 128]; } __syncthreads(); }
if (winsz >= 128) { if (tid < 64) { dline[tid] += dline[tid + 64]; } __syncthreads(); }
volatile float* vdline = smem + winsz * threadIdx.z;
if (winsz >= 64) if (tid < 32) vdline[tid] += vdline[tid + 32];
if (winsz >= 32) if (tid < 16) vdline[tid] += vdline[tid + 16];
if (winsz >= 16) if (tid < 8) vdline[tid] += vdline[tid + 8];
if (winsz >= 8) if (tid < 4) vdline[tid] += vdline[tid + 4];
if (winsz >= 4) if (tid < 2) vdline[tid] += vdline[tid + 2];
if (winsz >= 2) if (tid < 1) vdline[tid] += vdline[tid + 1];
reduce<winsz>(smem + winsz * threadIdx.z, val, tid, plus<float>());
if (tid == 0)
data_cost[cdisp_step1 * d] = saturate_cast<T>(dline[0]);
data_cost[cdisp_step1 * d] = saturate_cast<T>(val);
}
}
@@ -889,4 +861,4 @@ namespace cv { namespace gpu { namespace device
} // namespace stereocsbp
}}} // namespace cv { namespace gpu { namespace device {
#endif /* CUDA_DISABLER */
#endif /* CUDA_DISABLER */

View File

@@ -47,13 +47,13 @@
#if !defined CUDA_DISABLER
#include "internal_shared.hpp"
#include "opencv2/gpu/device/common.hpp"
#include "opencv2/gpu/device/limits.hpp"
#include "opencv2/gpu/device/saturate_cast.hpp"
#include "opencv2/gpu/device/reduce.hpp"
#include "opencv2/gpu/device/utility.hpp"
#include "opencv2/gpu/device/functional.hpp"
#include "opencv2/gpu/device/filters.hpp"
#include <float.h>
namespace cv { namespace gpu { namespace device
{
@@ -568,7 +568,9 @@ namespace cv { namespace gpu { namespace device
float bestx = 0, besty = 0, best_mod = 0;
#if __CUDA_ARCH__ >= 200
#pragma unroll
#endif
for (int i = 0; i < 18; ++i)
{
const int dir = (i * 4 + threadIdx.y) * ORI_SEARCH_INC;
@@ -599,8 +601,9 @@ namespace cv { namespace gpu { namespace device
sumy += s_Y[threadIdx.x + 96];
}
device::reduce<32>(s_sumx + threadIdx.y * 32, sumx, threadIdx.x, plus<volatile float>());
device::reduce<32>(s_sumy + threadIdx.y * 32, sumy, threadIdx.x, plus<volatile float>());
plus<float> op;
device::reduce<32>(smem_tuple(s_sumx + threadIdx.y * 32, s_sumy + threadIdx.y * 32),
thrust::tie(sumx, sumy), threadIdx.x, thrust::make_tuple(op, op));
const float temp_mod = sumx * sumx + sumy * sumy;
if (temp_mod > best_mod)
@@ -638,7 +641,7 @@ namespace cv { namespace gpu { namespace device
kp_dir *= 180.0f / CV_PI_F;
kp_dir = 360.0f - kp_dir;
if (::fabsf(kp_dir - 360.f) < FLT_EPSILON)
if (::fabsf(kp_dir - 360.f) < numeric_limits<float>::epsilon())
kp_dir = 0.f;
featureDir[blockIdx.x] = kp_dir;
@@ -697,11 +700,6 @@ namespace cv { namespace gpu { namespace device
{
typedef uchar elem_type;
__device__ __forceinline__ WinReader(float centerX_, float centerY_, float win_offset_, float cos_dir_, float sin_dir_) :
centerX(centerX_), centerY(centerY_), win_offset(win_offset_), cos_dir(cos_dir_), sin_dir(sin_dir_)
{
}
__device__ __forceinline__ uchar operator ()(int i, int j) const
{
float pixel_x = centerX + (win_offset + j) * cos_dir + (win_offset + i) * sin_dir;
@@ -715,285 +713,215 @@ namespace cv { namespace gpu { namespace device
float win_offset;
float cos_dir;
float sin_dir;
int width;
int height;
};
__device__ void calc_dx_dy(float s_dx_bin[25], float s_dy_bin[25],
const float* featureX, const float* featureY, const float* featureSize, const float* featureDir)
__device__ void calc_dx_dy(const float* featureX, const float* featureY, const float* featureSize, const float* featureDir,
float& dx, float& dy)
{
__shared__ float s_PATCH[6][6];
__shared__ float s_PATCH[PATCH_SZ + 1][PATCH_SZ + 1];
const float centerX = featureX[blockIdx.x];
const float centerY = featureY[blockIdx.x];
const float size = featureSize[blockIdx.x];
float descriptor_dir = 360.0f - featureDir[blockIdx.x];
if (std::abs(descriptor_dir - 360.f) < FLT_EPSILON)
descriptor_dir = 0.f;
descriptor_dir *= (float)(CV_PI_F / 180.0f);
dx = dy = 0.0f;
/* The sampling intervals and wavelet sized for selecting an orientation
and building the keypoint descriptor are defined relative to 's' */
const float s = size * 1.2f / 9.0f;
WinReader win;
/* Extract a window of pixels around the keypoint of size 20s */
win.centerX = featureX[blockIdx.x];
win.centerY = featureY[blockIdx.x];
// The sampling intervals and wavelet sized for selecting an orientation
// and building the keypoint descriptor are defined relative to 's'
const float s = featureSize[blockIdx.x] * 1.2f / 9.0f;
// Extract a window of pixels around the keypoint of size 20s
const int win_size = (int)((PATCH_SZ + 1) * s);
float sin_dir;
float cos_dir;
sincosf(descriptor_dir, &sin_dir, &cos_dir);
win.width = win.height = win_size;
/* Nearest neighbour version (faster) */
const float win_offset = -(float)(win_size - 1) / 2;
// Compute sampling points
// since grids are 2D, need to compute xBlock and yBlock indices
const int xBlock = (blockIdx.y & 3); // blockIdx.y % 4
const int yBlock = (blockIdx.y >> 2); // floor(blockIdx.y/4)
const int xIndex = xBlock * 5 + threadIdx.x;
const int yIndex = yBlock * 5 + threadIdx.y;
const float icoo = ((float)yIndex / (PATCH_SZ + 1)) * win_size;
const float jcoo = ((float)xIndex / (PATCH_SZ + 1)) * win_size;
LinearFilter<WinReader> filter(WinReader(centerX, centerY, win_offset, cos_dir, sin_dir));
s_PATCH[threadIdx.y][threadIdx.x] = filter(icoo, jcoo);
__syncthreads();
if (threadIdx.x < 5 && threadIdx.y < 5)
{
const int tid = threadIdx.y * 5 + threadIdx.x;
const float dw = c_DW[yIndex * PATCH_SZ + xIndex];
const float vx = (s_PATCH[threadIdx.y ][threadIdx.x + 1] - s_PATCH[threadIdx.y][threadIdx.x] + s_PATCH[threadIdx.y + 1][threadIdx.x + 1] - s_PATCH[threadIdx.y + 1][threadIdx.x ]) * dw;
const float vy = (s_PATCH[threadIdx.y + 1][threadIdx.x ] - s_PATCH[threadIdx.y][threadIdx.x] + s_PATCH[threadIdx.y + 1][threadIdx.x + 1] - s_PATCH[threadIdx.y ][threadIdx.x + 1]) * dw;
s_dx_bin[tid] = vx;
s_dy_bin[tid] = vy;
}
}
__device__ void reduce_sum25(volatile float* sdata1, volatile float* sdata2, volatile float* sdata3, volatile float* sdata4, int tid)
{
// first step is to reduce from 25 to 16
if (tid < 9) // use 9 threads
{
sdata1[tid] += sdata1[tid + 16];
sdata2[tid] += sdata2[tid + 16];
sdata3[tid] += sdata3[tid + 16];
sdata4[tid] += sdata4[tid + 16];
}
// sum (reduce) from 16 to 1 (unrolled - aligned to a half-warp)
if (tid < 8)
{
sdata1[tid] += sdata1[tid + 8];
sdata1[tid] += sdata1[tid + 4];
sdata1[tid] += sdata1[tid + 2];
sdata1[tid] += sdata1[tid + 1];
sdata2[tid] += sdata2[tid + 8];
sdata2[tid] += sdata2[tid + 4];
sdata2[tid] += sdata2[tid + 2];
sdata2[tid] += sdata2[tid + 1];
sdata3[tid] += sdata3[tid + 8];
sdata3[tid] += sdata3[tid + 4];
sdata3[tid] += sdata3[tid + 2];
sdata3[tid] += sdata3[tid + 1];
sdata4[tid] += sdata4[tid + 8];
sdata4[tid] += sdata4[tid + 4];
sdata4[tid] += sdata4[tid + 2];
sdata4[tid] += sdata4[tid + 1];
}
}
__global__ void compute_descriptors64(PtrStepf descriptors, const float* featureX, const float* featureY, const float* featureSize, const float* featureDir)
{
// 2 floats (dx,dy) for each thread (5x5 sample points in each sub-region)
__shared__ float sdx[25];
__shared__ float sdy[25];
__shared__ float sdxabs[25];
__shared__ float sdyabs[25];
calc_dx_dy(sdx, sdy, featureX, featureY, featureSize, featureDir);
__syncthreads();
// Nearest neighbour version (faster)
win.win_offset = -(win_size - 1.0f) / 2.0f;
float descriptor_dir = 360.0f - featureDir[blockIdx.x];
if (::fabsf(descriptor_dir - 360.f) < numeric_limits<float>::epsilon())
descriptor_dir = 0.f;
descriptor_dir *= CV_PI_F / 180.0f;
sincosf(descriptor_dir, &win.sin_dir, &win.cos_dir);
const int tid = threadIdx.y * blockDim.x + threadIdx.x;
if (tid < 25)
const int xLoadInd = tid % (PATCH_SZ + 1);
const int yLoadInd = tid / (PATCH_SZ + 1);
if (yLoadInd < (PATCH_SZ + 1))
{
sdxabs[tid] = ::fabs(sdx[tid]); // |dx| array
sdyabs[tid] = ::fabs(sdy[tid]); // |dy| array
__syncthreads();
reduce_sum25(sdx, sdy, sdxabs, sdyabs, tid);
__syncthreads();
float* descriptors_block = descriptors.ptr(blockIdx.x) + (blockIdx.y << 2);
// write dx, dy, |dx|, |dy|
if (tid == 0)
if (s > 1)
{
descriptors_block[0] = sdx[0];
descriptors_block[1] = sdy[0];
descriptors_block[2] = sdxabs[0];
descriptors_block[3] = sdyabs[0];
AreaFilter<WinReader> filter(win, s, s);
s_PATCH[yLoadInd][xLoadInd] = filter(yLoadInd, xLoadInd);
}
else
{
LinearFilter<WinReader> filter(win);
s_PATCH[yLoadInd][xLoadInd] = filter(yLoadInd * s, xLoadInd * s);
}
}
__syncthreads();
const int xPatchInd = threadIdx.x % 5;
const int yPatchInd = threadIdx.x / 5;
if (yPatchInd < 5)
{
const int xBlockInd = threadIdx.y % 4;
const int yBlockInd = threadIdx.y / 4;
const int xInd = xBlockInd * 5 + xPatchInd;
const int yInd = yBlockInd * 5 + yPatchInd;
const float dw = c_DW[yInd * PATCH_SZ + xInd];
dx = (s_PATCH[yInd ][xInd + 1] - s_PATCH[yInd][xInd] + s_PATCH[yInd + 1][xInd + 1] - s_PATCH[yInd + 1][xInd ]) * dw;
dy = (s_PATCH[yInd + 1][xInd ] - s_PATCH[yInd][xInd] + s_PATCH[yInd + 1][xInd + 1] - s_PATCH[yInd ][xInd + 1]) * dw;
}
}
__global__ void compute_descriptors128(PtrStepf descriptors, const float* featureX, const float* featureY, const float* featureSize, const float* featureDir)
__global__ void compute_descriptors_64(PtrStep<float4> descriptors, const float* featureX, const float* featureY, const float* featureSize, const float* featureDir)
{
// 2 floats (dx,dy) for each thread (5x5 sample points in each sub-region)
__shared__ float sdx[25];
__shared__ float sdy[25];
__shared__ float smem[32 * 16];
// sum (reduce) 5x5 area response
__shared__ float sd1[25];
__shared__ float sd2[25];
__shared__ float sdabs1[25];
__shared__ float sdabs2[25];
float* sRow = smem + threadIdx.y * 32;
calc_dx_dy(sdx, sdy, featureX, featureY, featureSize, featureDir);
__syncthreads();
float dx, dy;
calc_dx_dy(featureX, featureY, featureSize, featureDir, dx, dy);
const int tid = threadIdx.y * blockDim.x + threadIdx.x;
float dxabs = ::fabsf(dx);
float dyabs = ::fabsf(dy);
if (tid < 25)
plus<float> op;
reduce<32>(sRow, dx, threadIdx.x, op);
reduce<32>(sRow, dy, threadIdx.x, op);
reduce<32>(sRow, dxabs, threadIdx.x, op);
reduce<32>(sRow, dyabs, threadIdx.x, op);
float4* descriptors_block = descriptors.ptr(blockIdx.x) + threadIdx.y;
// write dx, dy, |dx|, |dy|
if (threadIdx.x == 0)
*descriptors_block = make_float4(dx, dy, dxabs, dyabs);
}
__global__ void compute_descriptors_128(PtrStep<float4> descriptors, const float* featureX, const float* featureY, const float* featureSize, const float* featureDir)
{
__shared__ float smem[32 * 16];
float* sRow = smem + threadIdx.y * 32;
float dx, dy;
calc_dx_dy(featureX, featureY, featureSize, featureDir, dx, dy);
float4* descriptors_block = descriptors.ptr(blockIdx.x) + threadIdx.y * 2;
plus<float> op;
float d1 = 0.0f;
float d2 = 0.0f;
float abs1 = 0.0f;
float abs2 = 0.0f;
if (dy >= 0)
{
if (sdy[tid] >= 0)
{
sd1[tid] = sdx[tid];
sdabs1[tid] = ::fabs(sdx[tid]);
sd2[tid] = 0;
sdabs2[tid] = 0;
}
else
{
sd1[tid] = 0;
sdabs1[tid] = 0;
sd2[tid] = sdx[tid];
sdabs2[tid] = ::fabs(sdx[tid]);
}
__syncthreads();
reduce_sum25(sd1, sd2, sdabs1, sdabs2, tid);
__syncthreads();
float* descriptors_block = descriptors.ptr(blockIdx.x) + (blockIdx.y << 3);
// write dx (dy >= 0), |dx| (dy >= 0), dx (dy < 0), |dx| (dy < 0)
if (tid == 0)
{
descriptors_block[0] = sd1[0];
descriptors_block[1] = sdabs1[0];
descriptors_block[2] = sd2[0];
descriptors_block[3] = sdabs2[0];
}
__syncthreads();
if (sdx[tid] >= 0)
{
sd1[tid] = sdy[tid];
sdabs1[tid] = ::fabs(sdy[tid]);
sd2[tid] = 0;
sdabs2[tid] = 0;
}
else
{
sd1[tid] = 0;
sdabs1[tid] = 0;
sd2[tid] = sdy[tid];
sdabs2[tid] = ::fabs(sdy[tid]);
}
__syncthreads();
reduce_sum25(sd1, sd2, sdabs1, sdabs2, tid);
__syncthreads();
// write dy (dx >= 0), |dy| (dx >= 0), dy (dx < 0), |dy| (dx < 0)
if (tid == 0)
{
descriptors_block[4] = sd1[0];
descriptors_block[5] = sdabs1[0];
descriptors_block[6] = sd2[0];
descriptors_block[7] = sdabs2[0];
}
d1 = dx;
abs1 = ::fabsf(dx);
}
else
{
d2 = dx;
abs2 = ::fabsf(dx);
}
reduce<32>(sRow, d1, threadIdx.x, op);
reduce<32>(sRow, d2, threadIdx.x, op);
reduce<32>(sRow, abs1, threadIdx.x, op);
reduce<32>(sRow, abs2, threadIdx.x, op);
// write dx (dy >= 0), |dx| (dy >= 0), dx (dy < 0), |dx| (dy < 0)
if (threadIdx.x == 0)
descriptors_block[0] = make_float4(d1, abs1, d2, abs2);
if (dx >= 0)
{
d1 = dy;
abs1 = ::fabsf(dy);
d2 = 0.0f;
abs2 = 0.0f;
}
else
{
d1 = 0.0f;
abs1 = 0.0f;
d2 = dy;
abs2 = ::fabsf(dy);
}
reduce<32>(sRow, d1, threadIdx.x, op);
reduce<32>(sRow, d2, threadIdx.x, op);
reduce<32>(sRow, abs1, threadIdx.x, op);
reduce<32>(sRow, abs2, threadIdx.x, op);
// write dy (dx >= 0), |dy| (dx >= 0), dy (dx < 0), |dy| (dx < 0)
if (threadIdx.x == 0)
descriptors_block[1] = make_float4(d1, abs1, d2, abs2);
}
template <int BLOCK_DIM_X> __global__ void normalize_descriptors(PtrStepf descriptors)
{
__shared__ float smem[BLOCK_DIM_X];
__shared__ float s_len;
// no need for thread ID
float* descriptor_base = descriptors.ptr(blockIdx.x);
// read in the unnormalized descriptor values (squared)
__shared__ float sqDesc[BLOCK_DIM_X];
const float lookup = descriptor_base[threadIdx.x];
sqDesc[threadIdx.x] = lookup * lookup;
__syncthreads();
const float val = descriptor_base[threadIdx.x];
if (BLOCK_DIM_X >= 128)
{
if (threadIdx.x < 64)
sqDesc[threadIdx.x] += sqDesc[threadIdx.x + 64];
__syncthreads();
}
float len = val * val;
reduce<BLOCK_DIM_X>(smem, len, threadIdx.x, plus<float>());
// reduction to get total
if (threadIdx.x < 32)
{
volatile float* smem = sqDesc;
smem[threadIdx.x] += smem[threadIdx.x + 32];
smem[threadIdx.x] += smem[threadIdx.x + 16];
smem[threadIdx.x] += smem[threadIdx.x + 8];
smem[threadIdx.x] += smem[threadIdx.x + 4];
smem[threadIdx.x] += smem[threadIdx.x + 2];
smem[threadIdx.x] += smem[threadIdx.x + 1];
}
// compute length (square root)
__shared__ float len;
if (threadIdx.x == 0)
{
len = sqrtf(sqDesc[0]);
}
s_len = ::sqrtf(len);
__syncthreads();
// normalize and store in output
descriptor_base[threadIdx.x] = lookup / len;
descriptor_base[threadIdx.x] = val / s_len;
}
void compute_descriptors_gpu(const PtrStepSzf& descriptors,
const float* featureX, const float* featureY, const float* featureSize, const float* featureDir, int nFeatures)
void compute_descriptors_gpu(PtrStepSz<float4> descriptors, const float* featureX, const float* featureY, const float* featureSize, const float* featureDir, int nFeatures)
{
// compute unnormalized descriptors, then normalize them - odd indexing since grid must be 2D
if (descriptors.cols == 64)
{
compute_descriptors64<<<dim3(nFeatures, 16, 1), dim3(6, 6, 1)>>>(descriptors, featureX, featureY, featureSize, featureDir);
compute_descriptors_64<<<nFeatures, dim3(32, 16)>>>(descriptors, featureX, featureY, featureSize, featureDir);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaDeviceSynchronize() );
normalize_descriptors<64><<<dim3(nFeatures, 1, 1), dim3(64, 1, 1)>>>(descriptors);
normalize_descriptors<64><<<nFeatures, 64>>>((PtrStepSzf) descriptors);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaDeviceSynchronize() );
}
else
{
compute_descriptors128<<<dim3(nFeatures, 16, 1), dim3(6, 6, 1)>>>(descriptors, featureX, featureY, featureSize, featureDir);
compute_descriptors_128<<<nFeatures, dim3(32, 16)>>>(descriptors, featureX, featureY, featureSize, featureDir);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaDeviceSynchronize() );
normalize_descriptors<128><<<dim3(nFeatures, 1, 1), dim3(128, 1, 1)>>>(descriptors);
normalize_descriptors<128><<<nFeatures, 128>>>((PtrStepSzf) descriptors);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaDeviceSynchronize() );

View File

@@ -140,7 +140,7 @@ namespace cv { namespace gpu { namespace device
template <class Transform, template <typename> class Filter, template <typename> class B, typename T> struct WarpDispatcherStream
{
static void call(PtrStepSz<T> src, PtrStepSz<T> dst, const float* borderValue, cudaStream_t stream, int)
static void call(PtrStepSz<T> src, PtrStepSz<T> dst, const float* borderValue, cudaStream_t stream, bool)
{
typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type work_type;
@@ -158,7 +158,7 @@ namespace cv { namespace gpu { namespace device
template <class Transform, template <typename> class Filter, template <typename> class B, typename T> struct WarpDispatcherNonStream
{
static void call(PtrStepSz<T> src, PtrStepSz<T> srcWhole, int xoff, int yoff, PtrStepSz<T> dst, const float* borderValue, int)
static void call(PtrStepSz<T> src, PtrStepSz<T> srcWhole, int xoff, int yoff, PtrStepSz<T> dst, const float* borderValue, bool)
{
(void)xoff;
(void)yoff;
@@ -195,10 +195,10 @@ namespace cv { namespace gpu { namespace device
}; \
template <class Transform, template <typename> class Filter, template <typename> class B> struct WarpDispatcherNonStream<Transform, Filter, B, type> \
{ \
static void call(PtrStepSz< type > src, PtrStepSz< type > srcWhole, int xoff, int yoff, PtrStepSz< type > dst, const float* borderValue, int cc) \
static void call(PtrStepSz< type > src, PtrStepSz< type > srcWhole, int xoff, int yoff, PtrStepSz< type > dst, const float* borderValue, bool cc20) \
{ \
typedef typename TypeVec<float, VecTraits< type >::cn>::vec_type work_type; \
dim3 block(32, cc >= 20 ? 8 : 4); \
dim3 block(32, cc20 ? 8 : 4); \
dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y)); \
bindTexture(&tex_warp_ ## type , srcWhole); \
tex_warp_ ## type ##_reader texSrc(xoff, yoff); \
@@ -212,7 +212,7 @@ namespace cv { namespace gpu { namespace device
}; \
template <class Transform, template <typename> class Filter> struct WarpDispatcherNonStream<Transform, Filter, BrdReplicate, type> \
{ \
static void call(PtrStepSz< type > src, PtrStepSz< type > srcWhole, int xoff, int yoff, PtrStepSz< type > dst, const float*, int) \
static void call(PtrStepSz< type > src, PtrStepSz< type > srcWhole, int xoff, int yoff, PtrStepSz< type > dst, const float*, bool) \
{ \
dim3 block(32, 8); \
dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y)); \
@@ -263,20 +263,20 @@ namespace cv { namespace gpu { namespace device
template <class Transform, template <typename> class Filter, template <typename> class B, typename T> struct WarpDispatcher
{
static void call(PtrStepSz<T> src, PtrStepSz<T> srcWhole, int xoff, int yoff, PtrStepSz<T> dst, const float* borderValue, cudaStream_t stream, int cc)
static void call(PtrStepSz<T> src, PtrStepSz<T> srcWhole, int xoff, int yoff, PtrStepSz<T> dst, const float* borderValue, cudaStream_t stream, bool cc20)
{
if (stream == 0)
WarpDispatcherNonStream<Transform, Filter, B, T>::call(src, srcWhole, xoff, yoff, dst, borderValue, cc);
WarpDispatcherNonStream<Transform, Filter, B, T>::call(src, srcWhole, xoff, yoff, dst, borderValue, cc20);
else
WarpDispatcherStream<Transform, Filter, B, T>::call(src, dst, borderValue, stream, cc);
WarpDispatcherStream<Transform, Filter, B, T>::call(src, dst, borderValue, stream, cc20);
}
};
template <class Transform, typename T>
void warp_caller(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzb dst, int interpolation,
int borderMode, const float* borderValue, cudaStream_t stream, int cc)
int borderMode, const float* borderValue, cudaStream_t stream, bool cc20)
{
typedef void (*func_t)(PtrStepSz<T> src, PtrStepSz<T> srcWhole, int xoff, int yoff, PtrStepSz<T> dst, const float* borderValue, cudaStream_t stream, int cc);
typedef void (*func_t)(PtrStepSz<T> src, PtrStepSz<T> srcWhole, int xoff, int yoff, PtrStepSz<T> dst, const float* borderValue, cudaStream_t stream, bool cc20);
static const func_t funcs[3][5] =
{
@@ -304,86 +304,86 @@ namespace cv { namespace gpu { namespace device
};
funcs[interpolation][borderMode](static_cast< PtrStepSz<T> >(src), static_cast< PtrStepSz<T> >(srcWhole), xoff, yoff,
static_cast< PtrStepSz<T> >(dst), borderValue, stream, cc);
static_cast< PtrStepSz<T> >(dst), borderValue, stream, cc20);
}
template <typename T> void warpAffine_gpu(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation,
int borderMode, const float* borderValue, cudaStream_t stream, int cc)
int borderMode, const float* borderValue, cudaStream_t stream, bool cc20)
{
cudaSafeCall( cudaMemcpyToSymbol(c_warpMat, coeffs, 2 * 3 * sizeof(float)) );
warp_caller<AffineTransform, T>(src, srcWhole, xoff, yoff, dst, interpolation, borderMode, borderValue, stream, cc);
warp_caller<AffineTransform, T>(src, srcWhole, xoff, yoff, dst, interpolation, borderMode, borderValue, stream, cc20);
}
template void warpAffine_gpu<uchar >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
//template void warpAffine_gpu<uchar2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
template void warpAffine_gpu<uchar3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
template void warpAffine_gpu<uchar4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
template void warpAffine_gpu<uchar >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void warpAffine_gpu<uchar2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
template void warpAffine_gpu<uchar3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
template void warpAffine_gpu<uchar4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void warpAffine_gpu<schar>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
//template void warpAffine_gpu<char2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
//template void warpAffine_gpu<char3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
//template void warpAffine_gpu<char4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
//template void warpAffine_gpu<schar>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void warpAffine_gpu<char2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void warpAffine_gpu<char3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void warpAffine_gpu<char4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
template void warpAffine_gpu<ushort >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
//template void warpAffine_gpu<ushort2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
template void warpAffine_gpu<ushort3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
template void warpAffine_gpu<ushort4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
template void warpAffine_gpu<ushort >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void warpAffine_gpu<ushort2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
template void warpAffine_gpu<ushort3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
template void warpAffine_gpu<ushort4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
template void warpAffine_gpu<short >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
//template void warpAffine_gpu<short2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
template void warpAffine_gpu<short3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
template void warpAffine_gpu<short4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
template void warpAffine_gpu<short >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void warpAffine_gpu<short2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
template void warpAffine_gpu<short3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
template void warpAffine_gpu<short4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void warpAffine_gpu<int >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
//template void warpAffine_gpu<int2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
//template void warpAffine_gpu<int3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
//template void warpAffine_gpu<int4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
//template void warpAffine_gpu<int >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void warpAffine_gpu<int2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void warpAffine_gpu<int3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void warpAffine_gpu<int4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
template void warpAffine_gpu<float >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
//template void warpAffine_gpu<float2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
template void warpAffine_gpu<float3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
template void warpAffine_gpu<float4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
template void warpAffine_gpu<float >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void warpAffine_gpu<float2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
template void warpAffine_gpu<float3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
template void warpAffine_gpu<float4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
template <typename T> void warpPerspective_gpu(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation,
int borderMode, const float* borderValue, cudaStream_t stream, int cc)
int borderMode, const float* borderValue, cudaStream_t stream, bool cc20)
{
cudaSafeCall( cudaMemcpyToSymbol(c_warpMat, coeffs, 3 * 3 * sizeof(float)) );
warp_caller<PerspectiveTransform, T>(src, srcWhole, xoff, yoff, dst, interpolation, borderMode, borderValue, stream, cc);
warp_caller<PerspectiveTransform, T>(src, srcWhole, xoff, yoff, dst, interpolation, borderMode, borderValue, stream, cc20);
}
template void warpPerspective_gpu<uchar >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
//template void warpPerspective_gpu<uchar2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
template void warpPerspective_gpu<uchar3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
template void warpPerspective_gpu<uchar4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
template void warpPerspective_gpu<uchar >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void warpPerspective_gpu<uchar2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
template void warpPerspective_gpu<uchar3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
template void warpPerspective_gpu<uchar4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void warpPerspective_gpu<schar>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
//template void warpPerspective_gpu<char2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
//template void warpPerspective_gpu<char3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
//template void warpPerspective_gpu<char4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
//template void warpPerspective_gpu<schar>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void warpPerspective_gpu<char2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void warpPerspective_gpu<char3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void warpPerspective_gpu<char4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
template void warpPerspective_gpu<ushort >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
//template void warpPerspective_gpu<ushort2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
template void warpPerspective_gpu<ushort3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
template void warpPerspective_gpu<ushort4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
template void warpPerspective_gpu<ushort >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void warpPerspective_gpu<ushort2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
template void warpPerspective_gpu<ushort3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
template void warpPerspective_gpu<ushort4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
template void warpPerspective_gpu<short >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
//template void warpPerspective_gpu<short2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
template void warpPerspective_gpu<short3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
template void warpPerspective_gpu<short4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
template void warpPerspective_gpu<short >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void warpPerspective_gpu<short2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
template void warpPerspective_gpu<short3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
template void warpPerspective_gpu<short4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void warpPerspective_gpu<int >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
//template void warpPerspective_gpu<int2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
//template void warpPerspective_gpu<int3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
//template void warpPerspective_gpu<int4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
//template void warpPerspective_gpu<int >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void warpPerspective_gpu<int2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void warpPerspective_gpu<int3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void warpPerspective_gpu<int4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
template void warpPerspective_gpu<float >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
//template void warpPerspective_gpu<float2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
template void warpPerspective_gpu<float3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
template void warpPerspective_gpu<float4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
template void warpPerspective_gpu<float >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void warpPerspective_gpu<float2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
template void warpPerspective_gpu<float3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
template void warpPerspective_gpu<float4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
} // namespace imgproc
}}} // namespace cv { namespace gpu { namespace device
#endif /* CUDA_DISABLER */
#endif /* CUDA_DISABLER */

File diff suppressed because it is too large Load Diff

View File

@@ -336,7 +336,7 @@ namespace
{
void calcDiffHistogram(const cv::gpu::GpuMat& prevFrame, const cv::gpu::GpuMat& curFrame, cv::gpu::GpuMat& hist, cv::gpu::GpuMat& histBuf)
{
typedef void (*func_t)(cv::gpu::PtrStepSzb prevFrame, cv::gpu::PtrStepSzb curFrame, unsigned int* hist0, unsigned int* hist1, unsigned int* hist2, unsigned int* partialBuf0, unsigned int* partialBuf1, unsigned int* partialBuf2, int cc, cudaStream_t stream);
typedef void (*func_t)(cv::gpu::PtrStepSzb prevFrame, cv::gpu::PtrStepSzb curFrame, unsigned int* hist0, unsigned int* hist1, unsigned int* hist2, unsigned int* partialBuf0, unsigned int* partialBuf1, unsigned int* partialBuf2, bool cc20, cudaStream_t stream);
static const func_t funcs[4][4] =
{
{0,0,0,0},
@@ -348,14 +348,11 @@ namespace
hist.create(3, 256, CV_32SC1);
histBuf.create(3, bgfg::PARTIAL_HISTOGRAM_COUNT * bgfg::HISTOGRAM_BIN_COUNT, CV_32SC1);
cv::gpu::DeviceInfo devInfo;
int cc = devInfo.majorVersion() * 10 + devInfo.minorVersion();
funcs[prevFrame.channels() - 1][curFrame.channels() - 1](
prevFrame, curFrame,
hist.ptr<unsigned int>(0), hist.ptr<unsigned int>(1), hist.ptr<unsigned int>(2),
histBuf.ptr<unsigned int>(0), histBuf.ptr<unsigned int>(1), histBuf.ptr<unsigned int>(2),
cc, 0);
cv::gpu::deviceSupports(cv::gpu::FEATURE_SET_COMPUTE_20), 0);
}
void calcRelativeVariance(unsigned int hist[3 * 256], double relativeVariance[3][bgfg::HISTOGRAM_BIN_COUNT])
@@ -526,15 +523,15 @@ namespace
size_t total = all_contours.size();
_contours.create(total, 1, 0, -1, true);
_contours.create((int) total, 1, 0, -1, true);
cv::SeqIterator<CvSeq*> it = all_contours.begin();
for (size_t i = 0; i < total; ++i, ++it)
{
CvSeq* c = *it;
((CvContour*)c)->color = (int)i;
_contours.create((int)c->total, 1, CV_32SC2, i, true);
cv::Mat ci = _contours.getMat(i);
_contours.create((int)c->total, 1, CV_32SC2, (int)i, true);
cv::Mat ci = _contours.getMat((int)i);
CV_Assert( ci.isContinuous() );
cvCvtSeqToArray(c, ci.data);
}

View File

@@ -294,9 +294,8 @@ void cv::gpu::HoughCircles(const GpuMat& src, GpuMat& circles, HoughCirclesBuf&
ensureSizeIsEnough(1, maxCircles, CV_32FC3, circles);
DeviceInfo devInfo;
const int circlesCount = circlesAccumRadius_gpu(centers, centersCount, srcPoints, pointsCount, circles.ptr<float3>(), maxCircles,
dp, minRadius, maxRadius, votesThreshold, devInfo.supports(FEATURE_SET_COMPUTE_20));
dp, minRadius, maxRadius, votesThreshold, deviceSupports(FEATURE_SET_COMPUTE_20));
if (circlesCount > 0)
circles.cols = circlesCount;
@@ -531,7 +530,7 @@ namespace
const func_t func = funcs[dx.depth()];
CV_Assert(func != 0);
edgePointList.cols = edgePointList.step / sizeof(int);
edgePointList.cols = (int) (edgePointList.step / sizeof(int));
ensureSizeIsEnough(2, edges.size().area(), CV_32SC1, edgePointList);
edgePointList.cols = func(edges, dx, dy, edgePointList.ptr<unsigned int>(0), edgePointList.ptr<float>(1));

View File

@@ -547,14 +547,13 @@ void cv::gpu::integralBuffered(const GpuMat& src, GpuMat& sum, GpuMat& buffer, S
cudaStream_t stream = StreamAccessor::getStream(s);
DeviceInfo info;
cv::Size whole;
cv::Point offset;
src.locateROI(whole, offset);
if (info.supports(WARP_SHUFFLE_FUNCTIONS) && src.cols <= 2048
&& offset.x % 16 == 0 && ((src.cols + 63) / 64) * 64 <= (src.step - offset.x))
if (deviceSupports(WARP_SHUFFLE_FUNCTIONS) && src.cols <= 2048
&& offset.x % 16 == 0 && ((src.cols + 63) / 64) * 64 <= (static_cast<int>(src.step) - offset.x))
{
ensureSizeIsEnough(((src.rows + 7) / 8) * 8, ((src.cols + 63) / 64) * 64, CV_32SC1, buffer);
@@ -972,36 +971,26 @@ void cv::gpu::histRange(const GpuMat& src, GpuMat hist[4], const GpuMat levels[4
hist_callers[src.depth()](src, hist, levels, buf, StreamAccessor::getStream(stream));
}
namespace cv { namespace gpu { namespace device
namespace hist
{
namespace hist
{
void histogram256_gpu(PtrStepSzb src, int* hist, unsigned int* buf, cudaStream_t stream);
const int PARTIAL_HISTOGRAM256_COUNT = 240;
const int HISTOGRAM256_BIN_COUNT = 256;
void equalizeHist_gpu(PtrStepSzb src, PtrStepSzb dst, const int* lut, cudaStream_t stream);
}
}}}
void histogram256(PtrStepSzb src, int* hist, cudaStream_t stream);
void equalizeHist(PtrStepSzb src, PtrStepSzb dst, const int* lut, cudaStream_t stream);
}
void cv::gpu::calcHist(const GpuMat& src, GpuMat& hist, Stream& stream)
{
GpuMat buf;
calcHist(src, hist, buf, stream);
CV_Assert(src.type() == CV_8UC1);
hist.create(1, 256, CV_32SC1);
hist.setTo(Scalar::all(0));
hist::histogram256(src, hist.ptr<int>(), StreamAccessor::getStream(stream));
}
void cv::gpu::calcHist(const GpuMat& src, GpuMat& hist, GpuMat& buf, Stream& stream)
{
using namespace ::cv::gpu::device::hist;
CV_Assert(src.type() == CV_8UC1);
hist.create(1, 256, CV_32SC1);
ensureSizeIsEnough(1, PARTIAL_HISTOGRAM256_COUNT * HISTOGRAM256_BIN_COUNT, CV_32SC1, buf);
histogram256_gpu(src, hist.ptr<int>(), buf.ptr<unsigned int>(), StreamAccessor::getStream(stream));
(void) buf;
calcHist(src, hist, stream);
}
void cv::gpu::equalizeHist(const GpuMat& src, GpuMat& dst, Stream& stream)
@@ -1019,8 +1008,6 @@ void cv::gpu::equalizeHist(const GpuMat& src, GpuMat& dst, GpuMat& hist, Stream&
void cv::gpu::equalizeHist(const GpuMat& src, GpuMat& dst, GpuMat& hist, GpuMat& buf, Stream& s)
{
using namespace ::cv::gpu::device::hist;
CV_Assert(src.type() == CV_8UC1);
dst.create(src.size(), src.type());
@@ -1028,15 +1015,12 @@ void cv::gpu::equalizeHist(const GpuMat& src, GpuMat& dst, GpuMat& hist, GpuMat&
int intBufSize;
nppSafeCall( nppsIntegralGetBufferSize_32s(256, &intBufSize) );
int bufSize = static_cast<int>(std::max(256 * 240 * sizeof(int), intBufSize + 256 * sizeof(int)));
ensureSizeIsEnough(1, intBufSize + 256 * sizeof(int), CV_8UC1, buf);
ensureSizeIsEnough(1, bufSize, CV_8UC1, buf);
GpuMat histBuf(1, 256 * 240, CV_32SC1, buf.ptr());
GpuMat intBuf(1, intBufSize, CV_8UC1, buf.ptr());
GpuMat lut(1, 256, CV_32S, buf.ptr() + intBufSize);
calcHist(src, hist, histBuf, s);
calcHist(src, hist, s);
cudaStream_t stream = StreamAccessor::getStream(s);
@@ -1044,10 +1028,7 @@ void cv::gpu::equalizeHist(const GpuMat& src, GpuMat& dst, GpuMat& hist, GpuMat&
nppSafeCall( nppsIntegral_32s(hist.ptr<Npp32s>(), lut.ptr<Npp32s>(), 256, intBuf.ptr<Npp8u>()) );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
equalizeHist_gpu(src, dst, lut.ptr<int>(), stream);
hist::equalizeHist(src, dst, lut.ptr<int>(), stream);
}
////////////////////////////////////////////////////////////////////////
@@ -1448,35 +1429,31 @@ void cv::gpu::convolve(const GpuMat& image, const GpuMat& templ, GpuMat& result,
//////////////////////////////////////////////////////////////////////////////
// Canny
cv::gpu::CannyBuf::CannyBuf(const GpuMat& dx_, const GpuMat& dy_) : dx(dx_), dy(dy_)
cv::gpu::CannyBuf::CannyBuf(const GpuMat& dx_, const GpuMat& dy_)
{
CV_Assert(dx_.type() == CV_32SC1 && dy_.type() == CV_32SC1 && dx_.size() == dy_.size());
create(dx_.size(), -1);
(void) dx_;
(void) dy_;
}
void cv::gpu::CannyBuf::create(const Size& image_size, int apperture_size)
{
ensureSizeIsEnough(image_size, CV_32SC1, dx);
ensureSizeIsEnough(image_size, CV_32SC1, dy);
if (apperture_size > 0)
{
ensureSizeIsEnough(image_size, CV_32SC1, dx);
ensureSizeIsEnough(image_size, CV_32SC1, dy);
if (apperture_size == 3)
{
ensureSizeIsEnough(image_size, CV_32SC1, dx_buf);
ensureSizeIsEnough(image_size, CV_32SC1, dy_buf);
}
else if(apperture_size > 0)
{
if (!filterDX)
if (apperture_size != 3)
{
filterDX = createDerivFilter_GPU(CV_8UC1, CV_32S, 1, 0, apperture_size, BORDER_REPLICATE);
if (!filterDY)
filterDY = createDerivFilter_GPU(CV_8UC1, CV_32S, 0, 1, apperture_size, BORDER_REPLICATE);
}
}
ensureSizeIsEnough(image_size.height + 2, image_size.width + 2, CV_32FC1, edgeBuf);
ensureSizeIsEnough(image_size, CV_32FC1, edgeBuf);
ensureSizeIsEnough(image_size, CV_32SC1, dx_buf);
ensureSizeIsEnough(1, image_size.width * image_size.height, CV_16UC2, trackBuf1);
ensureSizeIsEnough(1, image_size.width * image_size.height, CV_16UC2, trackBuf2);
ensureSizeIsEnough(1, image_size.area(), CV_16UC2, trackBuf1);
ensureSizeIsEnough(1, image_size.area(), CV_16UC2, trackBuf2);
}
void cv::gpu::CannyBuf::release()
@@ -1490,91 +1467,86 @@ void cv::gpu::CannyBuf::release()
trackBuf2.release();
}
namespace cv { namespace gpu { namespace device
namespace canny
{
namespace canny
{
void calcSobelRowPass_gpu(PtrStepb src, PtrStepi dx_buf, PtrStepi dy_buf, int rows, int cols);
void calcMagnitude(PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzi dx, PtrStepSzi dy, PtrStepSzf mag, bool L2Grad);
void calcMagnitude(PtrStepSzi dx, PtrStepSzi dy, PtrStepSzf mag, bool L2Grad);
void calcMagnitude_gpu(PtrStepi dx_buf, PtrStepi dy_buf, PtrStepi dx, PtrStepi dy, PtrStepf mag, int rows, int cols, bool L2Grad);
void calcMagnitude_gpu(PtrStepi dx, PtrStepi dy, PtrStepf mag, int rows, int cols, bool L2Grad);
void calcMap(PtrStepSzi dx, PtrStepSzi dy, PtrStepSzf mag, PtrStepSzi map, float low_thresh, float high_thresh);
void calcMap_gpu(PtrStepi dx, PtrStepi dy, PtrStepf mag, PtrStepi map, int rows, int cols, float low_thresh, float high_thresh);
void edgesHysteresisLocal(PtrStepSzi map, ushort2* st1);
void edgesHysteresisLocal_gpu(PtrStepi map, ushort2* st1, int rows, int cols);
void edgesHysteresisGlobal(PtrStepSzi map, ushort2* st1, ushort2* st2);
void edgesHysteresisGlobal_gpu(PtrStepi map, ushort2* st1, ushort2* st2, int rows, int cols);
void getEdges_gpu(PtrStepi map, PtrStepb dst, int rows, int cols);
}
}}}
void getEdges(PtrStepSzi map, PtrStepSzb dst);
}
namespace
{
void CannyCaller(CannyBuf& buf, GpuMat& dst, float low_thresh, float high_thresh)
void CannyCaller(const GpuMat& dx, const GpuMat& dy, CannyBuf& buf, GpuMat& dst, float low_thresh, float high_thresh)
{
using namespace ::cv::gpu::device::canny;
using namespace canny;
calcMap_gpu(buf.dx, buf.dy, buf.edgeBuf, buf.edgeBuf, dst.rows, dst.cols, low_thresh, high_thresh);
calcMap(dx, dy, buf.edgeBuf, buf.dx_buf, low_thresh, high_thresh);
edgesHysteresisLocal_gpu(buf.edgeBuf, buf.trackBuf1.ptr<ushort2>(), dst.rows, dst.cols);
edgesHysteresisLocal(buf.dx_buf, buf.trackBuf1.ptr<ushort2>());
edgesHysteresisGlobal_gpu(buf.edgeBuf, buf.trackBuf1.ptr<ushort2>(), buf.trackBuf2.ptr<ushort2>(), dst.rows, dst.cols);
edgesHysteresisGlobal(buf.dx_buf, buf.trackBuf1.ptr<ushort2>(), buf.trackBuf2.ptr<ushort2>());
getEdges_gpu(buf.edgeBuf, dst, dst.rows, dst.cols);
getEdges(buf.dx_buf, dst);
}
}
void cv::gpu::Canny(const GpuMat& src, GpuMat& dst, double low_thresh, double high_thresh, int apperture_size, bool L2gradient)
{
CannyBuf buf(src.size(), apperture_size);
CannyBuf buf;
Canny(src, buf, dst, low_thresh, high_thresh, apperture_size, L2gradient);
}
void cv::gpu::Canny(const GpuMat& src, CannyBuf& buf, GpuMat& dst, double low_thresh, double high_thresh, int apperture_size, bool L2gradient)
{
using namespace ::cv::gpu::device::canny;
using namespace canny;
CV_Assert(src.type() == CV_8UC1);
if (!TargetArchs::builtWith(SHARED_ATOMICS) || !DeviceInfo().supports(SHARED_ATOMICS))
if (!deviceSupports(SHARED_ATOMICS))
CV_Error(CV_StsNotImplemented, "The device doesn't support shared atomics");
if( low_thresh > high_thresh )
std::swap( low_thresh, high_thresh);
dst.create(src.size(), CV_8U);
dst.setTo(Scalar::all(0));
buf.create(src.size(), apperture_size);
buf.edgeBuf.setTo(Scalar::all(0));
if (apperture_size == 3)
{
calcSobelRowPass_gpu(src, buf.dx_buf, buf.dy_buf, src.rows, src.cols);
Size wholeSize;
Point ofs;
src.locateROI(wholeSize, ofs);
GpuMat srcWhole(wholeSize, src.type(), src.datastart, src.step);
calcMagnitude_gpu(buf.dx_buf, buf.dy_buf, buf.dx, buf.dy, buf.edgeBuf, src.rows, src.cols, L2gradient);
calcMagnitude(srcWhole, ofs.x, ofs.y, buf.dx, buf.dy, buf.edgeBuf, L2gradient);
}
else
{
buf.filterDX->apply(src, buf.dx, Rect(0, 0, src.cols, src.rows));
buf.filterDY->apply(src, buf.dy, Rect(0, 0, src.cols, src.rows));
calcMagnitude_gpu(buf.dx, buf.dy, buf.edgeBuf, src.rows, src.cols, L2gradient);
calcMagnitude(buf.dx, buf.dy, buf.edgeBuf, L2gradient);
}
CannyCaller(buf, dst, static_cast<float>(low_thresh), static_cast<float>(high_thresh));
CannyCaller(buf.dx, buf.dy, buf, dst, static_cast<float>(low_thresh), static_cast<float>(high_thresh));
}
void cv::gpu::Canny(const GpuMat& dx, const GpuMat& dy, GpuMat& dst, double low_thresh, double high_thresh, bool L2gradient)
{
CannyBuf buf(dx, dy);
CannyBuf buf;
Canny(dx, dy, buf, dst, low_thresh, high_thresh, L2gradient);
}
void cv::gpu::Canny(const GpuMat& dx, const GpuMat& dy, CannyBuf& buf, GpuMat& dst, double low_thresh, double high_thresh, bool L2gradient)
{
using namespace ::cv::gpu::device::canny;
using namespace canny;
CV_Assert(TargetArchs::builtWith(SHARED_ATOMICS) && DeviceInfo().supports(SHARED_ATOMICS));
CV_Assert(dx.type() == CV_32SC1 && dy.type() == CV_32SC1 && dx.size() == dy.size());
@@ -1583,17 +1555,11 @@ void cv::gpu::Canny(const GpuMat& dx, const GpuMat& dy, CannyBuf& buf, GpuMat& d
std::swap( low_thresh, high_thresh);
dst.create(dx.size(), CV_8U);
dst.setTo(Scalar::all(0));
buf.dx = dx; buf.dy = dy;
buf.create(dx.size(), -1);
buf.edgeBuf.setTo(Scalar::all(0));
calcMagnitude_gpu(dx, dy, buf.edgeBuf, dx.rows, dx.cols, L2gradient);
calcMagnitude(dx, dy, buf.edgeBuf, L2gradient);
CannyCaller(buf, dst, static_cast<float>(low_thresh), static_cast<float>(high_thresh));
CannyCaller(dx, dy, buf, dst, static_cast<float>(low_thresh), static_cast<float>(high_thresh));
}
#endif /* !defined (HAVE_CUDA) */

View File

@@ -118,7 +118,7 @@ void cv::gpu::meanStdDev(const GpuMat& src, Scalar& mean, Scalar& stddev, GpuMat
{
CV_Assert(src.type() == CV_8UC1);
if (!TargetArchs::builtWith(FEATURE_SET_COMPUTE_13) || !DeviceInfo().supports(FEATURE_SET_COMPUTE_13))
if (!deviceSupports(FEATURE_SET_COMPUTE_13))
CV_Error(CV_StsNotImplemented, "Not sufficient compute capebility");
NppiSize sz;
@@ -204,34 +204,19 @@ double cv::gpu::norm(const GpuMat& src1, const GpuMat& src2, int normType)
////////////////////////////////////////////////////////////////////////
// Sum
namespace cv { namespace gpu { namespace device
namespace sum
{
namespace matrix_reductions
{
namespace sum
{
template <typename T>
void sumCaller(const PtrStepSzb src, PtrStepb buf, double* sum, int cn);
void getBufSize(int cols, int rows, int cn, int& bufcols, int& bufrows);
template <typename T>
void sumMultipassCaller(const PtrStepSzb src, PtrStepb buf, double* sum, int cn);
template <typename T, int cn>
void run(PtrStepSzb src, void* buf, double* sum);
template <typename T>
void absSumCaller(const PtrStepSzb src, PtrStepb buf, double* sum, int cn);
template <typename T, int cn>
void runAbs(PtrStepSzb src, void* buf, double* sum);
template <typename T>
void absSumMultipassCaller(const PtrStepSzb src, PtrStepb buf, double* sum, int cn);
template <typename T>
void sqrSumCaller(const PtrStepSzb src, PtrStepb buf, double* sum, int cn);
template <typename T>
void sqrSumMultipassCaller(const PtrStepSzb src, PtrStepb buf, double* sum, int cn);
void getBufSizeRequired(int cols, int rows, int cn, int& bufcols, int& bufrows);
}
}
}}}
template <typename T, int cn>
void runSqr(PtrStepSzb src, void* buf, double* sum);
}
Scalar cv::gpu::sum(const GpuMat& src)
{
@@ -239,159 +224,127 @@ Scalar cv::gpu::sum(const GpuMat& src)
return sum(src, buf);
}
Scalar cv::gpu::sum(const GpuMat& src, GpuMat& buf)
{
using namespace cv::gpu::device::matrix_reductions::sum;
typedef void (*Caller)(const PtrStepSzb, PtrStepb, double*, int);
static Caller multipass_callers[] =
typedef void (*func_t)(PtrStepSzb src, void* buf, double* sum);
static const func_t funcs[7][5] =
{
sumMultipassCaller<unsigned char>, sumMultipassCaller<char>,
sumMultipassCaller<unsigned short>, sumMultipassCaller<short>,
sumMultipassCaller<int>, sumMultipassCaller<float>
{0, ::sum::run<uchar , 1>, ::sum::run<uchar , 2>, ::sum::run<uchar , 3>, ::sum::run<uchar , 4>},
{0, ::sum::run<schar , 1>, ::sum::run<schar , 2>, ::sum::run<schar , 3>, ::sum::run<schar , 4>},
{0, ::sum::run<ushort, 1>, ::sum::run<ushort, 2>, ::sum::run<ushort, 3>, ::sum::run<ushort, 4>},
{0, ::sum::run<short , 1>, ::sum::run<short , 2>, ::sum::run<short , 3>, ::sum::run<short , 4>},
{0, ::sum::run<int , 1>, ::sum::run<int , 2>, ::sum::run<int , 3>, ::sum::run<int , 4>},
{0, ::sum::run<float , 1>, ::sum::run<float , 2>, ::sum::run<float , 3>, ::sum::run<float , 4>},
{0, ::sum::run<double, 1>, ::sum::run<double, 2>, ::sum::run<double, 3>, ::sum::run<double, 4>}
};
static Caller singlepass_callers[] = {
sumCaller<unsigned char>, sumCaller<char>,
sumCaller<unsigned short>, sumCaller<short>,
sumCaller<int>, sumCaller<float>
};
CV_Assert(src.depth() <= CV_32F);
if (src.depth() == CV_64F)
{
if (!deviceSupports(NATIVE_DOUBLE))
CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double");
}
Size buf_size;
getBufSizeRequired(src.cols, src.rows, src.channels(), buf_size.width, buf_size.height);
::sum::getBufSize(src.cols, src.rows, src.channels(), buf_size.width, buf_size.height);
ensureSizeIsEnough(buf_size, CV_8U, buf);
buf.setTo(Scalar::all(0));
Caller* callers = multipass_callers;
if (TargetArchs::builtWith(GLOBAL_ATOMICS) && DeviceInfo().supports(GLOBAL_ATOMICS))
callers = singlepass_callers;
Caller caller = callers[src.depth()];
const func_t func = funcs[src.depth()][src.channels()];
double result[4];
caller(src, buf, result, src.channels());
func(src, buf.data, result);
return Scalar(result[0], result[1], result[2], result[3]);
}
Scalar cv::gpu::absSum(const GpuMat& src)
{
GpuMat buf;
return absSum(src, buf);
}
Scalar cv::gpu::absSum(const GpuMat& src, GpuMat& buf)
{
using namespace cv::gpu::device::matrix_reductions::sum;
typedef void (*Caller)(const PtrStepSzb, PtrStepb, double*, int);
static Caller multipass_callers[] =
typedef void (*func_t)(PtrStepSzb src, void* buf, double* sum);
static const func_t funcs[7][5] =
{
absSumMultipassCaller<unsigned char>, absSumMultipassCaller<char>,
absSumMultipassCaller<unsigned short>, absSumMultipassCaller<short>,
absSumMultipassCaller<int>, absSumMultipassCaller<float>
{0, ::sum::runAbs<uchar , 1>, ::sum::runAbs<uchar , 2>, ::sum::runAbs<uchar , 3>, ::sum::runAbs<uchar , 4>},
{0, ::sum::runAbs<schar , 1>, ::sum::runAbs<schar , 2>, ::sum::runAbs<schar , 3>, ::sum::runAbs<schar , 4>},
{0, ::sum::runAbs<ushort, 1>, ::sum::runAbs<ushort, 2>, ::sum::runAbs<ushort, 3>, ::sum::runAbs<ushort, 4>},
{0, ::sum::runAbs<short , 1>, ::sum::runAbs<short , 2>, ::sum::runAbs<short , 3>, ::sum::runAbs<short , 4>},
{0, ::sum::runAbs<int , 1>, ::sum::runAbs<int , 2>, ::sum::runAbs<int , 3>, ::sum::runAbs<int , 4>},
{0, ::sum::runAbs<float , 1>, ::sum::runAbs<float , 2>, ::sum::runAbs<float , 3>, ::sum::runAbs<float , 4>},
{0, ::sum::runAbs<double, 1>, ::sum::runAbs<double, 2>, ::sum::runAbs<double, 3>, ::sum::runAbs<double, 4>}
};
static Caller singlepass_callers[] =
if (src.depth() == CV_64F)
{
absSumCaller<unsigned char>, absSumCaller<char>,
absSumCaller<unsigned short>, absSumCaller<short>,
absSumCaller<int>, absSumCaller<float>
};
CV_Assert(src.depth() <= CV_32F);
if (!deviceSupports(NATIVE_DOUBLE))
CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double");
}
Size buf_size;
getBufSizeRequired(src.cols, src.rows, src.channels(), buf_size.width, buf_size.height);
::sum::getBufSize(src.cols, src.rows, src.channels(), buf_size.width, buf_size.height);
ensureSizeIsEnough(buf_size, CV_8U, buf);
buf.setTo(Scalar::all(0));
Caller* callers = multipass_callers;
if (TargetArchs::builtWith(GLOBAL_ATOMICS) && DeviceInfo().supports(GLOBAL_ATOMICS))
callers = singlepass_callers;
Caller caller = callers[src.depth()];
const func_t func = funcs[src.depth()][src.channels()];
double result[4];
caller(src, buf, result, src.channels());
func(src, buf.data, result);
return Scalar(result[0], result[1], result[2], result[3]);
}
Scalar cv::gpu::sqrSum(const GpuMat& src)
{
GpuMat buf;
return sqrSum(src, buf);
}
Scalar cv::gpu::sqrSum(const GpuMat& src, GpuMat& buf)
{
using namespace cv::gpu::device::matrix_reductions::sum;
typedef void (*Caller)(const PtrStepSzb, PtrStepb, double*, int);
static Caller multipass_callers[] =
typedef void (*func_t)(PtrStepSzb src, void* buf, double* sum);
static const func_t funcs[7][5] =
{
sqrSumMultipassCaller<unsigned char>, sqrSumMultipassCaller<char>,
sqrSumMultipassCaller<unsigned short>, sqrSumMultipassCaller<short>,
sqrSumMultipassCaller<int>, sqrSumMultipassCaller<float>
{0, ::sum::runSqr<uchar , 1>, ::sum::runSqr<uchar , 2>, ::sum::runSqr<uchar , 3>, ::sum::runSqr<uchar , 4>},
{0, ::sum::runSqr<schar , 1>, ::sum::runSqr<schar , 2>, ::sum::runSqr<schar , 3>, ::sum::runSqr<schar , 4>},
{0, ::sum::runSqr<ushort, 1>, ::sum::runSqr<ushort, 2>, ::sum::runSqr<ushort, 3>, ::sum::runSqr<ushort, 4>},
{0, ::sum::runSqr<short , 1>, ::sum::runSqr<short , 2>, ::sum::runSqr<short , 3>, ::sum::runSqr<short , 4>},
{0, ::sum::runSqr<int , 1>, ::sum::runSqr<int , 2>, ::sum::runSqr<int , 3>, ::sum::runSqr<int , 4>},
{0, ::sum::runSqr<float , 1>, ::sum::runSqr<float , 2>, ::sum::runSqr<float , 3>, ::sum::runSqr<float , 4>},
{0, ::sum::runSqr<double, 1>, ::sum::runSqr<double, 2>, ::sum::runSqr<double, 3>, ::sum::runSqr<double, 4>}
};
static Caller singlepass_callers[7] =
if (src.depth() == CV_64F)
{
sqrSumCaller<unsigned char>, sqrSumCaller<char>,
sqrSumCaller<unsigned short>, sqrSumCaller<short>,
sqrSumCaller<int>, sqrSumCaller<float>
};
CV_Assert(src.depth() <= CV_32F);
Caller* callers = multipass_callers;
if (TargetArchs::builtWith(GLOBAL_ATOMICS) && DeviceInfo().supports(GLOBAL_ATOMICS))
callers = singlepass_callers;
if (!deviceSupports(NATIVE_DOUBLE))
CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double");
}
Size buf_size;
getBufSizeRequired(src.cols, src.rows, src.channels(), buf_size.width, buf_size.height);
::sum::getBufSize(src.cols, src.rows, src.channels(), buf_size.width, buf_size.height);
ensureSizeIsEnough(buf_size, CV_8U, buf);
buf.setTo(Scalar::all(0));
Caller caller = callers[src.depth()];
const func_t func = funcs[src.depth()][src.channels()];
double result[4];
caller(src, buf, result, src.channels());
func(src, buf.data, result);
return Scalar(result[0], result[1], result[2], result[3]);
}
////////////////////////////////////////////////////////////////////////
// Find min or max
// minMax
namespace cv { namespace gpu { namespace device
namespace minMax
{
namespace matrix_reductions
{
namespace minmax
{
void getBufSizeRequired(int cols, int rows, int elem_size, int& bufcols, int& bufrows);
template <typename T>
void minMaxCaller(const PtrStepSzb src, double* minval, double* maxval, PtrStepb buf);
template <typename T>
void minMaxMaskCaller(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, PtrStepb buf);
template <typename T>
void minMaxMultipassCaller(const PtrStepSzb src, double* minval, double* maxval, PtrStepb buf);
template <typename T>
void minMaxMaskMultipassCaller(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, PtrStepb buf);
}
}
}}}
void getBufSize(int cols, int rows, int& bufcols, int& bufrows);
template <typename T>
void run(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, PtrStepb buf);
}
void cv::gpu::minMax(const GpuMat& src, double* minVal, double* maxVal, const GpuMat& mask)
{
@@ -399,112 +352,49 @@ void cv::gpu::minMax(const GpuMat& src, double* minVal, double* maxVal, const Gp
minMax(src, minVal, maxVal, mask, buf);
}
void cv::gpu::minMax(const GpuMat& src, double* minVal, double* maxVal, const GpuMat& mask, GpuMat& buf)
{
using namespace ::cv::gpu::device::matrix_reductions::minmax;
typedef void (*Caller)(const PtrStepSzb, double*, double*, PtrStepb);
typedef void (*MaskedCaller)(const PtrStepSzb, const PtrStepb, double*, double*, PtrStepb);
static Caller multipass_callers[] =
typedef void (*func_t)(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, PtrStepb buf);
static const func_t funcs[] =
{
minMaxMultipassCaller<unsigned char>, minMaxMultipassCaller<char>,
minMaxMultipassCaller<unsigned short>, minMaxMultipassCaller<short>,
minMaxMultipassCaller<int>, minMaxMultipassCaller<float>, 0
::minMax::run<uchar>,
::minMax::run<schar>,
::minMax::run<ushort>,
::minMax::run<short>,
::minMax::run<int>,
::minMax::run<float>,
::minMax::run<double>
};
static Caller singlepass_callers[] =
{
minMaxCaller<unsigned char>, minMaxCaller<char>,
minMaxCaller<unsigned short>, minMaxCaller<short>,
minMaxCaller<int>, minMaxCaller<float>, minMaxCaller<double>
};
static MaskedCaller masked_multipass_callers[] =
{
minMaxMaskMultipassCaller<unsigned char>, minMaxMaskMultipassCaller<char>,
minMaxMaskMultipassCaller<unsigned short>, minMaxMaskMultipassCaller<short>,
minMaxMaskMultipassCaller<int>, minMaxMaskMultipassCaller<float>, 0
};
static MaskedCaller masked_singlepass_callers[] =
{
minMaxMaskCaller<unsigned char>, minMaxMaskCaller<char>,
minMaxMaskCaller<unsigned short>, minMaxMaskCaller<short>,
minMaxMaskCaller<int>, minMaxMaskCaller<float>, minMaxMaskCaller<double>
};
CV_Assert(src.depth() <= CV_64F);
CV_Assert(src.channels() == 1);
CV_Assert(mask.empty() || (mask.type() == CV_8U && src.size() == mask.size()));
CV_Assert( src.channels() == 1 );
CV_Assert( mask.empty() || (mask.size() == src.size() && mask.type() == CV_8U) );
if (src.depth() == CV_64F)
{
if (!TargetArchs::builtWith(NATIVE_DOUBLE) || !DeviceInfo().supports(NATIVE_DOUBLE))
if (!deviceSupports(NATIVE_DOUBLE))
CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double");
}
double minVal_; if (!minVal) minVal = &minVal_;
double maxVal_; if (!maxVal) maxVal = &maxVal_;
Size buf_size;
getBufSizeRequired(src.cols, src.rows, static_cast<int>(src.elemSize()), buf_size.width, buf_size.height);
::minMax::getBufSize(src.cols, src.rows, buf_size.width, buf_size.height);
ensureSizeIsEnough(buf_size, CV_8U, buf);
if (mask.empty())
{
Caller* callers = multipass_callers;
if (TargetArchs::builtWith(GLOBAL_ATOMICS) && DeviceInfo().supports(GLOBAL_ATOMICS))
callers = singlepass_callers;
const func_t func = funcs[src.depth()];
Caller caller = callers[src.type()];
CV_Assert(caller != 0);
caller(src, minVal, maxVal, buf);
}
else
{
MaskedCaller* callers = masked_multipass_callers;
if (TargetArchs::builtWith(GLOBAL_ATOMICS) && DeviceInfo().supports(GLOBAL_ATOMICS))
callers = masked_singlepass_callers;
MaskedCaller caller = callers[src.type()];
CV_Assert(caller != 0);
caller(src, mask, minVal, maxVal, buf);
}
double temp1, temp2;
func(src, mask, minVal ? minVal : &temp1, maxVal ? maxVal : &temp2, buf);
}
////////////////////////////////////////////////////////////////////////
// Locate min and max
// minMaxLoc
namespace cv { namespace gpu { namespace device
namespace minMaxLoc
{
namespace matrix_reductions
{
namespace minmaxloc
{
void getBufSizeRequired(int cols, int rows, int elem_size, int& b1cols,
int& b1rows, int& b2cols, int& b2rows);
void getBufSize(int cols, int rows, size_t elem_size, int& b1cols, int& b1rows, int& b2cols, int& b2rows);
template <typename T>
void minMaxLocCaller(const PtrStepSzb src, double* minval, double* maxval,
int minloc[2], int maxloc[2], PtrStepb valBuf, PtrStepb locBuf);
template <typename T>
void minMaxLocMaskCaller(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval,
int minloc[2], int maxloc[2], PtrStepb valBuf, PtrStepb locBuf);
template <typename T>
void minMaxLocMultipassCaller(const PtrStepSzb src, double* minval, double* maxval,
int minloc[2], int maxloc[2], PtrStepb valBuf, PtrStepb locBuf);
template <typename T>
void minMaxLocMaskMultipassCaller(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval,
int minloc[2], int maxloc[2], PtrStepb valBuf, PtrStepb locBuf);
}
}
}}}
template <typename T>
void run(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, int* minloc, int* maxloc, PtrStepb valbuf, PtrStep<unsigned int> locbuf);
}
void cv::gpu::minMaxLoc(const GpuMat& src, double* minVal, double* maxVal, Point* minLoc, Point* maxLoc, const GpuMat& mask)
{
@@ -515,104 +405,49 @@ void cv::gpu::minMaxLoc(const GpuMat& src, double* minVal, double* maxVal, Point
void cv::gpu::minMaxLoc(const GpuMat& src, double* minVal, double* maxVal, Point* minLoc, Point* maxLoc,
const GpuMat& mask, GpuMat& valBuf, GpuMat& locBuf)
{
using namespace ::cv::gpu::device::matrix_reductions::minmaxloc;
typedef void (*Caller)(const PtrStepSzb, double*, double*, int[2], int[2], PtrStepb, PtrStepb);
typedef void (*MaskedCaller)(const PtrStepSzb, const PtrStepb, double*, double*, int[2], int[2], PtrStepb, PtrStepb);
static Caller multipass_callers[] =
typedef void (*func_t)(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, int* minloc, int* maxloc, PtrStepb valbuf, PtrStep<unsigned int> locbuf);
static const func_t funcs[] =
{
minMaxLocMultipassCaller<unsigned char>, minMaxLocMultipassCaller<char>,
minMaxLocMultipassCaller<unsigned short>, minMaxLocMultipassCaller<short>,
minMaxLocMultipassCaller<int>, minMaxLocMultipassCaller<float>, 0
::minMaxLoc::run<uchar>,
::minMaxLoc::run<schar>,
::minMaxLoc::run<ushort>,
::minMaxLoc::run<short>,
::minMaxLoc::run<int>,
::minMaxLoc::run<float>,
::minMaxLoc::run<double>
};
static Caller singlepass_callers[] =
{
minMaxLocCaller<unsigned char>, minMaxLocCaller<char>,
minMaxLocCaller<unsigned short>, minMaxLocCaller<short>,
minMaxLocCaller<int>, minMaxLocCaller<float>, minMaxLocCaller<double>
};
static MaskedCaller masked_multipass_callers[] =
{
minMaxLocMaskMultipassCaller<unsigned char>, minMaxLocMaskMultipassCaller<char>,
minMaxLocMaskMultipassCaller<unsigned short>, minMaxLocMaskMultipassCaller<short>,
minMaxLocMaskMultipassCaller<int>, minMaxLocMaskMultipassCaller<float>, 0
};
static MaskedCaller masked_singlepass_callers[] =
{
minMaxLocMaskCaller<unsigned char>, minMaxLocMaskCaller<char>,
minMaxLocMaskCaller<unsigned short>, minMaxLocMaskCaller<short>,
minMaxLocMaskCaller<int>, minMaxLocMaskCaller<float>, minMaxLocMaskCaller<double>
};
CV_Assert(src.depth() <= CV_64F);
CV_Assert(src.channels() == 1);
CV_Assert(mask.empty() || (mask.type() == CV_8U && src.size() == mask.size()));
CV_Assert( src.channels() == 1 );
CV_Assert( mask.empty() || (mask.size() == src.size() && mask.type() == CV_8U) );
if (src.depth() == CV_64F)
{
if (!TargetArchs::builtWith(NATIVE_DOUBLE) || !DeviceInfo().supports(NATIVE_DOUBLE))
if (!deviceSupports(NATIVE_DOUBLE))
CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double");
}
double minVal_; if (!minVal) minVal = &minVal_;
double maxVal_; if (!maxVal) maxVal = &maxVal_;
int minLoc_[2];
int maxLoc_[2];
Size valbuf_size, locbuf_size;
getBufSizeRequired(src.cols, src.rows, static_cast<int>(src.elemSize()), valbuf_size.width,
valbuf_size.height, locbuf_size.width, locbuf_size.height);
::minMaxLoc::getBufSize(src.cols, src.rows, src.elemSize(), valbuf_size.width, valbuf_size.height, locbuf_size.width, locbuf_size.height);
ensureSizeIsEnough(valbuf_size, CV_8U, valBuf);
ensureSizeIsEnough(locbuf_size, CV_8U, locBuf);
if (mask.empty())
{
Caller* callers = multipass_callers;
if (TargetArchs::builtWith(GLOBAL_ATOMICS) && DeviceInfo().supports(GLOBAL_ATOMICS))
callers = singlepass_callers;
const func_t func = funcs[src.depth()];
Caller caller = callers[src.type()];
CV_Assert(caller != 0);
caller(src, minVal, maxVal, minLoc_, maxLoc_, valBuf, locBuf);
}
else
{
MaskedCaller* callers = masked_multipass_callers;
if (TargetArchs::builtWith(GLOBAL_ATOMICS) && DeviceInfo().supports(GLOBAL_ATOMICS))
callers = masked_singlepass_callers;
MaskedCaller caller = callers[src.type()];
CV_Assert(caller != 0);
caller(src, mask, minVal, maxVal, minLoc_, maxLoc_, valBuf, locBuf);
}
if (minLoc) { minLoc->x = minLoc_[0]; minLoc->y = minLoc_[1]; }
if (maxLoc) { maxLoc->x = maxLoc_[0]; maxLoc->y = maxLoc_[1]; }
double temp1, temp2;
Point temp3, temp4;
func(src, mask, minVal ? minVal : &temp1, maxVal ? maxVal : &temp2, minLoc ? &minLoc->x : &temp3.x, maxLoc ? &maxLoc->x : &temp4.x, valBuf, locBuf);
}
//////////////////////////////////////////////////////////////////////////////
// Count non-zero elements
// countNonZero
namespace cv { namespace gpu { namespace device
namespace countNonZero
{
namespace matrix_reductions
{
namespace countnonzero
{
void getBufSizeRequired(int cols, int rows, int& bufcols, int& bufrows);
void getBufSize(int cols, int rows, int& bufcols, int& bufrows);
template <typename T>
int countNonZeroCaller(const PtrStepSzb src, PtrStepb buf);
template <typename T>
int countNonZeroMultipassCaller(const PtrStepSzb src, PtrStepb buf);
}
}
}}}
template <typename T>
int run(const PtrStepSzb src, PtrStep<unsigned int> buf);
}
int cv::gpu::countNonZero(const GpuMat& src)
{
@@ -620,198 +455,213 @@ int cv::gpu::countNonZero(const GpuMat& src)
return countNonZero(src, buf);
}
int cv::gpu::countNonZero(const GpuMat& src, GpuMat& buf)
{
using namespace ::cv::gpu::device::matrix_reductions::countnonzero;
typedef int (*Caller)(const PtrStepSzb src, PtrStepb buf);
static Caller multipass_callers[7] =
typedef int (*func_t)(const PtrStepSzb src, PtrStep<unsigned int> buf);
static const func_t funcs[] =
{
countNonZeroMultipassCaller<unsigned char>, countNonZeroMultipassCaller<char>,
countNonZeroMultipassCaller<unsigned short>, countNonZeroMultipassCaller<short>,
countNonZeroMultipassCaller<int>, countNonZeroMultipassCaller<float>, 0
::countNonZero::run<uchar>,
::countNonZero::run<schar>,
::countNonZero::run<ushort>,
::countNonZero::run<short>,
::countNonZero::run<int>,
::countNonZero::run<float>,
::countNonZero::run<double>
};
static Caller singlepass_callers[7] =
{
countNonZeroCaller<unsigned char>, countNonZeroCaller<char>,
countNonZeroCaller<unsigned short>, countNonZeroCaller<short>,
countNonZeroCaller<int>, countNonZeroCaller<float>, countNonZeroCaller<double> };
CV_Assert(src.depth() <= CV_64F);
CV_Assert(src.channels() == 1);
if (src.depth() == CV_64F)
{
if (!TargetArchs::builtWith(NATIVE_DOUBLE) || !DeviceInfo().supports(NATIVE_DOUBLE))
if (!deviceSupports(NATIVE_DOUBLE))
CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double");
}
Size buf_size;
getBufSizeRequired(src.cols, src.rows, buf_size.width, buf_size.height);
::countNonZero::getBufSize(src.cols, src.rows, buf_size.width, buf_size.height);
ensureSizeIsEnough(buf_size, CV_8U, buf);
Caller* callers = multipass_callers;
if (TargetArchs::builtWith(GLOBAL_ATOMICS) && DeviceInfo().supports(GLOBAL_ATOMICS))
callers = singlepass_callers;
const func_t func = funcs[src.depth()];
Caller caller = callers[src.type()];
CV_Assert(caller != 0);
return caller(src, buf);
return func(src, buf);
}
//////////////////////////////////////////////////////////////////////////////
// reduce
namespace cv { namespace gpu { namespace device
namespace reduce
{
namespace matrix_reductions
{
template <typename T, typename S, typename D> void reduceRows_gpu(const PtrStepSzb& src, const PtrStepSzb& dst, int reduceOp, cudaStream_t stream);
template <typename T, typename S, typename D> void reduceCols_gpu(const PtrStepSzb& src, int cn, const PtrStepSzb& dst, int reduceOp, cudaStream_t stream);
}
}}}
template <typename T, typename S, typename D>
void rows(PtrStepSzb src, void* dst, int op, cudaStream_t stream);
template <typename T, typename S, typename D>
void cols(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);
}
void cv::gpu::reduce(const GpuMat& src, GpuMat& dst, int dim, int reduceOp, int dtype, Stream& stream)
{
using namespace ::cv::gpu::device::matrix_reductions;
CV_Assert(src.depth() <= CV_32F && src.channels() <= 4 && dtype <= CV_32F);
CV_Assert(dim == 0 || dim == 1);
CV_Assert(reduceOp == CV_REDUCE_SUM || reduceOp == CV_REDUCE_AVG || reduceOp == CV_REDUCE_MAX || reduceOp == CV_REDUCE_MIN);
CV_Assert( src.channels() <= 4 );
CV_Assert( dim == 0 || dim == 1 );
CV_Assert( reduceOp == CV_REDUCE_SUM || reduceOp == CV_REDUCE_AVG || reduceOp == CV_REDUCE_MAX || reduceOp == CV_REDUCE_MIN );
if (dtype < 0)
dtype = src.depth();
dst.create(1, dim == 0 ? src.cols : src.rows, CV_MAKETYPE(dtype, src.channels()));
dst.create(1, dim == 0 ? src.cols : src.rows, CV_MAKE_TYPE(CV_MAT_DEPTH(dtype), src.channels()));
if (dim == 0)
{
typedef void (*caller_t)(const PtrStepSzb& src, const PtrStepSzb& dst, int reduceOp, cudaStream_t stream);
static const caller_t callers[6][6] =
typedef void (*func_t)(PtrStepSzb src, void* dst, int op, cudaStream_t stream);
static const func_t funcs[7][7] =
{
{
reduceRows_gpu<unsigned char, int, unsigned char>,
0/*reduceRows_gpu<unsigned char, int, signed char>*/,
0/*reduceRows_gpu<unsigned char, int, unsigned short>*/,
0/*reduceRows_gpu<unsigned char, int, short>*/,
reduceRows_gpu<unsigned char, int, int>,
reduceRows_gpu<unsigned char, int, float>
::reduce::rows<unsigned char, int, unsigned char>,
0/*::reduce::rows<unsigned char, int, signed char>*/,
0/*::reduce::rows<unsigned char, int, unsigned short>*/,
0/*::reduce::rows<unsigned char, int, short>*/,
::reduce::rows<unsigned char, int, int>,
::reduce::rows<unsigned char, float, float>,
::reduce::rows<unsigned char, double, double>
},
{
0/*reduceRows_gpu<signed char, int, unsigned char>*/,
0/*reduceRows_gpu<signed char, int, signed char>*/,
0/*reduceRows_gpu<signed char, int, unsigned short>*/,
0/*reduceRows_gpu<signed char, int, short>*/,
0/*reduceRows_gpu<signed char, int, int>*/,
0/*reduceRows_gpu<signed char, int, float>*/
0/*::reduce::rows<signed char, int, unsigned char>*/,
0/*::reduce::rows<signed char, int, signed char>*/,
0/*::reduce::rows<signed char, int, unsigned short>*/,
0/*::reduce::rows<signed char, int, short>*/,
0/*::reduce::rows<signed char, int, int>*/,
0/*::reduce::rows<signed char, float, float>*/,
0/*::reduce::rows<signed char, double, double>*/
},
{
0/*reduceRows_gpu<unsigned short, int, unsigned char>*/,
0/*reduceRows_gpu<unsigned short, int, signed char>*/,
reduceRows_gpu<unsigned short, int, unsigned short>,
0/*reduceRows_gpu<unsigned short, int, short>*/,
reduceRows_gpu<unsigned short, int, int>,
reduceRows_gpu<unsigned short, int, float>
0/*::reduce::rows<unsigned short, int, unsigned char>*/,
0/*::reduce::rows<unsigned short, int, signed char>*/,
::reduce::rows<unsigned short, int, unsigned short>,
0/*::reduce::rows<unsigned short, int, short>*/,
::reduce::rows<unsigned short, int, int>,
::reduce::rows<unsigned short, float, float>,
::reduce::rows<unsigned short, double, double>
},
{
0/*reduceRows_gpu<short, int, unsigned char>*/,
0/*reduceRows_gpu<short, int, signed char>*/,
0/*reduceRows_gpu<short, int, unsigned short>*/,
reduceRows_gpu<short, int, short>,
reduceRows_gpu<short, int, int>,
reduceRows_gpu<short, int, float>
0/*::reduce::rows<short, int, unsigned char>*/,
0/*::reduce::rows<short, int, signed char>*/,
0/*::reduce::rows<short, int, unsigned short>*/,
::reduce::rows<short, int, short>,
::reduce::rows<short, int, int>,
::reduce::rows<short, float, float>,
::reduce::rows<short, double, double>
},
{
0/*reduceRows_gpu<int, int, unsigned char>*/,
0/*reduceRows_gpu<int, int, signed char>*/,
0/*reduceRows_gpu<int, int, unsigned short>*/,
0/*reduceRows_gpu<int, int, short>*/,
reduceRows_gpu<int, int, int>,
reduceRows_gpu<int, int, float>
0/*::reduce::rows<int, int, unsigned char>*/,
0/*::reduce::rows<int, int, signed char>*/,
0/*::reduce::rows<int, int, unsigned short>*/,
0/*::reduce::rows<int, int, short>*/,
::reduce::rows<int, int, int>,
::reduce::rows<int, float, float>,
::reduce::rows<int, double, double>
},
{
0/*reduceRows_gpu<float, float, unsigned char>*/,
0/*reduceRows_gpu<float, float, signed char>*/,
0/*reduceRows_gpu<float, float, unsigned short>*/,
0/*reduceRows_gpu<float, float, short>*/,
0/*reduceRows_gpu<float, float, int>*/,
reduceRows_gpu<float, float, float>
0/*::reduce::rows<float, float, unsigned char>*/,
0/*::reduce::rows<float, float, signed char>*/,
0/*::reduce::rows<float, float, unsigned short>*/,
0/*::reduce::rows<float, float, short>*/,
0/*::reduce::rows<float, float, int>*/,
::reduce::rows<float, float, float>,
::reduce::rows<float, double, double>
},
{
0/*::reduce::rows<double, double, unsigned char>*/,
0/*::reduce::rows<double, double, signed char>*/,
0/*::reduce::rows<double, double, unsigned short>*/,
0/*::reduce::rows<double, double, short>*/,
0/*::reduce::rows<double, double, int>*/,
0/*::reduce::rows<double, double, float>*/,
::reduce::rows<double, double, double>
}
};
const caller_t func = callers[src.depth()][dst.depth()];
const func_t func = funcs[src.depth()][dst.depth()];
if (!func)
CV_Error(CV_StsUnsupportedFormat, "Unsupported combination of input and output array formats");
func(src.reshape(1), dst.reshape(1), reduceOp, StreamAccessor::getStream(stream));
func(src.reshape(1), dst.data, reduceOp, StreamAccessor::getStream(stream));
}
else
{
typedef void (*caller_t)(const PtrStepSzb& src, int cn, const PtrStepSzb& dst, int reduceOp, cudaStream_t stream);
static const caller_t callers[6][6] =
typedef void (*func_t)(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);
static const func_t funcs[7][7] =
{
{
reduceCols_gpu<unsigned char, int, unsigned char>,
0/*reduceCols_gpu<unsigned char, int, signed char>*/,
0/*reduceCols_gpu<unsigned char, int, unsigned short>*/,
0/*reduceCols_gpu<unsigned char, int, short>*/,
reduceCols_gpu<unsigned char, int, int>,
reduceCols_gpu<unsigned char, int, float>
::reduce::cols<unsigned char, int, unsigned char>,
0/*::reduce::cols<unsigned char, int, signed char>*/,
0/*::reduce::cols<unsigned char, int, unsigned short>*/,
0/*::reduce::cols<unsigned char, int, short>*/,
::reduce::cols<unsigned char, int, int>,
::reduce::cols<unsigned char, float, float>,
::reduce::cols<unsigned char, double, double>
},
{
0/*reduceCols_gpu<signed char, int, unsigned char>*/,
0/*reduceCols_gpu<signed char, int, signed char>*/,
0/*reduceCols_gpu<signed char, int, unsigned short>*/,
0/*reduceCols_gpu<signed char, int, short>*/,
0/*reduceCols_gpu<signed char, int, int>*/,
0/*reduceCols_gpu<signed char, int, float>*/
0/*::reduce::cols<signed char, int, unsigned char>*/,
0/*::reduce::cols<signed char, int, signed char>*/,
0/*::reduce::cols<signed char, int, unsigned short>*/,
0/*::reduce::cols<signed char, int, short>*/,
0/*::reduce::cols<signed char, int, int>*/,
0/*::reduce::cols<signed char, float, float>*/,
0/*::reduce::cols<signed char, double, double>*/
},
{
0/*reduceCols_gpu<unsigned short, int, unsigned char>*/,
0/*reduceCols_gpu<unsigned short, int, signed char>*/,
reduceCols_gpu<unsigned short, int, unsigned short>,
0/*reduceCols_gpu<unsigned short, int, short>*/,
reduceCols_gpu<unsigned short, int, int>,
reduceCols_gpu<unsigned short, int, float>
0/*::reduce::cols<unsigned short, int, unsigned char>*/,
0/*::reduce::cols<unsigned short, int, signed char>*/,
::reduce::cols<unsigned short, int, unsigned short>,
0/*::reduce::cols<unsigned short, int, short>*/,
::reduce::cols<unsigned short, int, int>,
::reduce::cols<unsigned short, float, float>,
::reduce::cols<unsigned short, double, double>
},
{
0/*reduceCols_gpu<short, int, unsigned char>*/,
0/*reduceCols_gpu<short, int, signed char>*/,
0/*reduceCols_gpu<short, int, unsigned short>*/,
reduceCols_gpu<short, int, short>,
reduceCols_gpu<short, int, int>,
reduceCols_gpu<short, int, float>
0/*::reduce::cols<short, int, unsigned char>*/,
0/*::reduce::cols<short, int, signed char>*/,
0/*::reduce::cols<short, int, unsigned short>*/,
::reduce::cols<short, int, short>,
::reduce::cols<short, int, int>,
::reduce::cols<short, float, float>,
::reduce::cols<short, double, double>
},
{
0/*reduceCols_gpu<int, int, unsigned char>*/,
0/*reduceCols_gpu<int, int, signed char>*/,
0/*reduceCols_gpu<int, int, unsigned short>*/,
0/*reduceCols_gpu<int, int, short>*/,
reduceCols_gpu<int, int, int>,
reduceCols_gpu<int, int, float>
0/*::reduce::cols<int, int, unsigned char>*/,
0/*::reduce::cols<int, int, signed char>*/,
0/*::reduce::cols<int, int, unsigned short>*/,
0/*::reduce::cols<int, int, short>*/,
::reduce::cols<int, int, int>,
::reduce::cols<int, float, float>,
::reduce::cols<int, double, double>
},
{
0/*reduceCols_gpu<float, unsigned char>*/,
0/*reduceCols_gpu<float, signed char>*/,
0/*reduceCols_gpu<float, unsigned short>*/,
0/*reduceCols_gpu<float, short>*/,
0/*reduceCols_gpu<float, int>*/,
reduceCols_gpu<float, float, float>
0/*::reduce::cols<float, float, unsigned char>*/,
0/*::reduce::cols<float, float, signed char>*/,
0/*::reduce::cols<float, float, unsigned short>*/,
0/*::reduce::cols<float, float, short>*/,
0/*::reduce::cols<float, float, int>*/,
::reduce::cols<float, float, float>,
::reduce::cols<float, double, double>
},
{
0/*::reduce::cols<double, double, unsigned char>*/,
0/*::reduce::cols<double, double, signed char>*/,
0/*::reduce::cols<double, double, unsigned short>*/,
0/*::reduce::cols<double, double, short>*/,
0/*::reduce::cols<double, double, int>*/,
0/*::reduce::cols<double, double, float>*/,
::reduce::cols<double, double, double>
}
};
const caller_t func = callers[src.depth()][dst.depth()];
const func_t func = funcs[src.depth()][dst.depth()];
if (!func)
CV_Error(CV_StsUnsupportedFormat, "Unsupported combination of input and output array formats");
func(src, src.channels(), dst, reduceOp, StreamAccessor::getStream(stream));
func(src, dst.data, src.channels(), reduceOp, StreamAccessor::getStream(stream));
}
}

View File

@@ -65,6 +65,8 @@
#include "NPP_staging/NPP_staging.hpp"
#include "NCVRuntimeTemplates.hpp"
#include "NCVHaarObjectDetection.hpp"
#include "opencv2/gpu/device/warp.hpp"
#include "opencv2/gpu/device/warp_shuffle.hpp"
//==============================================================================
@@ -81,6 +83,20 @@ NCV_CT_ASSERT(K_WARP_SIZE == 32); //this is required for the manual unroll of th
//assuming size <= WARP_SIZE and size is power of 2
__device__ Ncv32u warpScanInclusive(Ncv32u idata, volatile Ncv32u *s_Data)
{
#if __CUDA_ARCH__ >= 300
const unsigned int laneId = cv::gpu::device::Warp::laneId();
// scan on shuffl functions
#pragma unroll
for (int i = 1; i <= (K_WARP_SIZE / 2); i *= 2)
{
const Ncv32u n = cv::gpu::device::shfl_up(idata, i);
if (laneId >= i)
idata += n;
}
return idata;
#else
Ncv32u pos = 2 * threadIdx.x - (threadIdx.x & (K_WARP_SIZE - 1));
s_Data[pos] = 0;
pos += K_WARP_SIZE;
@@ -93,6 +109,7 @@ __device__ Ncv32u warpScanInclusive(Ncv32u idata, volatile Ncv32u *s_Data)
s_Data[pos] += s_Data[pos - 16];
return s_Data[pos];
#endif
}
__device__ __forceinline__ Ncv32u warpScanExclusive(Ncv32u idata, volatile Ncv32u *s_Data)
@@ -2317,4 +2334,4 @@ NCVStatus ncvHaarStoreNVBIN_host(const std::string &filename,
return NCV_SUCCESS;
}
#endif /* CUDA_DISABLER */
#endif /* CUDA_DISABLER */

View File

@@ -44,6 +44,8 @@
#include <vector>
#include <cuda_runtime.h>
#include "NPP_staging.hpp"
#include "opencv2/gpu/device/warp.hpp"
#include "opencv2/gpu/device/warp_shuffle.hpp"
texture<Ncv8u, 1, cudaReadModeElementType> tex8u;
@@ -90,6 +92,36 @@ NCV_CT_ASSERT(K_WARP_SIZE == 32); //this is required for the manual unroll of th
//assuming size <= WARP_SIZE and size is power of 2
template <class T>
inline __device__ T warpScanInclusive(T idata, volatile T *s_Data)
{
#if __CUDA_ARCH__ >= 300
const unsigned int laneId = cv::gpu::device::Warp::laneId();
// scan on shuffl functions
#pragma unroll
for (int i = 1; i <= (K_WARP_SIZE / 2); i *= 2)
{
const T n = cv::gpu::device::shfl_up(idata, i);
if (laneId >= i)
idata += n;
}
return idata;
#else
Ncv32u pos = 2 * threadIdx.x - (threadIdx.x & (K_WARP_SIZE - 1));
s_Data[pos] = 0;
pos += K_WARP_SIZE;
s_Data[pos] = idata;
s_Data[pos] += s_Data[pos - 1];
s_Data[pos] += s_Data[pos - 2];
s_Data[pos] += s_Data[pos - 4];
s_Data[pos] += s_Data[pos - 8];
s_Data[pos] += s_Data[pos - 16];
return s_Data[pos];
#endif
}
inline __device__ Ncv64u warpScanInclusive(Ncv64u idata, volatile Ncv64u *s_Data)
{
Ncv32u pos = 2 * threadIdx.x - (threadIdx.x & (K_WARP_SIZE - 1));
s_Data[pos] = 0;
@@ -2578,4 +2610,4 @@ NCVStatus nppiStResize_32f_C1R(const Ncv32f *pSrc,
return status;
}
#endif /* CUDA_DISABLER */
#endif /* CUDA_DISABLER */

View File

@@ -172,7 +172,7 @@ void cv::gpu::FarnebackOpticalFlow::updateFlow_boxFilter(
const GpuMat& R0, const GpuMat& R1, GpuMat& flowx, GpuMat &flowy,
GpuMat& M, GpuMat &bufM, int blockSize, bool updateMatrices, Stream streams[])
{
if (!isDeviceArch11_)
if (deviceSupports(FEATURE_SET_COMPUTE_12))
device::optflow_farneback::boxFilter5Gpu(M, blockSize/2, bufM, S(streams[0]));
else
device::optflow_farneback::boxFilter5Gpu_CC11(M, blockSize/2, bufM, S(streams[0]));
@@ -191,7 +191,7 @@ void cv::gpu::FarnebackOpticalFlow::updateFlow_gaussianBlur(
const GpuMat& R0, const GpuMat& R1, GpuMat& flowx, GpuMat& flowy,
GpuMat& M, GpuMat &bufM, int blockSize, bool updateMatrices, Stream streams[])
{
if (!isDeviceArch11_)
if (deviceSupports(FEATURE_SET_COMPUTE_12))
device::optflow_farneback::gaussianBlur5Gpu(
M, blockSize/2, bufM, BORDER_REPLICATE_GPU, S(streams[0]));
else
@@ -209,7 +209,7 @@ void cv::gpu::FarnebackOpticalFlow::updateFlow_gaussianBlur(
void cv::gpu::FarnebackOpticalFlow::operator ()(
const GpuMat &frame0, const GpuMat &frame1, GpuMat &flowx, GpuMat &flowy, Stream &s)
{
CV_Assert(frame0.type() == CV_8U && frame1.type() == CV_8U);
CV_Assert(frame0.channels() == 1 && frame1.channels() == 1);
CV_Assert(frame0.size() == frame1.size());
CV_Assert(polyN == 5 || polyN == 7);
CV_Assert(!fastPyramids || std::abs(pyrScale - 0.5) < 1e-6);

View File

@@ -53,35 +53,32 @@ void cv::gpu::PyrLKOpticalFlow::dense(const GpuMat&, const GpuMat&, GpuMat&, Gpu
#else /* !defined (HAVE_CUDA) */
namespace cv { namespace gpu { namespace device
namespace pyrlk
{
namespace pyrlk
{
void loadConstants(int2 winSize, int iters);
void loadConstants(int2 winSize, int iters);
void lkSparse1_gpu(PtrStepSzf I, PtrStepSzf J, const float2* prevPts, float2* nextPts, uchar* status, float* err, int ptcount,
int level, dim3 block, dim3 patch, cudaStream_t stream = 0);
void lkSparse4_gpu(PtrStepSz<float4> I, PtrStepSz<float4> J, const float2* prevPts, float2* nextPts, uchar* status, float* err, int ptcount,
int level, dim3 block, dim3 patch, cudaStream_t stream = 0);
void sparse1(PtrStepSzf I, PtrStepSzf J, const float2* prevPts, float2* nextPts, uchar* status, float* err, int ptcount,
int level, dim3 block, dim3 patch, cudaStream_t stream = 0);
void sparse4(PtrStepSz<float4> I, PtrStepSz<float4> J, const float2* prevPts, float2* nextPts, uchar* status, float* err, int ptcount,
int level, dim3 block, dim3 patch, cudaStream_t stream = 0);
void lkDense_gpu(PtrStepSzb I, PtrStepSzf J, PtrStepSzf u, PtrStepSzf v, PtrStepSzf prevU, PtrStepSzf prevV,
PtrStepSzf err, int2 winSize, cudaStream_t stream = 0);
}
}}}
void dense(PtrStepSzb I, PtrStepSzf J, PtrStepSzf u, PtrStepSzf v, PtrStepSzf prevU, PtrStepSzf prevV,
PtrStepSzf err, int2 winSize, cudaStream_t stream = 0);
}
namespace
{
void calcPatchSize(cv::Size winSize, dim3& block, dim3& patch, bool isDeviceArch11)
void calcPatchSize(cv::Size winSize, dim3& block, dim3& patch)
{
if (winSize.width > 32 && winSize.width > 2 * winSize.height)
{
block.x = isDeviceArch11 ? 16 : 32;
block.x = deviceSupports(FEATURE_SET_COMPUTE_12) ? 32 : 16;
block.y = 8;
}
else
{
block.x = 16;
block.y = isDeviceArch11 ? 8 : 16;
block.y = deviceSupports(FEATURE_SET_COMPUTE_12) ? 16 : 8;
}
patch.x = (winSize.width + block.x - 1) / block.x;
@@ -93,8 +90,6 @@ namespace
void cv::gpu::PyrLKOpticalFlow::sparse(const GpuMat& prevImg, const GpuMat& nextImg, const GpuMat& prevPts, GpuMat& nextPts, GpuMat& status, GpuMat* err)
{
using namespace cv::gpu::device::pyrlk;
if (prevPts.empty())
{
nextPts.release();
@@ -104,9 +99,9 @@ void cv::gpu::PyrLKOpticalFlow::sparse(const GpuMat& prevImg, const GpuMat& next
}
dim3 block, patch;
calcPatchSize(winSize, block, patch, isDeviceArch11_);
calcPatchSize(winSize, block, patch);
CV_Assert(prevImg.type() == CV_8UC1 || prevImg.type() == CV_8UC3 || prevImg.type() == CV_8UC4);
CV_Assert(prevImg.channels() == 1 || prevImg.channels() == 3 || prevImg.channels() == 4);
CV_Assert(prevImg.size() == nextImg.size() && prevImg.type() == nextImg.type());
CV_Assert(maxLevel >= 0);
CV_Assert(winSize.width > 2 && winSize.height > 2);
@@ -155,19 +150,19 @@ void cv::gpu::PyrLKOpticalFlow::sparse(const GpuMat& prevImg, const GpuMat& next
pyrDown(nextPyr_[level - 1], nextPyr_[level]);
}
loadConstants(make_int2(winSize.width, winSize.height), iters);
pyrlk::loadConstants(make_int2(winSize.width, winSize.height), iters);
for (int level = maxLevel; level >= 0; level--)
{
if (cn == 1)
{
lkSparse1_gpu(prevPyr_[level], nextPyr_[level],
pyrlk::sparse1(prevPyr_[level], nextPyr_[level],
prevPts.ptr<float2>(), nextPts.ptr<float2>(), status.ptr(), level == 0 && err ? err->ptr<float>() : 0, prevPts.cols,
level, block, patch);
}
else
{
lkSparse4_gpu(prevPyr_[level], nextPyr_[level],
pyrlk::sparse4(prevPyr_[level], nextPyr_[level],
prevPts.ptr<float2>(), nextPts.ptr<float2>(), status.ptr(), level == 0 && err ? err->ptr<float>() : 0, prevPts.cols,
level, block, patch);
}
@@ -176,8 +171,6 @@ void cv::gpu::PyrLKOpticalFlow::sparse(const GpuMat& prevImg, const GpuMat& next
void cv::gpu::PyrLKOpticalFlow::dense(const GpuMat& prevImg, const GpuMat& nextImg, GpuMat& u, GpuMat& v, GpuMat* err)
{
using namespace cv::gpu::device::pyrlk;
CV_Assert(prevImg.type() == CV_8UC1);
CV_Assert(prevImg.size() == nextImg.size() && prevImg.type() == nextImg.type());
CV_Assert(maxLevel >= 0);
@@ -211,7 +204,7 @@ void cv::gpu::PyrLKOpticalFlow::dense(const GpuMat& prevImg, const GpuMat& nextI
vPyr_[1].setTo(Scalar::all(0));
int2 winSize2i = make_int2(winSize.width, winSize.height);
loadConstants(winSize2i, iters);
pyrlk::loadConstants(winSize2i, iters);
PtrStepSzf derr = err ? *err : PtrStepSzf();
@@ -221,7 +214,7 @@ void cv::gpu::PyrLKOpticalFlow::dense(const GpuMat& prevImg, const GpuMat& nextI
{
int idx2 = (idx + 1) & 1;
lkDense_gpu(prevPyr_[level], nextPyr_[level], uPyr_[idx], vPyr_[idx], uPyr_[idx2], vPyr_[idx2],
pyrlk::dense(prevPyr_[level], nextPyr_[level], uPyr_[idx], vPyr_[idx], uPyr_[idx2], vPyr_[idx2],
level == 0 ? derr : PtrStepSzf(), winSize2i);
if (level > 0)

View File

@@ -54,7 +54,7 @@ namespace cv { namespace gpu { namespace device
{
template <typename T>
void remap_gpu(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst,
int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
}
}}}
@@ -63,7 +63,7 @@ void cv::gpu::remap(const GpuMat& src, GpuMat& dst, const GpuMat& xmap, const Gp
using namespace cv::gpu::device::imgproc;
typedef void (*func_t)(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation,
int borderMode, const float* borderValue, cudaStream_t stream, int cc);
int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
static const func_t funcs[6][4] =
{
@@ -91,15 +91,12 @@ void cv::gpu::remap(const GpuMat& src, GpuMat& dst, const GpuMat& xmap, const Gp
Scalar_<float> borderValueFloat;
borderValueFloat = borderValue;
DeviceInfo info;
int cc = info.majorVersion() * 10 + info.minorVersion();
Size wholeSize;
Point ofs;
src.locateROI(wholeSize, ofs);
func(src, PtrStepSzb(wholeSize.height, wholeSize.width, src.datastart, src.step), ofs.x, ofs.y, xmap, ymap,
dst, interpolation, gpuBorderType, borderValueFloat.val, StreamAccessor::getStream(stream), cc);
dst, interpolation, gpuBorderType, borderValueFloat.val, StreamAccessor::getStream(stream), deviceSupports(FEATURE_SET_COMPUTE_20));
}
#endif // HAVE_CUDA

View File

@@ -78,7 +78,7 @@ namespace
if (depth == CV_64F)
{
if (!TargetArchs::builtWith(NATIVE_DOUBLE) || !DeviceInfo().supports(NATIVE_DOUBLE))
if (!deviceSupports(NATIVE_DOUBLE))
CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double");
}
@@ -122,7 +122,7 @@ namespace
if (depth == CV_64F)
{
if (!TargetArchs::builtWith(NATIVE_DOUBLE) || !DeviceInfo().supports(NATIVE_DOUBLE))
if (!deviceSupports(NATIVE_DOUBLE))
CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double");
}

View File

@@ -86,8 +86,7 @@ namespace cv { namespace gpu { namespace device
void icvCalcOrientation_gpu(const float* featureX, const float* featureY, const float* featureSize, float* featureDir, int nFeatures);
void compute_descriptors_gpu(const PtrStepSzf& descriptors,
const float* featureX, const float* featureY, const float* featureSize, const float* featureDir, int nFeatures);
void compute_descriptors_gpu(PtrStepSz<float4> descriptors, const float* featureX, const float* featureY, const float* featureSize, const float* featureDir, int nFeatures);
}
}}}
@@ -122,9 +121,6 @@ namespace
CV_Assert(mask.empty() || (mask.size() == img.size() && mask.type() == CV_8UC1));
CV_Assert(surf_.nOctaves > 0 && surf_.nOctaveLayers > 0);
if (!TargetArchs::builtWith(GLOBAL_ATOMICS) || !DeviceInfo().supports(GLOBAL_ATOMICS))
CV_Error(CV_StsNotImplemented, "The device doesn't support global atomics");
const int min_size = calcSize(surf_.nOctaves - 1, 0);
CV_Assert(img_rows - min_size >= 0);
CV_Assert(img_cols - min_size >= 0);

View File

@@ -61,13 +61,13 @@ namespace cv { namespace gpu { namespace device
template <typename T>
void warpAffine_gpu(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation,
int borderMode, const float* borderValue, cudaStream_t stream, int cc);
int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
void buildWarpPerspectiveMaps_gpu(float coeffs[3 * 3], PtrStepSzf xmap, PtrStepSzf ymap, cudaStream_t stream);
template <typename T>
void warpPerspective_gpu(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation,
int borderMode, const float* borderValue, cudaStream_t stream, int cc);
int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
}
}}}
@@ -143,33 +143,31 @@ namespace
{
typedef typename NppWarpFunc<DEPTH>::npp_t npp_t;
static void call(const cv::gpu::GpuMat& src, cv::Size wholeSize, cv::Point ofs, cv::gpu::GpuMat& dst,
double coeffs[][3], cv::Size dsize, int interpolation, cudaStream_t stream)
static void call(const cv::gpu::GpuMat& src, cv::gpu::GpuMat& dst, double coeffs[][3], int interpolation, cudaStream_t stream)
{
static const int npp_inter[] = {NPPI_INTER_NN, NPPI_INTER_LINEAR, NPPI_INTER_CUBIC};
dst.create(dsize, src.type());
dst.setTo(cv::Scalar::all(0));
NppiSize srcsz;
srcsz.height = wholeSize.height;
srcsz.width = wholeSize.width;
srcsz.height = src.rows;
srcsz.width = src.cols;
NppiRect srcroi;
srcroi.x = ofs.x;
srcroi.y = ofs.y;
srcroi.x = 0;
srcroi.y = 0;
srcroi.height = src.rows;
srcroi.width = src.cols;
NppiRect dstroi;
dstroi.x = dstroi.y = 0;
dstroi.x = 0;
dstroi.y = 0;
dstroi.height = dst.rows;
dstroi.width = dst.cols;
cv::gpu::NppStreamHandler h(stream);
nppSafeCall( func((npp_t*)src.datastart, srcsz, static_cast<int>(src.step), srcroi,
dst.ptr<npp_t>(), static_cast<int>(dst.step), dstroi, coeffs, npp_inter[interpolation]) );
nppSafeCall( func(src.ptr<npp_t>(), srcsz, static_cast<int>(src.step), srcroi,
dst.ptr<npp_t>(), static_cast<int>(dst.step), dstroi,
coeffs, npp_inter[interpolation]) );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
@@ -187,6 +185,8 @@ void cv::gpu::warpAffine(const GpuMat& src, GpuMat& dst, const Mat& M, Size dsiz
CV_Assert(interpolation == INTER_NEAREST || interpolation == INTER_LINEAR || interpolation == INTER_CUBIC);
CV_Assert(borderMode == BORDER_REFLECT101 || borderMode == BORDER_REPLICATE || borderMode == BORDER_CONSTANT || borderMode == BORDER_REFLECT || borderMode == BORDER_WRAP);
dst.create(dsize, src.type());
Size wholeSize;
Point ofs;
src.locateROI(wholeSize, ofs);
@@ -231,8 +231,7 @@ void cv::gpu::warpAffine(const GpuMat& src, GpuMat& dst, const Mat& M, Size dsiz
}
};
bool useNpp = borderMode == BORDER_CONSTANT;
useNpp = useNpp && useNppTab[src.depth()][src.channels() - 1][interpolation];
bool useNpp = borderMode == BORDER_CONSTANT && ofs.x == 0 && ofs.y == 0 && useNppTab[src.depth()][src.channels() - 1][interpolation];
#ifdef linux
// NPP bug on float data
useNpp = useNpp && src.depth() != CV_32F;
@@ -240,7 +239,7 @@ void cv::gpu::warpAffine(const GpuMat& src, GpuMat& dst, const Mat& M, Size dsiz
if (useNpp)
{
typedef void (*func_t)(const cv::gpu::GpuMat& src, cv::Size wholeSize, cv::Point ofs, cv::gpu::GpuMat& dst, double coeffs[][3], cv::Size dsize, int flags, cudaStream_t stream);
typedef void (*func_t)(const cv::gpu::GpuMat& src, cv::gpu::GpuMat& dst, double coeffs[][3], int flags, cudaStream_t stream);
static const func_t funcs[2][6][4] =
{
@@ -262,6 +261,8 @@ void cv::gpu::warpAffine(const GpuMat& src, GpuMat& dst, const Mat& M, Size dsiz
}
};
dst.setTo(borderValue);
double coeffs[2][3];
Mat coeffsMat(2, 3, CV_64F, (void*)coeffs);
M.convertTo(coeffsMat, coeffsMat.type());
@@ -269,14 +270,14 @@ void cv::gpu::warpAffine(const GpuMat& src, GpuMat& dst, const Mat& M, Size dsiz
const func_t func = funcs[(flags & WARP_INVERSE_MAP) != 0][src.depth()][src.channels() - 1];
CV_Assert(func != 0);
func(src, wholeSize, ofs, dst, coeffs, dsize, interpolation, StreamAccessor::getStream(s));
func(src, dst, coeffs, interpolation, StreamAccessor::getStream(s));
}
else
{
using namespace cv::gpu::device::imgproc;
typedef void (*func_t)(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation,
int borderMode, const float* borderValue, cudaStream_t stream, int cc);
int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
static const func_t funcs[6][4] =
{
@@ -294,8 +295,6 @@ void cv::gpu::warpAffine(const GpuMat& src, GpuMat& dst, const Mat& M, Size dsiz
int gpuBorderType;
CV_Assert(tryConvertToGpuBorderType(borderMode, gpuBorderType));
dst.create(dsize, src.type());
float coeffs[2 * 3];
Mat coeffsMat(2, 3, CV_32F, (void*)coeffs);
@@ -311,11 +310,8 @@ void cv::gpu::warpAffine(const GpuMat& src, GpuMat& dst, const Mat& M, Size dsiz
Scalar_<float> borderValueFloat;
borderValueFloat = borderValue;
DeviceInfo info;
int cc = info.majorVersion() * 10 + info.minorVersion();
func(src, PtrStepSzb(wholeSize.height, wholeSize.width, src.datastart, src.step), ofs.x, ofs.y, coeffs,
dst, interpolation, gpuBorderType, borderValueFloat.val, StreamAccessor::getStream(s), cc);
dst, interpolation, gpuBorderType, borderValueFloat.val, StreamAccessor::getStream(s), deviceSupports(FEATURE_SET_COMPUTE_20));
}
}
@@ -329,6 +325,8 @@ void cv::gpu::warpPerspective(const GpuMat& src, GpuMat& dst, const Mat& M, Size
CV_Assert(interpolation == INTER_NEAREST || interpolation == INTER_LINEAR || interpolation == INTER_CUBIC);
CV_Assert(borderMode == BORDER_REFLECT101 || borderMode == BORDER_REPLICATE || borderMode == BORDER_CONSTANT || borderMode == BORDER_REFLECT || borderMode == BORDER_WRAP);
dst.create(dsize, src.type());
Size wholeSize;
Point ofs;
src.locateROI(wholeSize, ofs);
@@ -373,8 +371,7 @@ void cv::gpu::warpPerspective(const GpuMat& src, GpuMat& dst, const Mat& M, Size
}
};
bool useNpp = borderMode == BORDER_CONSTANT;
useNpp = useNpp && useNppTab[src.depth()][src.channels() - 1][interpolation];
bool useNpp = borderMode == BORDER_CONSTANT && ofs.x == 0 && ofs.y == 0 && useNppTab[src.depth()][src.channels() - 1][interpolation];
#ifdef linux
// NPP bug on float data
useNpp = useNpp && src.depth() != CV_32F;
@@ -382,7 +379,7 @@ void cv::gpu::warpPerspective(const GpuMat& src, GpuMat& dst, const Mat& M, Size
if (useNpp)
{
typedef void (*func_t)(const cv::gpu::GpuMat& src, cv::Size wholeSize, cv::Point ofs, cv::gpu::GpuMat& dst, double coeffs[][3], cv::Size dsize, int flags, cudaStream_t stream);
typedef void (*func_t)(const cv::gpu::GpuMat& src, cv::gpu::GpuMat& dst, double coeffs[][3], int flags, cudaStream_t stream);
static const func_t funcs[2][6][4] =
{
@@ -404,6 +401,8 @@ void cv::gpu::warpPerspective(const GpuMat& src, GpuMat& dst, const Mat& M, Size
}
};
dst.setTo(borderValue);
double coeffs[3][3];
Mat coeffsMat(3, 3, CV_64F, (void*)coeffs);
M.convertTo(coeffsMat, coeffsMat.type());
@@ -411,14 +410,14 @@ void cv::gpu::warpPerspective(const GpuMat& src, GpuMat& dst, const Mat& M, Size
const func_t func = funcs[(flags & WARP_INVERSE_MAP) != 0][src.depth()][src.channels() - 1];
CV_Assert(func != 0);
func(src, wholeSize, ofs, dst, coeffs, dsize, interpolation, StreamAccessor::getStream(s));
func(src, dst, coeffs, interpolation, StreamAccessor::getStream(s));
}
else
{
using namespace cv::gpu::device::imgproc;
typedef void (*func_t)(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation,
int borderMode, const float* borderValue, cudaStream_t stream, int cc);
int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
static const func_t funcs[6][4] =
{
@@ -436,8 +435,6 @@ void cv::gpu::warpPerspective(const GpuMat& src, GpuMat& dst, const Mat& M, Size
int gpuBorderType;
CV_Assert(tryConvertToGpuBorderType(borderMode, gpuBorderType));
dst.create(dsize, src.type());
float coeffs[3 * 3];
Mat coeffsMat(3, 3, CV_32F, (void*)coeffs);
@@ -453,11 +450,8 @@ void cv::gpu::warpPerspective(const GpuMat& src, GpuMat& dst, const Mat& M, Size
Scalar_<float> borderValueFloat;
borderValueFloat = borderValue;
DeviceInfo info;
int cc = info.majorVersion() * 10 + info.minorVersion();
func(src, PtrStepSzb(wholeSize.height, wholeSize.width, src.datastart, src.step), ofs.x, ofs.y, coeffs,
dst, interpolation, gpuBorderType, borderValueFloat.val, StreamAccessor::getStream(s), cc);
dst, interpolation, gpuBorderType, borderValueFloat.val, StreamAccessor::getStream(s), deviceSupports(FEATURE_SET_COMPUTE_20));
}
}