Modified according to the CUDA 4.0 API updates

Vladislav Vinogradov
2011-05-31 08:31:10 +00:00
parent 98d663e7e0
commit 926a6bba00
40 changed files with 1134 additions and 1818 deletions
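
Throughout the diff one pattern recurs: every caller gains a trailing cudaStream_t parameter, kernels are launched on that stream via <<<grid, threads, 0, stream>>>, and the blocking cudaThreadSynchronize() (deprecated in CUDA 4.0) becomes cudaDeviceSynchronize(), executed only when the caller runs on the default stream. A minimal sketch of the resulting caller shape, with a hypothetical kernel and a simplified stand-in for OpenCV's cudaSafeCall:

#include <cstdio>
#include <cuda_runtime.h>

// Simplified stand-in for OpenCV's cudaSafeCall error check.
#define cudaSafeCall(expr) do {                                     \
        cudaError_t err_ = (expr);                                  \
        if (err_ != cudaSuccess)                                    \
            fprintf(stderr, "CUDA error: %s (%s:%d)\n",             \
                    cudaGetErrorString(err_), __FILE__, __LINE__);  \
    } while (0)

__global__ void scaleKernel(float* data, int n, float s)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n)
        data[i] *= s;
}

// Post-CUDA-4.0 caller shape used throughout this commit: launch on the
// supplied stream, check the launch, and block only for the default stream.
void scaleCaller(float* d_data, int n, float s, cudaStream_t stream)
{
    dim3 threads(256);
    dim3 grid((n + threads.x - 1) / threads.x);

    scaleKernel<<<grid, threads, 0, stream>>>(d_data, n, s);
    cudaSafeCall( cudaGetLastError() );          // launch-time errors

    if (stream == 0)
        cudaSafeCall( cudaDeviceSynchronize() ); // execution errors, blocking
}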


@@ -68,19 +68,22 @@ namespace cv { namespace gpu
template <typename T>
void blendLinearCaller(int rows, int cols, int cn, const PtrStep_<T> img1, const PtrStep_<T> img2,
const PtrStepf weights1, const PtrStepf weights2, PtrStep_<T> result)
const PtrStepf weights1, const PtrStepf weights2, PtrStep_<T> result, cudaStream_t stream)
{
dim3 threads(16, 16);
dim3 grid(divUp(cols * cn, threads.x), divUp(rows, threads.y));
blendLinearKernel<<<grid, threads>>>(rows, cols * cn, cn, img1, img2, weights1, weights2, result);
cudaSafeCall(cudaThreadSynchronize());
blendLinearKernel<<<grid, threads, 0, stream>>>(rows, cols * cn, cn, img1, img2, weights1, weights2, result);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall(cudaDeviceSynchronize());
}
template void blendLinearCaller<uchar>(int, int, int, const PtrStep, const PtrStep,
const PtrStepf, const PtrStepf, PtrStep);
const PtrStepf, const PtrStepf, PtrStep, cudaStream_t stream);
template void blendLinearCaller<float>(int, int, int, const PtrStepf, const PtrStepf,
const PtrStepf, const PtrStepf, PtrStepf);
const PtrStepf, const PtrStepf, PtrStepf, cudaStream_t stream);
__global__ void blendLinearKernel8UC4(int rows, int cols, const PtrStep img1, const PtrStep img2,
@@ -105,13 +108,16 @@ namespace cv { namespace gpu
void blendLinearCaller8UC4(int rows, int cols, const PtrStep img1, const PtrStep img2,
const PtrStepf weights1, const PtrStepf weights2, PtrStep result)
const PtrStepf weights1, const PtrStepf weights2, PtrStep result, cudaStream_t stream)
{
dim3 threads(16, 16);
dim3 grid(divUp(cols, threads.x), divUp(rows, threads.y));
blendLinearKernel8UC4<<<grid, threads>>>(rows, cols, img1, img2, weights1, weights2, result);
cudaSafeCall(cudaThreadSynchronize());
blendLinearKernel8UC4<<<grid, threads, 0, stream>>>(rows, cols, img1, img2, weights1, weights2, result);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall(cudaDeviceSynchronize());
}
}}
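
The launch configurations in this file round the grid size up with OpenCV's divUp helper; an equivalent one-liner (the real definition lives in the module's internal headers):

// Ceiling division: number of blocks needed to cover `total` work items
// at `grain` items per block, as in divUp(cols * cn, threads.x) above.
static inline int divUp(int total, int grain)
{
    return (total + grain - 1) / grain;
}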


@@ -589,7 +589,7 @@ namespace cv { namespace gpu { namespace bfmatcher
template <int BLOCK_DIM_X, int BLOCK_DIM_Y, typename Dist, typename T, typename Train, typename Mask>
void matchSimple_caller(const DevMem2D_<T>& queryDescs, const Train& train,
const Mask& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance)
const Mask& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, cudaStream_t stream)
{
StaticAssert<BLOCK_DIM_Y <= 64>::check(); // BLOCK_DIM_Y values must be reducible by a single warp
@@ -597,14 +597,15 @@ namespace cv { namespace gpu { namespace bfmatcher
dim3 threads(BLOCK_DIM_X, BLOCK_DIM_Y, 1);
match<BLOCK_DIM_X, BLOCK_DIM_Y, ReduceDescCalculatorSimple<BLOCK_DIM_X, T>, Dist, T>
<<<grid, threads>>>(queryDescs, train, mask, trainIdx.data, imgIdx.data, distance.data);
<<<grid, threads, 0, stream>>>(queryDescs, train, mask, trainIdx.data, imgIdx.data, distance.data);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaThreadSynchronize() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
template <int BLOCK_DIM_X, int BLOCK_DIM_Y, int MAX_DESCRIPTORS_LEN, bool DESC_LEN_EQ_MAX_LEN, typename Dist, typename T, typename Train, typename Mask>
void matchCached_caller(const DevMem2D_<T>& queryDescs, const Train& train,
const Mask& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance)
const Mask& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, cudaStream_t stream)
{
StaticAssert<BLOCK_DIM_Y <= 64>::check(); // BLOCK_DIM_Y values must be reducible by a single warp
StaticAssert<BLOCK_DIM_X * BLOCK_DIM_Y >= MAX_DESCRIPTORS_LEN>::check(); // block size must be at least the descriptor length
@@ -614,10 +615,11 @@ namespace cv { namespace gpu { namespace bfmatcher
dim3 threads(BLOCK_DIM_X, BLOCK_DIM_Y, 1);
match<BLOCK_DIM_X, BLOCK_DIM_Y, ReduceDescCalculatorCached<BLOCK_DIM_X, MAX_DESCRIPTORS_LEN, DESC_LEN_EQ_MAX_LEN, T, typename Dist::ValueType>, Dist, T>
<<<grid, threads>>>(queryDescs, train, mask, trainIdx.data, imgIdx.data, distance.data);
<<<grid, threads, 0, stream>>>(queryDescs, train, mask, trainIdx.data, imgIdx.data, distance.data);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaThreadSynchronize() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
///////////////////////////////////////////////////////////////////////////////
@@ -626,167 +628,165 @@ namespace cv { namespace gpu { namespace bfmatcher
template <typename Dist, typename T, typename Train, typename Mask>
void matchDispatcher(const DevMem2D_<T>& queryDescs, const Train& train,
const Mask& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance,
bool cc_12)
bool cc_12, cudaStream_t stream)
{
if (queryDescs.cols < 64)
matchCached_caller<16, 16, 64, false, Dist>(queryDescs, train, mask, trainIdx, imgIdx, distance);
matchCached_caller<16, 16, 64, false, Dist>(queryDescs, train, mask, trainIdx, imgIdx, distance, stream);
else if (queryDescs.cols == 64)
matchCached_caller<16, 16, 64, true, Dist>(queryDescs, train, mask, trainIdx, imgIdx, distance);
matchCached_caller<16, 16, 64, true, Dist>(queryDescs, train, mask, trainIdx, imgIdx, distance, stream);
else if (queryDescs.cols < 128)
matchCached_caller<16, 16, 128, false, Dist>(queryDescs, train, mask, trainIdx, imgIdx, distance);
matchCached_caller<16, 16, 128, false, Dist>(queryDescs, train, mask, trainIdx, imgIdx, distance, stream);
else if (queryDescs.cols == 128)
matchCached_caller<16, 16, 128, true, Dist>(queryDescs, train, mask, trainIdx, imgIdx, distance);
matchCached_caller<16, 16, 128, true, Dist>(queryDescs, train, mask, trainIdx, imgIdx, distance, stream);
else if (queryDescs.cols < 256)
matchCached_caller<16, 16, 256, false, Dist>(queryDescs, train, mask, trainIdx, imgIdx, distance);
matchCached_caller<16, 16, 256, false, Dist>(queryDescs, train, mask, trainIdx, imgIdx, distance, stream);
else if (queryDescs.cols == 256 && cc_12)
matchCached_caller<16, 16, 256, true, Dist>(queryDescs, train, mask, trainIdx, imgIdx, distance);
matchCached_caller<16, 16, 256, true, Dist>(queryDescs, train, mask, trainIdx, imgIdx, distance, stream);
else
matchSimple_caller<16, 16, Dist>(queryDescs, train, mask, trainIdx, imgIdx, distance);
cudaSafeCall( cudaThreadSynchronize() );
matchSimple_caller<16, 16, Dist>(queryDescs, train, mask, trainIdx, imgIdx, distance, stream);
}
template <typename T>
void matchSingleL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs,
const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance,
bool cc_12)
bool cc_12, cudaStream_t stream)
{
SingleTrain<T> train((DevMem2D_<T>)trainDescs);
if (mask.data)
{
SingleMask m(mask);
matchDispatcher< L1Dist<T> >((DevMem2D_<T>)queryDescs, train, m, trainIdx, imgIdx, distance, cc_12);
matchDispatcher< L1Dist<T> >((DevMem2D_<T>)queryDescs, train, m, trainIdx, imgIdx, distance, cc_12, stream);
}
else
{
matchDispatcher< L1Dist<T> >((DevMem2D_<T>)queryDescs, train, WithOutMask(), trainIdx, imgIdx, distance, cc_12);
matchDispatcher< L1Dist<T> >((DevMem2D_<T>)queryDescs, train, WithOutMask(), trainIdx, imgIdx, distance, cc_12, stream);
}
}
template void matchSingleL1_gpu<uchar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12);
template void matchSingleL1_gpu<schar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12);
template void matchSingleL1_gpu<ushort>(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12);
template void matchSingleL1_gpu<short >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12);
template void matchSingleL1_gpu<int >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12);
template void matchSingleL1_gpu<float >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12);
template void matchSingleL1_gpu<uchar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12, cudaStream_t stream);
template void matchSingleL1_gpu<schar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12, cudaStream_t stream);
template void matchSingleL1_gpu<ushort>(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12, cudaStream_t stream);
template void matchSingleL1_gpu<short >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12, cudaStream_t stream);
template void matchSingleL1_gpu<int >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12, cudaStream_t stream);
template void matchSingleL1_gpu<float >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12, cudaStream_t stream);
template <typename T>
void matchSingleL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs,
const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance,
bool cc_12)
bool cc_12, cudaStream_t stream)
{
SingleTrain<T> train((DevMem2D_<T>)trainDescs);
if (mask.data)
{
SingleMask m(mask);
matchDispatcher<L2Dist>((DevMem2D_<T>)queryDescs, train, m, trainIdx, imgIdx, distance, cc_12);
matchDispatcher<L2Dist>((DevMem2D_<T>)queryDescs, train, m, trainIdx, imgIdx, distance, cc_12, stream);
}
else
{
matchDispatcher<L2Dist>((DevMem2D_<T>)queryDescs, train, WithOutMask(), trainIdx, imgIdx, distance, cc_12);
matchDispatcher<L2Dist>((DevMem2D_<T>)queryDescs, train, WithOutMask(), trainIdx, imgIdx, distance, cc_12, stream);
}
}
template void matchSingleL2_gpu<uchar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12);
template void matchSingleL2_gpu<schar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12);
template void matchSingleL2_gpu<ushort>(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12);
template void matchSingleL2_gpu<short >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12);
template void matchSingleL2_gpu<int >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12);
template void matchSingleL2_gpu<float >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12);
template void matchSingleL2_gpu<uchar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12, cudaStream_t stream);
template void matchSingleL2_gpu<schar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12, cudaStream_t stream);
template void matchSingleL2_gpu<ushort>(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12, cudaStream_t stream);
template void matchSingleL2_gpu<short >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12, cudaStream_t stream);
template void matchSingleL2_gpu<int >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12, cudaStream_t stream);
template void matchSingleL2_gpu<float >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12, cudaStream_t stream);
template <typename T>
void matchSingleHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs,
const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance,
bool cc_12)
bool cc_12, cudaStream_t stream)
{
SingleTrain<T> train((DevMem2D_<T>)trainDescs);
if (mask.data)
{
SingleMask m(mask);
matchDispatcher<HammingDist>((DevMem2D_<T>)queryDescs, train, m, trainIdx, imgIdx, distance, cc_12);
matchDispatcher<HammingDist>((DevMem2D_<T>)queryDescs, train, m, trainIdx, imgIdx, distance, cc_12, stream);
}
else
{
matchDispatcher<HammingDist>((DevMem2D_<T>)queryDescs, train, WithOutMask(), trainIdx, imgIdx, distance, cc_12);
matchDispatcher<HammingDist>((DevMem2D_<T>)queryDescs, train, WithOutMask(), trainIdx, imgIdx, distance, cc_12, stream);
}
}
template void matchSingleHamming_gpu<uchar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12);
template void matchSingleHamming_gpu<schar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12);
template void matchSingleHamming_gpu<ushort>(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12);
template void matchSingleHamming_gpu<short >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12);
template void matchSingleHamming_gpu<int >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12);
template void matchSingleHamming_gpu<uchar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12, cudaStream_t stream);
template void matchSingleHamming_gpu<schar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12, cudaStream_t stream);
template void matchSingleHamming_gpu<ushort>(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12, cudaStream_t stream);
template void matchSingleHamming_gpu<short >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12, cudaStream_t stream);
template void matchSingleHamming_gpu<int >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12, cudaStream_t stream);
template <typename T>
void matchCollectionL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainCollection,
const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx,
const DevMem2Df& distance, bool cc_12)
const DevMem2Df& distance, bool cc_12, cudaStream_t stream)
{
TrainCollection<T> train((DevMem2D_<T>*)trainCollection.ptr(), trainCollection.cols, queryDescs.cols);
if (maskCollection.data)
{
MaskCollection mask(maskCollection.data);
matchDispatcher< L1Dist<T> >((DevMem2D_<T>)queryDescs, train, mask, trainIdx, imgIdx, distance, cc_12);
matchDispatcher< L1Dist<T> >((DevMem2D_<T>)queryDescs, train, mask, trainIdx, imgIdx, distance, cc_12, stream);
}
else
{
matchDispatcher< L1Dist<T> >((DevMem2D_<T>)queryDescs, train, WithOutMask(), trainIdx, imgIdx, distance, cc_12);
matchDispatcher< L1Dist<T> >((DevMem2D_<T>)queryDescs, train, WithOutMask(), trainIdx, imgIdx, distance, cc_12, stream);
}
}
template void matchCollectionL1_gpu<uchar >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12);
template void matchCollectionL1_gpu<schar >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12);
template void matchCollectionL1_gpu<ushort>(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12);
template void matchCollectionL1_gpu<short >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12);
template void matchCollectionL1_gpu<int >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12);
template void matchCollectionL1_gpu<float >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12);
template void matchCollectionL1_gpu<uchar >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12, cudaStream_t stream);
template void matchCollectionL1_gpu<schar >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12, cudaStream_t stream);
template void matchCollectionL1_gpu<ushort>(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12, cudaStream_t stream);
template void matchCollectionL1_gpu<short >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12, cudaStream_t stream);
template void matchCollectionL1_gpu<int >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12, cudaStream_t stream);
template void matchCollectionL1_gpu<float >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12, cudaStream_t stream);
template <typename T>
void matchCollectionL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainCollection,
const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx,
const DevMem2Df& distance, bool cc_12)
const DevMem2Df& distance, bool cc_12, cudaStream_t stream)
{
TrainCollection<T> train((DevMem2D_<T>*)trainCollection.ptr(), trainCollection.cols, queryDescs.cols);
if (maskCollection.data)
{
MaskCollection mask(maskCollection.data);
matchDispatcher<L2Dist>((DevMem2D_<T>)queryDescs, train, mask, trainIdx, imgIdx, distance, cc_12);
matchDispatcher<L2Dist>((DevMem2D_<T>)queryDescs, train, mask, trainIdx, imgIdx, distance, cc_12, stream);
}
else
{
matchDispatcher<L2Dist>((DevMem2D_<T>)queryDescs, train, WithOutMask(), trainIdx, imgIdx, distance, cc_12);
matchDispatcher<L2Dist>((DevMem2D_<T>)queryDescs, train, WithOutMask(), trainIdx, imgIdx, distance, cc_12, stream);
}
}
template void matchCollectionL2_gpu<uchar >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12);
template void matchCollectionL2_gpu<schar >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12);
template void matchCollectionL2_gpu<ushort>(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12);
template void matchCollectionL2_gpu<short >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12);
template void matchCollectionL2_gpu<int >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12);
template void matchCollectionL2_gpu<float >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12);
template void matchCollectionL2_gpu<uchar >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12, cudaStream_t stream);
template void matchCollectionL2_gpu<schar >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12, cudaStream_t stream);
template void matchCollectionL2_gpu<ushort>(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12, cudaStream_t stream);
template void matchCollectionL2_gpu<short >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12, cudaStream_t stream);
template void matchCollectionL2_gpu<int >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12, cudaStream_t stream);
template void matchCollectionL2_gpu<float >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12, cudaStream_t stream);
template <typename T>
void matchCollectionHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainCollection,
const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx,
const DevMem2Df& distance, bool cc_12)
const DevMem2Df& distance, bool cc_12, cudaStream_t stream)
{
TrainCollection<T> train((DevMem2D_<T>*)trainCollection.ptr(), trainCollection.cols, queryDescs.cols);
if (maskCollection.data)
{
MaskCollection mask(maskCollection.data);
matchDispatcher<HammingDist>((DevMem2D_<T>)queryDescs, train, mask, trainIdx, imgIdx, distance, cc_12);
matchDispatcher<HammingDist>((DevMem2D_<T>)queryDescs, train, mask, trainIdx, imgIdx, distance, cc_12, stream);
}
else
{
matchDispatcher<HammingDist>((DevMem2D_<T>)queryDescs, train, WithOutMask(), trainIdx, imgIdx, distance, cc_12);
matchDispatcher<HammingDist>((DevMem2D_<T>)queryDescs, train, WithOutMask(), trainIdx, imgIdx, distance, cc_12, stream);
}
}
template void matchCollectionHamming_gpu<uchar >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12);
template void matchCollectionHamming_gpu<schar >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12);
template void matchCollectionHamming_gpu<ushort>(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12);
template void matchCollectionHamming_gpu<short >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12);
template void matchCollectionHamming_gpu<int >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12);
template void matchCollectionHamming_gpu<uchar >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12, cudaStream_t stream);
template void matchCollectionHamming_gpu<schar >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12, cudaStream_t stream);
template void matchCollectionHamming_gpu<ushort>(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12, cudaStream_t stream);
template void matchCollectionHamming_gpu<short >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12, cudaStream_t stream);
template void matchCollectionHamming_gpu<int >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12, cudaStream_t stream);
///////////////////////////////////////////////////////////////////////////////////
//////////////////////////////////// Knn Match ////////////////////////////////////
@@ -833,16 +833,17 @@ namespace cv { namespace gpu { namespace bfmatcher
template <int BLOCK_DIM_X, int BLOCK_DIM_Y, typename Dist, typename T, typename Mask>
void calcDistance_caller(const DevMem2D_<T>& queryDescs, const DevMem2D_<T>& trainDescs,
const Mask& mask, const DevMem2Df& distance)
const Mask& mask, const DevMem2Df& distance, cudaStream_t stream)
{
dim3 threads(BLOCK_DIM_X, BLOCK_DIM_Y, 1);
dim3 grid(queryDescs.rows, divUp(trainDescs.rows, BLOCK_DIM_Y), 1);
calcDistance<BLOCK_DIM_X, BLOCK_DIM_Y, Dist, T><<<grid, threads>>>(
calcDistance<BLOCK_DIM_X, BLOCK_DIM_Y, Dist, T><<<grid, threads, 0, stream>>>(
queryDescs, trainDescs, mask, distance);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaThreadSynchronize() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
///////////////////////////////////////////////////////////////////////////////
@@ -1010,105 +1011,106 @@ namespace cv { namespace gpu { namespace bfmatcher
// find knn match kernel caller
template <int BLOCK_SIZE>
void findKnnMatch_caller(int knn, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist)
void findKnnMatch_caller(int knn, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist, cudaStream_t stream)
{
dim3 threads(BLOCK_SIZE, 1, 1);
dim3 grid(trainIdx.rows, 1, 1);
for (int i = 0; i < knn; ++i)
{
findBestMatch<BLOCK_SIZE><<<grid, threads>>>(allDist, i, trainIdx, distance);
findBestMatch<BLOCK_SIZE><<<grid, threads, 0, stream>>>(allDist, i, trainIdx, distance);
cudaSafeCall( cudaGetLastError() );
}
cudaSafeCall( cudaThreadSynchronize() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
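
findKnnMatch_caller above issues one findBestMatch launch per neighbour on the same stream and fences once after the loop; launches on a single stream execute in issue order, so each pass can safely read what the previous one wrote. A minimal sketch of that shape, assuming a placeholder kernel:

#include <cuda_runtime.h>

// Placeholder body standing in for one findBestMatch pass.
__global__ void passKernel(float* allDist, int pass, int* outIdx)
{
    if (blockIdx.x == 0 && threadIdx.x == 0)
        outIdx[pass] = pass;   // toy result: one value per pass
}

void findKBest(float* d_allDist, int* d_outIdx, int knn, cudaStream_t stream)
{
    dim3 threads(256), grid(1);
    for (int i = 0; i < knn; ++i)
    {
        // Same-stream launches are serialized, so pass i can safely
        // consume whatever pass i-1 produced.
        passKernel<<<grid, threads, 0, stream>>>(d_allDist, i, d_outIdx);
        cudaGetLastError();    // per-launch check (error handling elided)
    }
    if (stream == 0)
        cudaDeviceSynchronize();   // one fence after all passes
}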
///////////////////////////////////////////////////////////////////////////////
// knn match caller
template <typename Dist, typename T, typename Mask>
void calcDistanceDispatcher(const DevMem2D_<T>& queryDescs, const DevMem2D_<T>& trainDescs, const Mask& mask, const DevMem2Df& allDist)
void calcDistanceDispatcher(const DevMem2D_<T>& queryDescs, const DevMem2D_<T>& trainDescs, const Mask& mask, const DevMem2Df& allDist, cudaStream_t stream)
{
calcDistance_caller<16, 16, Dist>(queryDescs, trainDescs, mask, allDist);
calcDistance_caller<16, 16, Dist>(queryDescs, trainDescs, mask, allDist, stream);
}
void findKnnMatchDispatcher(int knn, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist)
void findKnnMatchDispatcher(int knn, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist, cudaStream_t stream)
{
findKnnMatch_caller<256>(knn, trainIdx, distance, allDist);
findKnnMatch_caller<256>(knn, trainIdx, distance, allDist, stream);
}
template <typename T>
void knnMatchL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn,
const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist)
const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist, cudaStream_t stream)
{
if (mask.data)
{
calcDistanceDispatcher< L1Dist<T> >((DevMem2D_<T>)queryDescs, (DevMem2D_<T>)trainDescs, SingleMask(mask), allDist);
calcDistanceDispatcher< L1Dist<T> >((DevMem2D_<T>)queryDescs, (DevMem2D_<T>)trainDescs, SingleMask(mask), allDist, stream);
}
else
{
calcDistanceDispatcher< L1Dist<T> >((DevMem2D_<T>)queryDescs, (DevMem2D_<T>)trainDescs, WithOutMask(), allDist);
calcDistanceDispatcher< L1Dist<T> >((DevMem2D_<T>)queryDescs, (DevMem2D_<T>)trainDescs, WithOutMask(), allDist, stream);
}
findKnnMatchDispatcher(knn, trainIdx, distance, allDist);
findKnnMatchDispatcher(knn, trainIdx, distance, allDist, stream);
}
template void knnMatchL1_gpu<uchar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist);
template void knnMatchL1_gpu<schar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist);
template void knnMatchL1_gpu<ushort>(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist);
template void knnMatchL1_gpu<short >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist);
template void knnMatchL1_gpu<int >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist);
template void knnMatchL1_gpu<float >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist);
template void knnMatchL1_gpu<uchar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist, cudaStream_t stream);
template void knnMatchL1_gpu<schar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist, cudaStream_t stream);
template void knnMatchL1_gpu<ushort>(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist, cudaStream_t stream);
template void knnMatchL1_gpu<short >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist, cudaStream_t stream);
template void knnMatchL1_gpu<int >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist, cudaStream_t stream);
template void knnMatchL1_gpu<float >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist, cudaStream_t stream);
template <typename T>
void knnMatchL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn,
const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist)
const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist, cudaStream_t stream)
{
if (mask.data)
{
calcDistanceDispatcher<L2Dist>((DevMem2D_<T>)queryDescs, (DevMem2D_<T>)trainDescs,
SingleMask(mask), allDist);
SingleMask(mask), allDist, stream);
}
else
{
calcDistanceDispatcher<L2Dist>((DevMem2D_<T>)queryDescs, (DevMem2D_<T>)trainDescs,
WithOutMask(), allDist);
WithOutMask(), allDist, stream);
}
findKnnMatchDispatcher(knn, trainIdx, distance, allDist);
findKnnMatchDispatcher(knn, trainIdx, distance, allDist, stream);
}
template void knnMatchL2_gpu<uchar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist);
template void knnMatchL2_gpu<schar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist);
template void knnMatchL2_gpu<ushort>(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist);
template void knnMatchL2_gpu<short >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist);
template void knnMatchL2_gpu<int >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist);
template void knnMatchL2_gpu<float >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist);
template void knnMatchL2_gpu<uchar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist, cudaStream_t stream);
template void knnMatchL2_gpu<schar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist, cudaStream_t stream);
template void knnMatchL2_gpu<ushort>(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist, cudaStream_t stream);
template void knnMatchL2_gpu<short >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist, cudaStream_t stream);
template void knnMatchL2_gpu<int >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist, cudaStream_t stream);
template void knnMatchL2_gpu<float >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist, cudaStream_t stream);
template <typename T>
void knnMatchHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn,
const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist)
const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist, cudaStream_t stream)
{
if (mask.data)
{
calcDistanceDispatcher<HammingDist>((DevMem2D_<T>)queryDescs, (DevMem2D_<T>)trainDescs,
SingleMask(mask), allDist);
SingleMask(mask), allDist, stream);
}
else
{
calcDistanceDispatcher<HammingDist>((DevMem2D_<T>)queryDescs, (DevMem2D_<T>)trainDescs,
WithOutMask(), allDist);
WithOutMask(), allDist, stream);
}
findKnnMatchDispatcher(knn, trainIdx, distance, allDist);
findKnnMatchDispatcher(knn, trainIdx, distance, allDist, stream);
}
template void knnMatchHamming_gpu<uchar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist);
template void knnMatchHamming_gpu<schar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist);
template void knnMatchHamming_gpu<ushort>(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist);
template void knnMatchHamming_gpu<short >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist);
template void knnMatchHamming_gpu<int >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist);
template void knnMatchHamming_gpu<uchar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist, cudaStream_t stream);
template void knnMatchHamming_gpu<schar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist, cudaStream_t stream);
template void knnMatchHamming_gpu<ushort>(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist, cudaStream_t stream);
template void knnMatchHamming_gpu<short >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist, cudaStream_t stream);
template void knnMatchHamming_gpu<int >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist, cudaStream_t stream);
///////////////////////////////////////////////////////////////////////////////////
/////////////////////////////////// Radius Match //////////////////////////////////
@@ -1166,16 +1168,17 @@ namespace cv { namespace gpu { namespace bfmatcher
template <int BLOCK_DIM_X, int BLOCK_DIM_Y, typename Dist, typename T, typename Mask>
void radiusMatch_caller(const DevMem2D_<T>& queryDescs, const DevMem2D_<T>& trainDescs,
float maxDistance, const Mask& mask, const DevMem2Di& trainIdx, unsigned int* nMatches,
const DevMem2Df& distance)
const DevMem2Df& distance, cudaStream_t stream)
{
dim3 threads(BLOCK_DIM_X, BLOCK_DIM_Y, 1);
dim3 grid(queryDescs.rows, divUp(trainDescs.rows, BLOCK_DIM_Y), 1);
radiusMatch<BLOCK_DIM_X, BLOCK_DIM_Y, Dist, T><<<grid, threads>>>(
radiusMatch<BLOCK_DIM_X, BLOCK_DIM_Y, Dist, T><<<grid, threads, 0, stream>>>(
queryDescs, trainDescs, maxDistance, mask, trainIdx, nMatches, distance);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaThreadSynchronize() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
///////////////////////////////////////////////////////////////////////////////
@@ -1184,77 +1187,77 @@ namespace cv { namespace gpu { namespace bfmatcher
template <typename Dist, typename T, typename Mask>
void radiusMatchDispatcher(const DevMem2D_<T>& queryDescs, const DevMem2D_<T>& trainDescs,
float maxDistance, const Mask& mask, const DevMem2Di& trainIdx, unsigned int* nMatches,
const DevMem2Df& distance)
const DevMem2Df& distance, cudaStream_t stream)
{
radiusMatch_caller<16, 16, Dist>(queryDescs, trainDescs, maxDistance, mask,
trainIdx, nMatches, distance);
trainIdx, nMatches, distance, stream);
}
template <typename T>
void radiusMatchL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance,
const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance)
const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance, cudaStream_t stream)
{
if (mask.data)
{
radiusMatchDispatcher< L1Dist<T> >((DevMem2D_<T>)queryDescs, (DevMem2D_<T>)trainDescs,
maxDistance, SingleMask(mask), trainIdx, nMatches, distance);
maxDistance, SingleMask(mask), trainIdx, nMatches, distance, stream);
}
else
{
radiusMatchDispatcher< L1Dist<T> >((DevMem2D_<T>)queryDescs, (DevMem2D_<T>)trainDescs,
maxDistance, WithOutMask(), trainIdx, nMatches, distance);
maxDistance, WithOutMask(), trainIdx, nMatches, distance, stream);
}
}
template void radiusMatchL1_gpu<uchar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance);
template void radiusMatchL1_gpu<schar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance);
template void radiusMatchL1_gpu<ushort>(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance);
template void radiusMatchL1_gpu<short >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance);
template void radiusMatchL1_gpu<int >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance);
template void radiusMatchL1_gpu<float >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance);
template void radiusMatchL1_gpu<uchar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance, cudaStream_t stream);
template void radiusMatchL1_gpu<schar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance, cudaStream_t stream);
template void radiusMatchL1_gpu<ushort>(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance, cudaStream_t stream);
template void radiusMatchL1_gpu<short >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance, cudaStream_t stream);
template void radiusMatchL1_gpu<int >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance, cudaStream_t stream);
template void radiusMatchL1_gpu<float >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance, cudaStream_t stream);
template <typename T>
void radiusMatchL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance,
const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance)
const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance, cudaStream_t stream)
{
if (mask.data)
{
radiusMatchDispatcher<L2Dist>((DevMem2D_<T>)queryDescs, (DevMem2D_<T>)trainDescs,
maxDistance, SingleMask(mask), trainIdx, nMatches, distance);
maxDistance, SingleMask(mask), trainIdx, nMatches, distance, stream);
}
else
{
radiusMatchDispatcher<L2Dist>((DevMem2D_<T>)queryDescs, (DevMem2D_<T>)trainDescs,
maxDistance, WithOutMask(), trainIdx, nMatches, distance);
maxDistance, WithOutMask(), trainIdx, nMatches, distance, stream);
}
}
template void radiusMatchL2_gpu<uchar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance);
template void radiusMatchL2_gpu<schar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance);
template void radiusMatchL2_gpu<ushort>(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance);
template void radiusMatchL2_gpu<short >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance);
template void radiusMatchL2_gpu<int >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance);
template void radiusMatchL2_gpu<float >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance);
template void radiusMatchL2_gpu<uchar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance, cudaStream_t stream);
template void radiusMatchL2_gpu<schar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance, cudaStream_t stream);
template void radiusMatchL2_gpu<ushort>(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance, cudaStream_t stream);
template void radiusMatchL2_gpu<short >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance, cudaStream_t stream);
template void radiusMatchL2_gpu<int >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance, cudaStream_t stream);
template void radiusMatchL2_gpu<float >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance, cudaStream_t stream);
template <typename T>
void radiusMatchHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance,
const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance)
const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance, cudaStream_t stream)
{
if (mask.data)
{
radiusMatchDispatcher<HammingDist>((DevMem2D_<T>)queryDescs, (DevMem2D_<T>)trainDescs,
maxDistance, SingleMask(mask), trainIdx, nMatches, distance);
maxDistance, SingleMask(mask), trainIdx, nMatches, distance, stream);
}
else
{
radiusMatchDispatcher<HammingDist>((DevMem2D_<T>)queryDescs, (DevMem2D_<T>)trainDescs,
maxDistance, WithOutMask(), trainIdx, nMatches, distance);
maxDistance, WithOutMask(), trainIdx, nMatches, distance, stream);
}
}
template void radiusMatchHamming_gpu<uchar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance);
template void radiusMatchHamming_gpu<schar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance);
template void radiusMatchHamming_gpu<ushort>(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance);
template void radiusMatchHamming_gpu<short >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance);
template void radiusMatchHamming_gpu<int >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance);
template void radiusMatchHamming_gpu<uchar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance, cudaStream_t stream);
template void radiusMatchHamming_gpu<schar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance, cudaStream_t stream);
template void radiusMatchHamming_gpu<ushort>(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance, cudaStream_t stream);
template void radiusMatchHamming_gpu<short >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance, cudaStream_t stream);
template void radiusMatchHamming_gpu<int >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance, cudaStream_t stream);
}}}
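
With the stream parameter threaded through, a caller given a non-default stream returns without blocking, and synchronization becomes the host's job. A hedged usage sketch (hypothetical fillCaller, not part of this commit):

#include <cuda_runtime.h>

__global__ void fillKernel(float* data, int n, float v)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n)
        data[i] = v;
}

// Hypothetical stream-aware caller in this commit's style: no implicit
// synchronization when a non-default stream is supplied.
void fillCaller(float* d_data, int n, float v, cudaStream_t stream)
{
    fillKernel<<<(n + 255) / 256, 256, 0, stream>>>(d_data, n, v);
    cudaGetLastError();
    if (stream == 0)
        cudaDeviceSynchronize();
}

int main()
{
    const int n = 1 << 20;
    float* d_data = 0;
    cudaMalloc(&d_data, n * sizeof(float));

    cudaStream_t stream;
    cudaStreamCreate(&stream);

    fillCaller(d_data, n, 1.0f, stream);  // returns without blocking
    // ... independent host work can overlap with the kernel here ...

    cudaStreamSynchronize(stream);        // the host now owns the fence
    cudaStreamDestroy(stream);
    cudaFree(d_data);
    return 0;
}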


@@ -184,7 +184,9 @@ namespace cv { namespace gpu
computeHypothesisScoresKernel<<<grid, threads, smem_size>>>(
num_points, object, image, dist_threshold, hypothesis_scores);
cudaSafeCall(cudaThreadSynchronize());
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaDeviceSynchronize() );
}
} // namespace solvepnp_ransac
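
The corrected block above separates the two failure points of a launch: cudaGetLastError() reports launch-time errors (invalid configuration, excessive shared memory) immediately, while cudaDeviceSynchronize() blocks and surfaces errors raised while the kernel executes. A self-contained sketch of the idiom:

#include <cstdio>
#include <cuda_runtime.h>

__global__ void noopKernel() {}

int main()
{
    noopKernel<<<1, 1>>>();

    // 1) Launch errors (invalid grid, excessive shared memory, ...) are
    //    reported immediately by cudaGetLastError().
    cudaError_t launchErr = cudaGetLastError();
    if (launchErr != cudaSuccess)
        printf("launch failed: %s\n", cudaGetErrorString(launchErr));

    // 2) Errors raised while the kernel runs surface only at the next
    //    synchronization point; cudaDeviceSynchronize() blocks for them.
    cudaError_t execErr = cudaDeviceSynchronize();
    if (execErr != cudaSuccess)
        printf("execution failed: %s\n", cudaGetErrorString(execErr));
    return 0;
}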


@@ -64,19 +64,19 @@ namespace cv { namespace gpu { namespace mathfunc
};
template <typename T1, typename T2>
inline void compare_ne(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst)
inline void compare_ne(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, cudaStream_t stream)
{
NotEqual<T1, T2> op;
transform(static_cast< DevMem2D_<T1> >(src1), static_cast< DevMem2D_<T2> >(src2), dst, op, 0);
transform(static_cast< DevMem2D_<T1> >(src1), static_cast< DevMem2D_<T2> >(src2), dst, op, stream);
}
void compare_ne_8uc4(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst)
void compare_ne_8uc4(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, cudaStream_t stream)
{
compare_ne<uint, uint>(src1, src2, dst);
compare_ne<uint, uint>(src1, src2, dst, stream);
}
void compare_ne_32f(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst)
void compare_ne_32f(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, cudaStream_t stream)
{
compare_ne<float, float>(src1, src2, dst);
compare_ne<float, float>(src1, src2, dst, stream);
}
@@ -133,7 +133,7 @@ namespace cv { namespace gpu { namespace mathfunc
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall(cudaThreadSynchronize());
cudaSafeCall( cudaDeviceSynchronize() );
}
@@ -165,7 +165,7 @@ namespace cv { namespace gpu { namespace mathfunc
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall(cudaThreadSynchronize());
cudaSafeCall( cudaDeviceSynchronize() );
}
@@ -256,7 +256,7 @@ namespace cv { namespace gpu { namespace mathfunc
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall(cudaThreadSynchronize());
cudaSafeCall( cudaDeviceSynchronize() );
}
@@ -290,7 +290,7 @@ namespace cv { namespace gpu { namespace mathfunc
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall(cudaThreadSynchronize());
cudaSafeCall( cudaDeviceSynchronize() );
}
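
compare_ne above routes a comparison functor through the module's transform utility together with the stream. A self-contained sketch of that functor-plus-transform idiom; transformKernel here is a hypothetical stand-in for OpenCV's internal transform():

#include <cuda_runtime.h>

// Per-element "not equal" functor, as used by compare_ne above.
template <typename T1, typename T2>
struct NotEqual
{
    __device__ unsigned char operator()(T1 a, T2 b) const
    {
        return a != b ? 255 : 0;
    }
};

// Hypothetical stand-in for OpenCV's internal transform() utility.
template <typename T1, typename T2, typename Op>
__global__ void transformKernel(const T1* src1, const T2* src2,
                                unsigned char* dst, int n, Op op)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n)
        dst[i] = op(src1[i], src2[i]);
}

template <typename T1, typename T2>
void compareNe(const T1* src1, const T2* src2, unsigned char* dst,
               int n, cudaStream_t stream)
{
    NotEqual<T1, T2> op;
    transformKernel<<<(n + 255) / 256, 256, 0, stream>>>(src1, src2, dst, n, op);
    cudaGetLastError();
    if (stream == 0)
        cudaDeviceSynchronize();
}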


@@ -93,9 +93,9 @@ namespace filter_krnls
typedef typename SmemType<T>::smem_t smem_t;
__shared__ smem_t smem[BLOCK_DIM_Y * BLOCK_DIM_X * 3];
const int x = BLOCK_DIM_X * blockIdx.x + threadIdx.x;
const int y = BLOCK_DIM_Y * blockIdx.y + threadIdx.y;
smem_t* sDataRow = smem + threadIdx.y * BLOCK_DIM_X * 3;
@@ -129,7 +129,7 @@ namespace filter_krnls
namespace cv { namespace gpu { namespace filters
{
template <int ksize, typename T, typename D, template<typename> class B>
void linearRowFilter_caller(const DevMem2D_<T>& src, const DevMem2D_<D>& dst, int anchor)
void linearRowFilter_caller(const DevMem2D_<T>& src, const DevMem2D_<D>& dst, int anchor, cudaStream_t stream)
{
dim3 threads(BLOCK_DIM_X, BLOCK_DIM_Y);
dim3 grid(divUp(src.cols, BLOCK_DIM_X), divUp(src.rows, BLOCK_DIM_Y));
@@ -143,16 +143,17 @@ namespace cv { namespace gpu { namespace filters
"try bigger image or another border extrapolation mode", __FILE__, __LINE__);
}
filter_krnls::linearRowFilter<ksize, T, D><<<grid, threads>>>(src, dst, anchor, b);
filter_krnls::linearRowFilter<ksize, T, D><<<grid, threads, 0, stream>>>(src, dst, anchor, b);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaThreadSynchronize() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
template <typename T, typename D>
void linearRowFilter_gpu(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type)
void linearRowFilter_gpu(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream)
{
typedef void (*caller_t)(const DevMem2D_<T>& src, const DevMem2D_<D>& dst, int anchor);
typedef void (*caller_t)(const DevMem2D_<T>& src, const DevMem2D_<D>& dst, int anchor, cudaStream_t stream);
static const caller_t callers[3][17] =
{
{
@@ -173,7 +174,7 @@ namespace cv { namespace gpu { namespace filters
linearRowFilter_caller<14, T, D, BrdRowReflect101>,
linearRowFilter_caller<15, T, D, BrdRowReflect101>,
linearRowFilter_caller<16, T, D, BrdRowReflect101>,
},
{
0,
linearRowFilter_caller<1 , T, D, BrdRowReplicate>,
@@ -192,7 +193,7 @@ namespace cv { namespace gpu { namespace filters
linearRowFilter_caller<14, T, D, BrdRowReplicate>,
linearRowFilter_caller<15, T, D, BrdRowReplicate>,
linearRowFilter_caller<16, T, D, BrdRowReplicate>,
},
},
{
0,
linearRowFilter_caller<1 , T, D, BrdRowConstant>,
@@ -216,15 +217,15 @@ namespace cv { namespace gpu { namespace filters
loadLinearKernel(kernel, ksize);
callers[brd_type][ksize]((DevMem2D_<T>)src, (DevMem2D_<D>)dst, anchor);
callers[brd_type][ksize]((DevMem2D_<T>)src, (DevMem2D_<D>)dst, anchor, stream);
}
template void linearRowFilter_gpu<uchar , float >(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type);
template void linearRowFilter_gpu<uchar4, float4>(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type);
template void linearRowFilter_gpu<short , float >(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type);;
template void linearRowFilter_gpu<short2, float2>(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type);
template void linearRowFilter_gpu<int , float >(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type);
template void linearRowFilter_gpu<float , float >(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type);
template void linearRowFilter_gpu<uchar , float >(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
template void linearRowFilter_gpu<uchar4, float4>(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
template void linearRowFilter_gpu<short , float >(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
template void linearRowFilter_gpu<short2, float2>(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
template void linearRowFilter_gpu<int , float >(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
template void linearRowFilter_gpu<float , float >(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
}}}
namespace filter_krnls
@@ -233,9 +234,9 @@ namespace filter_krnls
__global__ void linearColumnFilter(const DevMem2D_<T> src, PtrStep_<D> dst, int anchor, const B b)
{
__shared__ T smem[BLOCK_DIM_Y * BLOCK_DIM_X * 3];
const int x = BLOCK_DIM_X * blockIdx.x + threadIdx.x;
const int y = BLOCK_DIM_Y * blockIdx.y + threadIdx.y;
const int x = BLOCK_DIM_X * blockIdx.x + threadIdx.x;
const int y = BLOCK_DIM_Y * blockIdx.y + threadIdx.y;
T* sDataColumn = smem + threadIdx.x;
@@ -269,7 +270,7 @@ namespace filter_krnls
namespace cv { namespace gpu { namespace filters
{
template <int ksize, typename T, typename D, template<typename> class B>
void linearColumnFilter_caller(const DevMem2D_<T>& src, const DevMem2D_<D>& dst, int anchor)
void linearColumnFilter_caller(const DevMem2D_<T>& src, const DevMem2D_<D>& dst, int anchor, cudaStream_t stream)
{
dim3 threads(BLOCK_DIM_X, BLOCK_DIM_Y);
dim3 grid(divUp(src.cols, BLOCK_DIM_X), divUp(src.rows, BLOCK_DIM_Y));
@@ -282,16 +283,17 @@ namespace cv { namespace gpu { namespace filters
"try bigger image or another border extrapolation mode", __FILE__, __LINE__);
}
filter_krnls::linearColumnFilter<ksize, T, D><<<grid, threads>>>(src, dst, anchor, b);
filter_krnls::linearColumnFilter<ksize, T, D><<<grid, threads, 0, stream>>>(src, dst, anchor, b);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaThreadSynchronize() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
template <typename T, typename D>
void linearColumnFilter_gpu(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type)
void linearColumnFilter_gpu(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream)
{
typedef void (*caller_t)(const DevMem2D_<T>& src, const DevMem2D_<D>& dst, int anchor);
typedef void (*caller_t)(const DevMem2D_<T>& src, const DevMem2D_<D>& dst, int anchor, cudaStream_t stream);
static const caller_t callers[3][17] =
{
{
@@ -312,7 +314,7 @@ namespace cv { namespace gpu { namespace filters
linearColumnFilter_caller<14, T, D, BrdColReflect101>,
linearColumnFilter_caller<15, T, D, BrdColReflect101>,
linearColumnFilter_caller<16, T, D, BrdColReflect101>,
},
},
{
0,
linearColumnFilter_caller<1 , T, D, BrdColReplicate>,
@@ -331,7 +333,7 @@ namespace cv { namespace gpu { namespace filters
linearColumnFilter_caller<14, T, D, BrdColReplicate>,
linearColumnFilter_caller<15, T, D, BrdColReplicate>,
linearColumnFilter_caller<16, T, D, BrdColReplicate>,
},
},
{
0,
linearColumnFilter_caller<1 , T, D, BrdColConstant>,
@@ -355,15 +357,15 @@ namespace cv { namespace gpu { namespace filters
loadLinearKernel(kernel, ksize);
callers[brd_type][ksize]((DevMem2D_<T>)src, (DevMem2D_<D>)dst, anchor);
callers[brd_type][ksize]((DevMem2D_<T>)src, (DevMem2D_<D>)dst, anchor, stream);
}
template void linearColumnFilter_gpu<float , uchar >(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type);
template void linearColumnFilter_gpu<float4, uchar4>(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type);
template void linearColumnFilter_gpu<float , short >(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type);
template void linearColumnFilter_gpu<float2, short2>(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type);
template void linearColumnFilter_gpu<float , int >(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type);
template void linearColumnFilter_gpu<float , float >(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type);
template void linearColumnFilter_gpu<float , uchar >(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
template void linearColumnFilter_gpu<float4, uchar4>(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
template void linearColumnFilter_gpu<float , short >(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
template void linearColumnFilter_gpu<float2, short2>(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
template void linearColumnFilter_gpu<float , int >(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
template void linearColumnFilter_gpu<float , float >(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
}}}
/////////////////////////////////////////////////////////////////////////////////////////////////
@@ -390,10 +392,10 @@ namespace cv { namespace gpu { namespace bf
cudaSafeCall( cudaMemcpyToSymbol(bf_krnls::ctable_space, &table_space.data, sizeof(table_space.data)) );
size_t table_space_step = table_space.step / sizeof(float);
cudaSafeCall( cudaMemcpyToSymbol(bf_krnls::ctable_space_step, &table_space_step, sizeof(size_t)) );
cudaSafeCall( cudaMemcpyToSymbol(bf_krnls::cndisp, &ndisp, sizeof(int)) );
cudaSafeCall( cudaMemcpyToSymbol(bf_krnls::cradius, &radius, sizeof(int)) );
cudaSafeCall( cudaMemcpyToSymbol(bf_krnls::cedge_disc, &edge_disc, sizeof(short)) );
cudaSafeCall( cudaMemcpyToSymbol(bf_krnls::cmax_disc, &max_disc, sizeof(short)) );
}
@@ -538,10 +540,10 @@ namespace cv { namespace gpu { namespace bf
break;
default:
cv::gpu::error("Unsupported channels count", __FILE__, __LINE__);
}
}
if (stream != 0)
cudaSafeCall( cudaThreadSynchronize() );
cudaSafeCall( cudaDeviceSynchronize() );
}
void bilateral_filter_gpu(const DevMem2D& disp, const DevMem2D& img, int channels, int iters, cudaStream_t stream)

View File

@@ -220,7 +220,7 @@ void compute_hists(int nbins, int block_stride_x, int block_stride_y,
img_block_width, grad, qangle, scale, block_hists);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall(cudaThreadSynchronize());
cudaSafeCall( cudaDeviceSynchronize() );
}
@@ -324,7 +324,7 @@ void normalize_hists(int nbins, int block_stride_x, int block_stride_y,
cudaSafeCall( cudaGetLastError() );
cudaSafeCall(cudaThreadSynchronize());
cudaSafeCall( cudaDeviceSynchronize() );
}
@@ -418,7 +418,7 @@ void classify_hists(int win_height, int win_width, int block_stride_y, int block
block_hists, coefs, free_coef, threshold, labels);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall(cudaThreadSynchronize());
cudaSafeCall( cudaDeviceSynchronize() );
}
//----------------------------------------------------------------------------
@@ -463,7 +463,7 @@ void extract_descrs_by_rows(int win_height, int win_width, int block_stride_y, i
img_block_width, win_block_stride_x, win_block_stride_y, block_hists, descriptors);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall(cudaThreadSynchronize());
cudaSafeCall( cudaDeviceSynchronize() );
}
@@ -512,7 +512,7 @@ void extract_descrs_by_cols(int win_height, int win_width, int block_stride_y, i
img_block_width, win_block_stride_x, win_block_stride_y, block_hists, descriptors);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall(cudaThreadSynchronize());
cudaSafeCall( cudaDeviceSynchronize() );
}
//----------------------------------------------------------------------------
@@ -636,7 +636,8 @@ void compute_gradients_8UC4(int nbins, int height, int width, const DevMem2D& im
compute_gradients_8UC4_kernel<nthreads, 0><<<gdim, bdim>>>(height, width, img, angle_scale, grad, qangle);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall(cudaThreadSynchronize());
cudaSafeCall( cudaDeviceSynchronize() );
}
template <int nthreads, int correct_gamma>
@@ -707,7 +708,8 @@ void compute_gradients_8UC1(int nbins, int height, int width, const DevMem2D& im
compute_gradients_8UC1_kernel<nthreads, 0><<<gdim, bdim>>>(height, width, img, angle_scale, grad, qangle);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall(cudaThreadSynchronize());
cudaSafeCall( cudaDeviceSynchronize() );
}
@@ -765,7 +767,9 @@ static void resize_for_hog(const DevMem2D& src, DevMem2D dst, TEX& tex)
resize_for_hog_kernel<<<grid, threads>>>(sx, sy, (DevMem2D_<T>)dst, colOfs);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaThreadSynchronize() );
cudaSafeCall( cudaDeviceSynchronize() );
cudaSafeCall( cudaUnbindTexture(tex) );
}

View File

@@ -139,7 +139,7 @@ namespace cv { namespace gpu { namespace imgproc
remap_1c<<<grid, threads>>>(xmap.data, ymap.data, xmap.step, dst.data, dst.step, dst.cols, dst.rows);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaThreadSynchronize() );
cudaSafeCall( cudaDeviceSynchronize() );
cudaSafeCall( cudaUnbindTexture(tex_remap) );
}
@@ -153,7 +153,7 @@ namespace cv { namespace gpu { namespace imgproc
remap_3c<<<grid, threads>>>(src.data, src.step, xmap.data, ymap.data, xmap.step, dst.data, dst.step, dst.cols, dst.rows);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaThreadSynchronize() );
cudaSafeCall( cudaDeviceSynchronize() );
}
/////////////////////////////////// MeanShiftfiltering ///////////////////////////////////////////////
@@ -263,7 +263,7 @@ namespace cv { namespace gpu { namespace imgproc
meanshift_kernel<<< grid, threads >>>( dst.data, dst.step, dst.cols, dst.rows, sp, sr, maxIter, eps );
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaThreadSynchronize() );
cudaSafeCall( cudaDeviceSynchronize() );
cudaSafeCall( cudaUnbindTexture( tex_meanshift ) );
}
extern "C" void meanShiftProc_gpu(const DevMem2D& src, DevMem2D dstr, DevMem2D dstsp, int sp, int sr, int maxIter, float eps)
@@ -279,7 +279,7 @@ namespace cv { namespace gpu { namespace imgproc
meanshiftproc_kernel<<< grid, threads >>>( dstr.data, dstr.step, dstsp.data, dstsp.step, dstr.cols, dstr.rows, sp, sr, maxIter, eps );
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaThreadSynchronize() );
cudaSafeCall( cudaDeviceSynchronize() );
cudaSafeCall( cudaUnbindTexture( tex_meanshift ) );
}
@@ -397,7 +397,7 @@ namespace cv { namespace gpu { namespace imgproc
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaThreadSynchronize() );
cudaSafeCall( cudaDeviceSynchronize() );
}
void drawColorDisp_gpu(const DevMem2D_<short>& src, const DevMem2D& dst, int ndisp, const cudaStream_t& stream)
@@ -411,7 +411,7 @@ namespace cv { namespace gpu { namespace imgproc
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaThreadSynchronize() );
cudaSafeCall( cudaDeviceSynchronize() );
}
/////////////////////////////////// reprojectImageTo3D ///////////////////////////////////////////////
@@ -462,7 +462,7 @@ namespace cv { namespace gpu { namespace imgproc
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaThreadSynchronize() );
cudaSafeCall( cudaDeviceSynchronize() );
}
void reprojectImageTo3D_gpu(const DevMem2D& disp, const DevMem2Df& xyzw, const float* q, const cudaStream_t& stream)
@@ -502,7 +502,7 @@ namespace cv { namespace gpu { namespace imgproc
extractCovData_kernel<<<grid, threads>>>(Dx.cols, Dx.rows, Dx, Dy, dst);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall(cudaThreadSynchronize());
cudaSafeCall( cudaDeviceSynchronize() );
}
/////////////////////////////////////////// Corner Harris /////////////////////////////////////////////////
@@ -611,7 +611,8 @@ namespace cv { namespace gpu { namespace imgproc
cudaSafeCall( cudaGetLastError() );
cudaSafeCall(cudaThreadSynchronize());
cudaSafeCall( cudaDeviceSynchronize() );
cudaSafeCall(cudaUnbindTexture(harrisDxTex));
cudaSafeCall(cudaUnbindTexture(harrisDyTex));
}
@@ -727,7 +728,8 @@ namespace cv { namespace gpu { namespace imgproc
cudaSafeCall( cudaGetLastError() );
cudaSafeCall(cudaThreadSynchronize());
cudaSafeCall(cudaDeviceSynchronize());
cudaSafeCall(cudaUnbindTexture(minEigenValDxTex));
cudaSafeCall(cudaUnbindTexture(minEigenValDyTex));
}
@@ -763,7 +765,7 @@ namespace cv { namespace gpu { namespace imgproc
column_sumKernel_32F<<<grid, threads>>>(src.cols, src.rows, src, dst);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall(cudaThreadSynchronize());
cudaSafeCall( cudaDeviceSynchronize() );
}
//////////////////////////////////////////////////////////////////////////
@@ -791,7 +793,7 @@ namespace cv { namespace gpu { namespace imgproc
mulSpectrumsKernel<<<grid, threads>>>(a, b, c);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall(cudaThreadSynchronize());
cudaSafeCall( cudaDeviceSynchronize() );
}
//////////////////////////////////////////////////////////////////////////
@@ -820,7 +822,7 @@ namespace cv { namespace gpu { namespace imgproc
mulSpectrumsKernel_CONJ<<<grid, threads>>>(a, b, c);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall(cudaThreadSynchronize());
cudaSafeCall( cudaDeviceSynchronize() );
}
//////////////////////////////////////////////////////////////////////////
@@ -850,7 +852,7 @@ namespace cv { namespace gpu { namespace imgproc
mulAndScaleSpectrumsKernel<<<grid, threads>>>(a, b, scale, c);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall(cudaThreadSynchronize());
cudaSafeCall( cudaDeviceSynchronize() );
}
//////////////////////////////////////////////////////////////////////////
@@ -880,7 +882,7 @@ namespace cv { namespace gpu { namespace imgproc
mulAndScaleSpectrumsKernel_CONJ<<<grid, threads>>>(a, b, scale, c);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall(cudaThreadSynchronize());
cudaSafeCall( cudaDeviceSynchronize() );
}
/////////////////////////////////////////////////////////////////////////
@@ -904,7 +906,9 @@ namespace cv { namespace gpu { namespace imgproc
dim3 grid(divUp(cols, threads.x), divUp(rows, threads.y));
downsampleKernel<<<grid, threads>>>(src, rows, cols, k, dst);
cudaSafeCall(cudaThreadSynchronize());
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaDeviceSynchronize() );
}
template void downsampleCaller(const PtrStep src, int rows, int cols, int k, PtrStep dst);

View File

@@ -46,6 +46,8 @@
#include "opencv2/gpu/devmem2d.hpp"
#include "safe_call.hpp"
#include "cuda_runtime.h"
#include "npp.h"
#include "NPP_staging.hpp"
namespace cv
{
@@ -106,6 +108,41 @@ namespace cv
cudaSafeCall( cudaGetTextureReference(&tex, name) );
cudaSafeCall( cudaUnbindTexture(tex) );
}
class NppStreamHandler
{
public:
inline explicit NppStreamHandler(cudaStream_t newStream = 0)
{
oldStream = nppGetStream();
nppSetStream(newStream);
}
inline ~NppStreamHandler()
{
nppSetStream(oldStream);
}
private:
cudaStream_t oldStream;
};
class NppStStreamHandler
{
public:
inline explicit NppStStreamHandler(cudaStream_t newStream = 0)
{
oldStream = nppStSetActiveCUDAstream(newStream);
}
inline ~NppStStreamHandler()
{
nppStSetActiveCUDAstream(oldStream);
}
private:
cudaStream_t oldStream;
};
}
}

View File

@@ -134,7 +134,7 @@ void matchTemplateNaive_CCORR_32F(const DevMem2D image, const DevMem2D templ,
}
cudaSafeCall( cudaGetLastError() );
cudaSafeCall(cudaThreadSynchronize());
cudaSafeCall( cudaDeviceSynchronize() );
}
@@ -165,7 +165,7 @@ void matchTemplateNaive_CCORR_8U(const DevMem2D image, const DevMem2D templ,
}
cudaSafeCall( cudaGetLastError() );
cudaSafeCall(cudaThreadSynchronize());
cudaSafeCall( cudaDeviceSynchronize() );
}
@@ -228,7 +228,7 @@ void matchTemplateNaive_SQDIFF_32F(const DevMem2D image, const DevMem2D templ,
}
cudaSafeCall( cudaGetLastError() );
cudaSafeCall(cudaThreadSynchronize());
cudaSafeCall( cudaDeviceSynchronize() );
}
@@ -259,7 +259,7 @@ void matchTemplateNaive_SQDIFF_8U(const DevMem2D image, const DevMem2D templ,
}
cudaSafeCall( cudaGetLastError() );
cudaSafeCall(cudaThreadSynchronize());
cudaSafeCall( cudaDeviceSynchronize() );
}
@@ -309,7 +309,7 @@ void matchTemplatePrepared_SQDIFF_8U(
}
cudaSafeCall( cudaGetLastError() );
cudaSafeCall(cudaThreadSynchronize());
cudaSafeCall( cudaDeviceSynchronize() );
}
@@ -360,7 +360,7 @@ void matchTemplatePrepared_SQDIFF_NORMED_8U(
}
cudaSafeCall( cudaGetLastError() );
cudaSafeCall(cudaThreadSynchronize());
cudaSafeCall( cudaDeviceSynchronize() );
}
@@ -392,7 +392,7 @@ void matchTemplatePrepared_CCOFF_8U(
w, h, (float)templ_sum / (w * h), image_sum, result);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall(cudaThreadSynchronize());
cudaSafeCall( cudaDeviceSynchronize() );
}
@@ -434,7 +434,7 @@ void matchTemplatePrepared_CCOFF_8UC2(
image_sum_r, image_sum_g, result);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall(cudaThreadSynchronize());
cudaSafeCall( cudaDeviceSynchronize() );
}
@@ -490,7 +490,7 @@ void matchTemplatePrepared_CCOFF_8UC3(
image_sum_r, image_sum_g, image_sum_b, result);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall(cudaThreadSynchronize());
cudaSafeCall( cudaDeviceSynchronize() );
}
@@ -556,7 +556,7 @@ void matchTemplatePrepared_CCOFF_8UC4(
result);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall(cudaThreadSynchronize());
cudaSafeCall( cudaDeviceSynchronize() );
}
@@ -602,7 +602,7 @@ void matchTemplatePrepared_CCOFF_NORMED_8U(
image_sum, image_sqsum, result);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall(cudaThreadSynchronize());
cudaSafeCall( cudaDeviceSynchronize() );
}
@@ -665,7 +665,7 @@ void matchTemplatePrepared_CCOFF_NORMED_8UC2(
result);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall(cudaThreadSynchronize());
cudaSafeCall( cudaDeviceSynchronize() );
}
@@ -742,7 +742,7 @@ void matchTemplatePrepared_CCOFF_NORMED_8UC3(
result);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall(cudaThreadSynchronize());
cudaSafeCall( cudaDeviceSynchronize() );
}
@@ -833,7 +833,7 @@ void matchTemplatePrepared_CCOFF_NORMED_8UC4(
result);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall(cudaThreadSynchronize());
cudaSafeCall( cudaDeviceSynchronize() );
}
@@ -877,7 +877,7 @@ void normalize_8U(int w, int h, const DevMem2D_<unsigned long long> image_sqsum,
}
cudaSafeCall( cudaGetLastError() );
cudaSafeCall(cudaThreadSynchronize());
cudaSafeCall( cudaDeviceSynchronize() );
}
@@ -919,7 +919,7 @@ void extractFirstChannel_32F(const DevMem2D image, DevMem2Df result, int cn)
}
cudaSafeCall( cudaGetLastError() );
cudaSafeCall(cudaThreadSynchronize());
cudaSafeCall( cudaDeviceSynchronize() );
}

View File

@@ -153,7 +153,7 @@ namespace cv { namespace gpu { namespace mathfunc
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaThreadSynchronize() );
cudaSafeCall( cudaDeviceSynchronize() );
}
void cartToPolar_gpu(const DevMem2Df& x, const DevMem2Df& y, const DevMem2Df& mag, bool magSqr, const DevMem2Df& angle, bool angleInDegrees, cudaStream_t stream)
@@ -202,7 +202,7 @@ namespace cv { namespace gpu { namespace mathfunc
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaThreadSynchronize() );
cudaSafeCall( cudaDeviceSynchronize() );
}
void polarToCart_gpu(const DevMem2Df& mag, const DevMem2Df& angle, const DevMem2Df& x, const DevMem2Df& y, bool angleInDegrees, cudaStream_t stream)

View File

@@ -87,7 +87,7 @@ namespace cv { namespace gpu { namespace matrix_operations {
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall ( cudaThreadSynchronize() );
cudaSafeCall ( cudaDeviceSynchronize() );
}
void copy_to_with_mask(const DevMem2D& mat_src, DevMem2D mat_dst, int depth, const DevMem2D& mask, int channels, const cudaStream_t & stream)
@@ -199,7 +199,7 @@ namespace cv { namespace gpu { namespace matrix_operations {
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall ( cudaThreadSynchronize() );
cudaSafeCall ( cudaDeviceSynchronize() );
}
template void set_to_gpu<uchar >(const DevMem2D& mat, const uchar* scalar, const DevMem2D& mask, int channels, cudaStream_t stream);
@@ -222,7 +222,7 @@ namespace cv { namespace gpu { namespace matrix_operations {
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall ( cudaThreadSynchronize() );
cudaSafeCall ( cudaDeviceSynchronize() );
}
template void set_to_gpu<uchar >(const DevMem2D& mat, const uchar* scalar, int channels, cudaStream_t stream);

View File

@@ -275,11 +275,11 @@ namespace cv { namespace gpu { namespace mathfunc
minMaxKernel<256, T, Mask8U><<<grid, threads>>>(src, Mask8U(mask), minval_buf, maxval_buf);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall(cudaThreadSynchronize());
cudaSafeCall( cudaDeviceSynchronize() );
T minval_, maxval_;
cudaSafeCall(cudaMemcpy(&minval_, minval_buf, sizeof(T), cudaMemcpyDeviceToHost));
cudaSafeCall(cudaMemcpy(&maxval_, maxval_buf, sizeof(T), cudaMemcpyDeviceToHost));
cudaSafeCall( cudaMemcpy(&minval_, minval_buf, sizeof(T), cudaMemcpyDeviceToHost) );
cudaSafeCall( cudaMemcpy(&maxval_, maxval_buf, sizeof(T), cudaMemcpyDeviceToHost) );
*minval = minval_;
*maxval = maxval_;
}
@@ -306,11 +306,11 @@ namespace cv { namespace gpu { namespace mathfunc
minMaxKernel<256, T, MaskTrue><<<grid, threads>>>(src, MaskTrue(), minval_buf, maxval_buf);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall(cudaThreadSynchronize());
cudaSafeCall( cudaDeviceSynchronize() );
T minval_, maxval_;
cudaSafeCall(cudaMemcpy(&minval_, minval_buf, sizeof(T), cudaMemcpyDeviceToHost));
cudaSafeCall(cudaMemcpy(&maxval_, maxval_buf, sizeof(T), cudaMemcpyDeviceToHost));
cudaSafeCall( cudaMemcpy(&minval_, minval_buf, sizeof(T), cudaMemcpyDeviceToHost) );
cudaSafeCall( cudaMemcpy(&maxval_, maxval_buf, sizeof(T), cudaMemcpyDeviceToHost) );
*minval = minval_;
*maxval = maxval_;
}
@@ -363,11 +363,11 @@ namespace cv { namespace gpu { namespace mathfunc
minMaxPass2Kernel<256, T><<<1, 256>>>(minval_buf, maxval_buf, grid.x * grid.y);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall(cudaThreadSynchronize());
cudaSafeCall(cudaDeviceSynchronize());
T minval_, maxval_;
cudaSafeCall(cudaMemcpy(&minval_, minval_buf, sizeof(T), cudaMemcpyDeviceToHost));
cudaSafeCall(cudaMemcpy(&maxval_, maxval_buf, sizeof(T), cudaMemcpyDeviceToHost));
cudaSafeCall( cudaMemcpy(&minval_, minval_buf, sizeof(T), cudaMemcpyDeviceToHost) );
cudaSafeCall( cudaMemcpy(&maxval_, maxval_buf, sizeof(T), cudaMemcpyDeviceToHost) );
*minval = minval_;
*maxval = maxval_;
}
@@ -395,11 +395,11 @@ namespace cv { namespace gpu { namespace mathfunc
minMaxPass2Kernel<256, T><<<1, 256>>>(minval_buf, maxval_buf, grid.x * grid.y);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall(cudaThreadSynchronize());
cudaSafeCall( cudaDeviceSynchronize() );
T minval_, maxval_;
cudaSafeCall(cudaMemcpy(&minval_, minval_buf, sizeof(T), cudaMemcpyDeviceToHost));
cudaSafeCall(cudaMemcpy(&maxval_, maxval_buf, sizeof(T), cudaMemcpyDeviceToHost));
cudaSafeCall( cudaMemcpy(&minval_, minval_buf, sizeof(T), cudaMemcpyDeviceToHost) );
cudaSafeCall( cudaMemcpy(&maxval_, maxval_buf, sizeof(T), cudaMemcpyDeviceToHost) );
*minval = minval_;
*maxval = maxval_;
}
@@ -609,17 +609,17 @@ namespace cv { namespace gpu { namespace mathfunc
minloc_buf, maxloc_buf);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall(cudaThreadSynchronize());
cudaSafeCall( cudaDeviceSynchronize() );
T minval_, maxval_;
cudaSafeCall(cudaMemcpy(&minval_, minval_buf, sizeof(T), cudaMemcpyDeviceToHost));
cudaSafeCall(cudaMemcpy(&maxval_, maxval_buf, sizeof(T), cudaMemcpyDeviceToHost));
cudaSafeCall( cudaMemcpy(&minval_, minval_buf, sizeof(T), cudaMemcpyDeviceToHost) );
cudaSafeCall( cudaMemcpy(&maxval_, maxval_buf, sizeof(T), cudaMemcpyDeviceToHost) );
*minval = minval_;
*maxval = maxval_;
uint minloc_, maxloc_;
cudaSafeCall(cudaMemcpy(&minloc_, minloc_buf, sizeof(int), cudaMemcpyDeviceToHost));
cudaSafeCall(cudaMemcpy(&maxloc_, maxloc_buf, sizeof(int), cudaMemcpyDeviceToHost));
cudaSafeCall( cudaMemcpy(&minloc_, minloc_buf, sizeof(int), cudaMemcpyDeviceToHost) );
cudaSafeCall( cudaMemcpy(&maxloc_, maxloc_buf, sizeof(int), cudaMemcpyDeviceToHost) );
minloc[1] = minloc_ / src.cols; minloc[0] = minloc_ - minloc[1] * src.cols;
maxloc[1] = maxloc_ / src.cols; maxloc[0] = maxloc_ - maxloc[1] * src.cols;
}
@@ -650,7 +650,7 @@ namespace cv { namespace gpu { namespace mathfunc
minloc_buf, maxloc_buf);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall(cudaThreadSynchronize());
cudaSafeCall( cudaDeviceSynchronize() );
T minval_, maxval_;
cudaSafeCall(cudaMemcpy(&minval_, minval_buf, sizeof(T), cudaMemcpyDeviceToHost));
@@ -724,7 +724,7 @@ namespace cv { namespace gpu { namespace mathfunc
minMaxLocPass2Kernel<256, T><<<1, 256>>>(minval_buf, maxval_buf, minloc_buf, maxloc_buf, grid.x * grid.y);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall(cudaThreadSynchronize());
cudaSafeCall( cudaDeviceSynchronize() );
T minval_, maxval_;
cudaSafeCall(cudaMemcpy(&minval_, minval_buf, sizeof(T), cudaMemcpyDeviceToHost));
@@ -766,7 +766,7 @@ namespace cv { namespace gpu { namespace mathfunc
minMaxLocPass2Kernel<256, T><<<1, 256>>>(minval_buf, maxval_buf, minloc_buf, maxloc_buf, grid.x * grid.y);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall(cudaThreadSynchronize());
cudaSafeCall( cudaDeviceSynchronize() );
T minval_, maxval_;
cudaSafeCall(cudaMemcpy(&minval_, minval_buf, sizeof(T), cudaMemcpyDeviceToHost));
@@ -895,7 +895,7 @@ namespace cv { namespace gpu { namespace mathfunc
countNonZeroKernel<256, T><<<grid, threads>>>(src, count_buf);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall(cudaThreadSynchronize());
cudaSafeCall( cudaDeviceSynchronize() );
uint count;
cudaSafeCall(cudaMemcpy(&count, count_buf, sizeof(int), cudaMemcpyDeviceToHost));
@@ -942,7 +942,7 @@ namespace cv { namespace gpu { namespace mathfunc
countNonZeroPass2Kernel<256, T><<<1, 256>>>(count_buf, grid.x * grid.y);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall(cudaThreadSynchronize());
cudaSafeCall( cudaDeviceSynchronize() );
uint count;
cudaSafeCall(cudaMemcpy(&count, count_buf, sizeof(int), cudaMemcpyDeviceToHost));
@@ -1493,7 +1493,7 @@ namespace cv { namespace gpu { namespace mathfunc
break;
}
cudaSafeCall(cudaThreadSynchronize());
cudaSafeCall( cudaDeviceSynchronize() );
R result[4] = {0, 0, 0, 0};
cudaSafeCall(cudaMemcpy(&result, buf.ptr(0), sizeof(R) * cn, cudaMemcpyDeviceToHost));
@@ -1543,7 +1543,7 @@ namespace cv { namespace gpu { namespace mathfunc
}
cudaSafeCall( cudaGetLastError() );
cudaSafeCall(cudaThreadSynchronize());
cudaSafeCall( cudaDeviceSynchronize() );
R result[4] = {0, 0, 0, 0};
cudaSafeCall(cudaMemcpy(&result, buf.ptr(0), sizeof(R) * cn, cudaMemcpyDeviceToHost));
@@ -1615,7 +1615,7 @@ namespace cv { namespace gpu { namespace mathfunc
break;
}
cudaSafeCall(cudaThreadSynchronize());
cudaSafeCall( cudaDeviceSynchronize() );
R result[4] = {0, 0, 0, 0};
cudaSafeCall(cudaMemcpy(result, buf.ptr(0), sizeof(R) * cn, cudaMemcpyDeviceToHost));
@@ -1665,7 +1665,7 @@ namespace cv { namespace gpu { namespace mathfunc
}
cudaSafeCall( cudaGetLastError() );
cudaSafeCall(cudaThreadSynchronize());
cudaSafeCall( cudaDeviceSynchronize() );
R result[4] = {0, 0, 0, 0};
cudaSafeCall(cudaMemcpy(result, buf.ptr(0), sizeof(R) * cn, cudaMemcpyDeviceToHost));
@@ -1737,7 +1737,7 @@ namespace cv { namespace gpu { namespace mathfunc
break;
}
cudaSafeCall(cudaThreadSynchronize());
cudaSafeCall( cudaDeviceSynchronize() );
R result[4] = {0, 0, 0, 0};
cudaSafeCall(cudaMemcpy(result, buf.ptr(0), sizeof(R) * cn, cudaMemcpyDeviceToHost));
@@ -1787,7 +1787,7 @@ namespace cv { namespace gpu { namespace mathfunc
}
cudaSafeCall( cudaGetLastError() );
cudaSafeCall(cudaThreadSynchronize());
cudaSafeCall( cudaDeviceSynchronize() );
R result[4] = {0, 0, 0, 0};
cudaSafeCall(cudaMemcpy(result, buf.ptr(0), sizeof(R) * cn, cudaMemcpyDeviceToHost));

View File

@@ -236,7 +236,7 @@ namespace cv { namespace gpu { namespace split_merge {
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall(cudaThreadSynchronize());
cudaSafeCall(cudaDeviceSynchronize());
}
@@ -253,7 +253,7 @@ namespace cv { namespace gpu { namespace split_merge {
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall(cudaThreadSynchronize());
cudaSafeCall(cudaDeviceSynchronize());
}
@@ -271,7 +271,7 @@ namespace cv { namespace gpu { namespace split_merge {
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall(cudaThreadSynchronize());
cudaSafeCall(cudaDeviceSynchronize());
}
@@ -445,7 +445,7 @@ namespace cv { namespace gpu { namespace split_merge {
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall(cudaThreadSynchronize());
cudaSafeCall(cudaDeviceSynchronize());
}
@@ -462,7 +462,7 @@ namespace cv { namespace gpu { namespace split_merge {
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall(cudaThreadSynchronize());
cudaSafeCall(cudaDeviceSynchronize());
}
@@ -480,7 +480,7 @@ namespace cv { namespace gpu { namespace split_merge {
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall(cudaThreadSynchronize());
cudaSafeCall(cudaDeviceSynchronize());
}

View File

@@ -102,19 +102,19 @@ __device__ uint2 MinSSD(volatile unsigned int *col_ssd_cache, volatile unsigned
//See above: #define COL_SSD_SIZE (BLOCK_W + 2 * RADIUS)
ssd[0] = CalcSSD<RADIUS>(col_ssd_cache, col_ssd + 0 * (BLOCK_W + 2 * RADIUS));
__syncthreads();
__syncthreads();
ssd[1] = CalcSSD<RADIUS>(col_ssd_cache, col_ssd + 1 * (BLOCK_W + 2 * RADIUS));
__syncthreads();
__syncthreads();
ssd[2] = CalcSSD<RADIUS>(col_ssd_cache, col_ssd + 2 * (BLOCK_W + 2 * RADIUS));
__syncthreads();
__syncthreads();
ssd[3] = CalcSSD<RADIUS>(col_ssd_cache, col_ssd + 3 * (BLOCK_W + 2 * RADIUS));
__syncthreads();
__syncthreads();
ssd[4] = CalcSSD<RADIUS>(col_ssd_cache, col_ssd + 4 * (BLOCK_W + 2 * RADIUS));
__syncthreads();
__syncthreads();
ssd[5] = CalcSSD<RADIUS>(col_ssd_cache, col_ssd + 5 * (BLOCK_W + 2 * RADIUS));
__syncthreads();
__syncthreads();
ssd[6] = CalcSSD<RADIUS>(col_ssd_cache, col_ssd + 6 * (BLOCK_W + 2 * RADIUS));
__syncthreads();
__syncthreads();
ssd[7] = CalcSSD<RADIUS>(col_ssd_cache, col_ssd + 7 * (BLOCK_W + 2 * RADIUS));
int mssd = min(min(min(ssd[0], ssd[1]), min(ssd[4], ssd[5])), min(min(ssd[2], ssd[3]), min(ssd[6], ssd[7])));
@@ -327,8 +327,8 @@ template<int RADIUS> void kernel_caller(const DevMem2D& left, const DevMem2D& ri
stereoKernel<RADIUS><<<grid, threads, smem_size, stream>>>(left.data, right.data, left.step, disp, maxdisp);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaThreadSynchronize() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
};
typedef void (*kernel_caller_t)(const DevMem2D& left, const DevMem2D& right, const DevMem2D& disp, int maxdisp, cudaStream_t & stream);
@@ -407,7 +407,7 @@ extern "C" void prefilter_xsobel(const DevMem2D& input, const DevMem2D& output,
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaThreadSynchronize() );
cudaSafeCall( cudaDeviceSynchronize() );
cudaSafeCall( cudaUnbindTexture (texForSobel ) );
}
@@ -531,10 +531,10 @@ extern "C" void postfilter_textureness(const DevMem2D& input, int winsz, float a
textureness_kernel<<<grid, threads, smem_size, stream>>>(disp, winsz, avgTexturenessThreshold);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaThreadSynchronize() );
cudaSafeCall( cudaUnbindTexture (texForTF) );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
cudaSafeCall( cudaUnbindTexture (texForTF) );
}
}}}

View File

@@ -175,7 +175,7 @@ namespace cv { namespace gpu { namespace bp
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaThreadSynchronize() );
cudaSafeCall( cudaDeviceSynchronize() );
}
template <> void comp_data_gpu<uchar, float>(const DevMem2D& left, const DevMem2D& right, const DevMem2D& data, cudaStream_t stream)
{
@@ -189,7 +189,7 @@ namespace cv { namespace gpu { namespace bp
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaThreadSynchronize() );
cudaSafeCall( cudaDeviceSynchronize() );
}
template <> void comp_data_gpu<uchar3, short>(const DevMem2D& left, const DevMem2D& right, const DevMem2D& data, cudaStream_t stream)
@@ -204,7 +204,7 @@ namespace cv { namespace gpu { namespace bp
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaThreadSynchronize() );
cudaSafeCall( cudaDeviceSynchronize() );
}
template <> void comp_data_gpu<uchar3, float>(const DevMem2D& left, const DevMem2D& right, const DevMem2D& data, cudaStream_t stream)
{
@@ -218,7 +218,7 @@ namespace cv { namespace gpu { namespace bp
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaThreadSynchronize() );
cudaSafeCall( cudaDeviceSynchronize() );
}
template <> void comp_data_gpu<uchar4, short>(const DevMem2D& left, const DevMem2D& right, const DevMem2D& data, cudaStream_t stream)
@@ -233,7 +233,7 @@ namespace cv { namespace gpu { namespace bp
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaThreadSynchronize() );
cudaSafeCall( cudaDeviceSynchronize() );
}
template <> void comp_data_gpu<uchar4, float>(const DevMem2D& left, const DevMem2D& right, const DevMem2D& data, cudaStream_t stream)
{
@@ -247,7 +247,7 @@ namespace cv { namespace gpu { namespace bp
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaThreadSynchronize() );
cudaSafeCall( cudaDeviceSynchronize() );
}
///////////////////////////////////////////////////////////////
@@ -287,7 +287,7 @@ namespace cv { namespace gpu { namespace bp
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaThreadSynchronize() );
cudaSafeCall( cudaDeviceSynchronize() );
}
template void data_step_down_gpu<short>(int dst_cols, int dst_rows, int src_rows, const DevMem2D& src, const DevMem2D& dst, cudaStream_t stream);
@@ -337,7 +337,7 @@ namespace cv { namespace gpu { namespace bp
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaThreadSynchronize() );
cudaSafeCall( cudaDeviceSynchronize() );
}
template void level_up_messages_gpu<short>(int dst_idx, int dst_cols, int dst_rows, int src_rows, DevMem2D* mus, DevMem2D* mds, DevMem2D* mls, DevMem2D* mrs, cudaStream_t stream);
@@ -457,7 +457,7 @@ namespace cv { namespace gpu { namespace bp
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaThreadSynchronize() );
cudaSafeCall( cudaDeviceSynchronize() );
}
}
@@ -520,7 +520,7 @@ namespace cv { namespace gpu { namespace bp
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaThreadSynchronize() );
cudaSafeCall( cudaDeviceSynchronize() );
}
template void output_gpu<short>(const DevMem2D& u, const DevMem2D& d, const DevMem2D& l, const DevMem2D& r, const DevMem2D& data, const DevMem2D_<short>& disp, cudaStream_t stream);

View File

@@ -385,7 +385,7 @@ namespace cv { namespace gpu { namespace csbp
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaThreadSynchronize() );
cudaSafeCall( cudaDeviceSynchronize() );
dim3 threads(32, 8, 1);
dim3 grid(1, 1, 1);
@@ -401,7 +401,7 @@ namespace cv { namespace gpu { namespace csbp
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaThreadSynchronize() );
cudaSafeCall( cudaDeviceSynchronize() );
}
template void init_data_cost(int rows, int cols, short* disp_selected_pyr, short* data_cost_selected, size_t msg_step,
@@ -586,7 +586,7 @@ namespace cv { namespace gpu { namespace csbp
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaThreadSynchronize() );
cudaSafeCall( cudaDeviceSynchronize() );
}
template void compute_data_cost(const short* disp_selected_pyr, short* data_cost, size_t msg_step1, size_t msg_step2,
@@ -713,7 +713,7 @@ namespace cv { namespace gpu { namespace csbp
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaThreadSynchronize() );
cudaSafeCall( cudaDeviceSynchronize() );
}
@@ -815,7 +815,7 @@ namespace cv { namespace gpu { namespace csbp
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaThreadSynchronize() );
cudaSafeCall( cudaDeviceSynchronize() );
}
};
@@ -885,7 +885,7 @@ namespace cv { namespace gpu { namespace csbp
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaThreadSynchronize() );
cudaSafeCall( cudaDeviceSynchronize() );
}
template void compute_disp(const short* u, const short* d, const short* l, const short* r, const short* data_cost_selected, const short* disp_selected, size_t msg_step,

View File

@@ -181,7 +181,7 @@ namespace cv { namespace gpu { namespace surf
icvCalcLayerDetAndTrace<<<grid, threads>>>(det, trace);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaThreadSynchronize() );
cudaSafeCall( cudaDeviceSynchronize() );
}
////////////////////////////////////////////////////////////////////////
@@ -338,7 +338,7 @@ namespace cv { namespace gpu { namespace surf
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaThreadSynchronize() );
cudaSafeCall( cudaDeviceSynchronize() );
}
////////////////////////////////////////////////////////////////////////
@@ -483,7 +483,7 @@ namespace cv { namespace gpu { namespace surf
icvInterpolateKeypoint<<<grid, threads>>>(det, maxPosBuffer, featureX, featureY, featureLaplacian, featureSize, featureHessian, featureCounter);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaThreadSynchronize() );
cudaSafeCall( cudaDeviceSynchronize() );
}
////////////////////////////////////////////////////////////////////////
@@ -674,7 +674,7 @@ namespace cv { namespace gpu { namespace surf
icvCalcOrientation<<<grid, threads>>>(featureX, featureY, featureSize, featureDir);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaThreadSynchronize() );
cudaSafeCall( cudaDeviceSynchronize() );
}
////////////////////////////////////////////////////////////////////////
@@ -986,24 +986,24 @@ namespace cv { namespace gpu { namespace surf
compute_descriptors64<<<dim3(nFeatures, 16, 1), dim3(6, 6, 1)>>>(descriptors, featureX, featureY, featureSize, featureDir);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaThreadSynchronize() );
cudaSafeCall( cudaDeviceSynchronize() );
normalize_descriptors<64><<<dim3(nFeatures, 1, 1), dim3(64, 1, 1)>>>(descriptors);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaThreadSynchronize() );
cudaSafeCall( cudaDeviceSynchronize() );
}
else
{
compute_descriptors128<<<dim3(nFeatures, 16, 1), dim3(6, 6, 1)>>>(descriptors, featureX, featureY, featureSize, featureDir);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaThreadSynchronize() );
cudaSafeCall( cudaDeviceSynchronize() );
normalize_descriptors<128><<<dim3(nFeatures, 1, 1), dim3(128, 1, 1)>>>(descriptors);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaThreadSynchronize() );
cudaSafeCall( cudaDeviceSynchronize() );
}
}
}}}