modified according to CUDA 4.0 API updates
@@ -68,19 +68,22 @@ namespace cv { namespace gpu

     template <typename T>
     void blendLinearCaller(int rows, int cols, int cn, const PtrStep_<T> img1, const PtrStep_<T> img2,
-                           const PtrStepf weights1, const PtrStepf weights2, PtrStep_<T> result)
+                           const PtrStepf weights1, const PtrStepf weights2, PtrStep_<T> result, cudaStream_t stream)
     {
         dim3 threads(16, 16);
         dim3 grid(divUp(cols * cn, threads.x), divUp(rows, threads.y));

-        blendLinearKernel<<<grid, threads>>>(rows, cols * cn, cn, img1, img2, weights1, weights2, result);
-        cudaSafeCall(cudaThreadSynchronize());
+        blendLinearKernel<<<grid, threads, 0, stream>>>(rows, cols * cn, cn, img1, img2, weights1, weights2, result);
+        cudaSafeCall( cudaGetLastError() );
+
+        if (stream == 0)
+            cudaSafeCall(cudaDeviceSynchronize());
     }

     template void blendLinearCaller<uchar>(int, int, int, const PtrStep, const PtrStep,
-        const PtrStepf, const PtrStepf, PtrStep);
+        const PtrStepf, const PtrStepf, PtrStep, cudaStream_t stream);
     template void blendLinearCaller<float>(int, int, int, const PtrStepf, const PtrStepf,
-        const PtrStepf, const PtrStepf, PtrStepf);
+        const PtrStepf, const PtrStepf, PtrStepf, cudaStream_t stream);


     __global__ void blendLinearKernel8UC4(int rows, int cols, const PtrStep img1, const PtrStep img2,
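Every hunk in this commit applies the same CUDA 4.0 migration: each caller gains a cudaStream_t parameter, the kernel is launched with the explicit stream argument of the <<<grid, threads, shared_mem, stream>>> syntax, launch errors are picked up immediately with cudaGetLastError(), and the deprecated cudaThreadSynchronize() is replaced by cudaDeviceSynchronize(), now executed only for the default stream (stream == 0) so that callers on a real stream stay asynchronous. A minimal self-contained sketch of the pattern follows; the kernel and caller names are hypothetical, not from the OpenCV sources, and the plain error handling stands in for OpenCV's cudaSafeCall macro.

    #include <cuda_runtime.h>

    // Hypothetical kernel: scales src into dst element-wise.
    __global__ void scaleKernel(int n, const float* src, float* dst, float s)
    {
        int i = blockIdx.x * blockDim.x + threadIdx.x;
        if (i < n)
            dst[i] = s * src[i];
    }

    // Hypothetical caller, shaped like the callers in this commit.
    void scaleCaller(int n, const float* src, float* dst, float s, cudaStream_t stream)
    {
        dim3 threads(256);
        dim3 grid((n + threads.x - 1) / threads.x);

        // The fourth launch parameter selects the stream; 0 is the default stream.
        scaleKernel<<<grid, threads, 0, stream>>>(n, src, dst, s);
        cudaError_t err = cudaGetLastError();   // non-blocking launch check

        // CUDA 4.0 deprecates cudaThreadSynchronize(); block only when the
        // default stream was used, so asynchronous callers return at once.
        if (err == cudaSuccess && stream == 0)
            err = cudaDeviceSynchronize();
        (void)err; // real code would report err, e.g. via cudaSafeCall
    }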
@@ -105,13 +108,16 @@ namespace cv { namespace gpu


     void blendLinearCaller8UC4(int rows, int cols, const PtrStep img1, const PtrStep img2,
-                               const PtrStepf weights1, const PtrStepf weights2, PtrStep result)
+                               const PtrStepf weights1, const PtrStepf weights2, PtrStep result, cudaStream_t stream)
     {
         dim3 threads(16, 16);
         dim3 grid(divUp(cols, threads.x), divUp(rows, threads.y));

-        blendLinearKernel8UC4<<<grid, threads>>>(rows, cols, img1, img2, weights1, weights2, result);
-        cudaSafeCall(cudaThreadSynchronize());
+        blendLinearKernel8UC4<<<grid, threads, 0, stream>>>(rows, cols, img1, img2, weights1, weights2, result);
+        cudaSafeCall( cudaGetLastError() );
+
+        if (stream == 0)
+            cudaSafeCall(cudaDeviceSynchronize());
     }

 }}
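On the host side, the gpu module's public API exposes streams as cv::gpu::Stream; the raw cudaStream_t these callers now expect can be obtained through cv::gpu::StreamAccessor (declared in opencv2/gpu/stream_accessor.hpp in the 2.x tree). A hedged sketch of how a wrapper might forward it; the wrapper function itself is hypothetical, not part of this commit:

    #include <opencv2/gpu/gpu.hpp>
    #include <opencv2/gpu/stream_accessor.hpp>

    // Hypothetical wrapper: forwards a cv::gpu::Stream down to a device-side
    // caller with this commit's (..., cudaStream_t stream) signature style.
    void blendLinearAsync(/* GpuMat arguments elided */ cv::gpu::Stream& s)
    {
        cudaStream_t stream = cv::gpu::StreamAccessor::getStream(s); // 0 for the default stream
        // blendLinearCaller<float>(..., stream);  // returns without blocking when stream != 0
        // The user later synchronizes with s.waitForCompletion().
    }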
@@ -589,7 +589,7 @@ namespace cv { namespace gpu { namespace bfmatcher

     template <int BLOCK_DIM_X, int BLOCK_DIM_Y, typename Dist, typename T, typename Train, typename Mask>
     void matchSimple_caller(const DevMem2D_<T>& queryDescs, const Train& train,
-        const Mask& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance)
+        const Mask& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, cudaStream_t stream)
     {
         StaticAssert<BLOCK_DIM_Y <= 64>::check(); // blockDimY vals must reduce by warp

@@ -597,14 +597,15 @@ namespace cv { namespace gpu { namespace bfmatcher
         dim3 threads(BLOCK_DIM_X, BLOCK_DIM_Y, 1);

         match<BLOCK_DIM_X, BLOCK_DIM_Y, ReduceDescCalculatorSimple<BLOCK_DIM_X, T>, Dist, T>
-            <<<grid, threads>>>(queryDescs, train, mask, trainIdx.data, imgIdx.data, distance.data);
+            <<<grid, threads, 0, stream>>>(queryDescs, train, mask, trainIdx.data, imgIdx.data, distance.data);
         cudaSafeCall( cudaGetLastError() );

-        cudaSafeCall( cudaThreadSynchronize() );
+        if (stream == 0)
+            cudaSafeCall( cudaDeviceSynchronize() );
     }
     template <int BLOCK_DIM_X, int BLOCK_DIM_Y, int MAX_DESCRIPTORS_LEN, bool DESC_LEN_EQ_MAX_LEN, typename Dist, typename T, typename Train, typename Mask>
     void matchCached_caller(const DevMem2D_<T>& queryDescs, const Train& train,
-        const Mask& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance)
+        const Mask& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, cudaStream_t stream)
     {
         StaticAssert<BLOCK_DIM_Y <= 64>::check(); // blockDimY vals must reduce by warp
         StaticAssert<BLOCK_DIM_X * BLOCK_DIM_Y >= MAX_DESCRIPTORS_LEN>::check(); // block size must be at least the descriptors length
@@ -614,10 +615,11 @@ namespace cv { namespace gpu { namespace bfmatcher
         dim3 threads(BLOCK_DIM_X, BLOCK_DIM_Y, 1);

         match<BLOCK_DIM_X, BLOCK_DIM_Y, ReduceDescCalculatorCached<BLOCK_DIM_X, MAX_DESCRIPTORS_LEN, DESC_LEN_EQ_MAX_LEN, T, typename Dist::ValueType>, Dist, T>
-            <<<grid, threads>>>(queryDescs, train, mask, trainIdx.data, imgIdx.data, distance.data);
+            <<<grid, threads, 0, stream>>>(queryDescs, train, mask, trainIdx.data, imgIdx.data, distance.data);
         cudaSafeCall( cudaGetLastError() );

-        cudaSafeCall( cudaThreadSynchronize() );
+        if (stream == 0)
+            cudaSafeCall( cudaDeviceSynchronize() );
     }

     ///////////////////////////////////////////////////////////////////////////////
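The cudaSafeCall wrapper seen throughout is OpenCV's internal error-check macro (it forwards failures to the library's error handler). For reading these hunks, an illustrative stand-in with the same shape is enough; this is not the library's actual definition:

    #include <cstdio>
    #include <cstdlib>
    #include <cuda_runtime.h>

    // Illustrative stand-in for OpenCV's cudaSafeCall: evaluate a runtime-API
    // call, and abort with file/line context if it did not return cudaSuccess.
    #define cudaSafeCall(expr) do {                                     \
            cudaError_t err__ = (expr);                                 \
            if (err__ != cudaSuccess) {                                 \
                fprintf(stderr, "CUDA error: %s at %s:%d\n",            \
                        cudaGetErrorString(err__), __FILE__, __LINE__); \
                exit(EXIT_FAILURE);                                     \
            }                                                           \
        } while (0)

This also shows why cudaSafeCall( cudaGetLastError() ) appears directly after each launch: kernel launches return no status themselves, so querying the last error immediately is the only way to catch a bad launch configuration at the call site.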
@@ -626,167 +628,165 @@ namespace cv { namespace gpu { namespace bfmatcher
     template <typename Dist, typename T, typename Train, typename Mask>
     void matchDispatcher(const DevMem2D_<T>& queryDescs, const Train& train,
         const Mask& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance,
-        bool cc_12)
+        bool cc_12, cudaStream_t stream)
     {
         if (queryDescs.cols < 64)
-            matchCached_caller<16, 16, 64, false, Dist>(queryDescs, train, mask, trainIdx, imgIdx, distance);
+            matchCached_caller<16, 16, 64, false, Dist>(queryDescs, train, mask, trainIdx, imgIdx, distance, stream);
         else if (queryDescs.cols == 64)
-            matchCached_caller<16, 16, 64, true, Dist>(queryDescs, train, mask, trainIdx, imgIdx, distance);
+            matchCached_caller<16, 16, 64, true, Dist>(queryDescs, train, mask, trainIdx, imgIdx, distance, stream);
         else if (queryDescs.cols < 128)
-            matchCached_caller<16, 16, 128, false, Dist>(queryDescs, train, mask, trainIdx, imgIdx, distance);
+            matchCached_caller<16, 16, 128, false, Dist>(queryDescs, train, mask, trainIdx, imgIdx, distance, stream);
         else if (queryDescs.cols == 128)
-            matchCached_caller<16, 16, 128, true, Dist>(queryDescs, train, mask, trainIdx, imgIdx, distance);
+            matchCached_caller<16, 16, 128, true, Dist>(queryDescs, train, mask, trainIdx, imgIdx, distance, stream);
         else if (queryDescs.cols < 256)
-            matchCached_caller<16, 16, 256, false, Dist>(queryDescs, train, mask, trainIdx, imgIdx, distance);
+            matchCached_caller<16, 16, 256, false, Dist>(queryDescs, train, mask, trainIdx, imgIdx, distance, stream);
         else if (queryDescs.cols == 256 && cc_12)
-            matchCached_caller<16, 16, 256, true, Dist>(queryDescs, train, mask, trainIdx, imgIdx, distance);
+            matchCached_caller<16, 16, 256, true, Dist>(queryDescs, train, mask, trainIdx, imgIdx, distance, stream);
         else
-            matchSimple_caller<16, 16, Dist>(queryDescs, train, mask, trainIdx, imgIdx, distance);
-
-        cudaSafeCall( cudaThreadSynchronize() );
+            matchSimple_caller<16, 16, Dist>(queryDescs, train, mask, trainIdx, imgIdx, distance, stream);
     }

     template <typename T>
     void matchSingleL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs,
         const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance,
-        bool cc_12)
+        bool cc_12, cudaStream_t stream)
     {
         SingleTrain<T> train((DevMem2D_<T>)trainDescs);
         if (mask.data)
         {
             SingleMask m(mask);
-            matchDispatcher< L1Dist<T> >((DevMem2D_<T>)queryDescs, train, m, trainIdx, imgIdx, distance, cc_12);
+            matchDispatcher< L1Dist<T> >((DevMem2D_<T>)queryDescs, train, m, trainIdx, imgIdx, distance, cc_12, stream);
         }
         else
         {
-            matchDispatcher< L1Dist<T> >((DevMem2D_<T>)queryDescs, train, WithOutMask(), trainIdx, imgIdx, distance, cc_12);
+            matchDispatcher< L1Dist<T> >((DevMem2D_<T>)queryDescs, train, WithOutMask(), trainIdx, imgIdx, distance, cc_12, stream);
         }
     }

-    template void matchSingleL1_gpu<uchar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12);
-    template void matchSingleL1_gpu<schar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12);
-    template void matchSingleL1_gpu<ushort>(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12);
-    template void matchSingleL1_gpu<short >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12);
-    template void matchSingleL1_gpu<int >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12);
-    template void matchSingleL1_gpu<float >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12);
+    template void matchSingleL1_gpu<uchar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12, cudaStream_t stream);
+    template void matchSingleL1_gpu<schar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12, cudaStream_t stream);
+    template void matchSingleL1_gpu<ushort>(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12, cudaStream_t stream);
+    template void matchSingleL1_gpu<short >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12, cudaStream_t stream);
+    template void matchSingleL1_gpu<int >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12, cudaStream_t stream);
+    template void matchSingleL1_gpu<float >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12, cudaStream_t stream);

     template <typename T>
     void matchSingleL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs,
         const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance,
-        bool cc_12)
+        bool cc_12, cudaStream_t stream)
     {
         SingleTrain<T> train((DevMem2D_<T>)trainDescs);
         if (mask.data)
         {
             SingleMask m(mask);
-            matchDispatcher<L2Dist>((DevMem2D_<T>)queryDescs, train, m, trainIdx, imgIdx, distance, cc_12);
+            matchDispatcher<L2Dist>((DevMem2D_<T>)queryDescs, train, m, trainIdx, imgIdx, distance, cc_12, stream);
         }
         else
         {
-            matchDispatcher<L2Dist>((DevMem2D_<T>)queryDescs, train, WithOutMask(), trainIdx, imgIdx, distance, cc_12);
+            matchDispatcher<L2Dist>((DevMem2D_<T>)queryDescs, train, WithOutMask(), trainIdx, imgIdx, distance, cc_12, stream);
         }
     }

-    template void matchSingleL2_gpu<uchar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12);
-    template void matchSingleL2_gpu<schar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12);
-    template void matchSingleL2_gpu<ushort>(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12);
-    template void matchSingleL2_gpu<short >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12);
-    template void matchSingleL2_gpu<int >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12);
-    template void matchSingleL2_gpu<float >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12);
+    template void matchSingleL2_gpu<uchar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12, cudaStream_t stream);
+    template void matchSingleL2_gpu<schar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12, cudaStream_t stream);
+    template void matchSingleL2_gpu<ushort>(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12, cudaStream_t stream);
+    template void matchSingleL2_gpu<short >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12, cudaStream_t stream);
+    template void matchSingleL2_gpu<int >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12, cudaStream_t stream);
+    template void matchSingleL2_gpu<float >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12, cudaStream_t stream);

     template <typename T>
     void matchSingleHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs,
         const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance,
-        bool cc_12)
+        bool cc_12, cudaStream_t stream)
     {
         SingleTrain<T> train((DevMem2D_<T>)trainDescs);
         if (mask.data)
         {
             SingleMask m(mask);
-            matchDispatcher<HammingDist>((DevMem2D_<T>)queryDescs, train, m, trainIdx, imgIdx, distance, cc_12);
+            matchDispatcher<HammingDist>((DevMem2D_<T>)queryDescs, train, m, trainIdx, imgIdx, distance, cc_12, stream);
         }
         else
         {
-            matchDispatcher<HammingDist>((DevMem2D_<T>)queryDescs, train, WithOutMask(), trainIdx, imgIdx, distance, cc_12);
+            matchDispatcher<HammingDist>((DevMem2D_<T>)queryDescs, train, WithOutMask(), trainIdx, imgIdx, distance, cc_12, stream);
         }
     }

-    template void matchSingleHamming_gpu<uchar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12);
-    template void matchSingleHamming_gpu<schar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12);
-    template void matchSingleHamming_gpu<ushort>(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12);
-    template void matchSingleHamming_gpu<short >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12);
-    template void matchSingleHamming_gpu<int >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12);
+    template void matchSingleHamming_gpu<uchar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12, cudaStream_t stream);
+    template void matchSingleHamming_gpu<schar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12, cudaStream_t stream);
+    template void matchSingleHamming_gpu<ushort>(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12, cudaStream_t stream);
+    template void matchSingleHamming_gpu<short >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12, cudaStream_t stream);
+    template void matchSingleHamming_gpu<int >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12, cudaStream_t stream);

     template <typename T>
     void matchCollectionL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainCollection,
         const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx,
-        const DevMem2Df& distance, bool cc_12)
+        const DevMem2Df& distance, bool cc_12, cudaStream_t stream)
     {
         TrainCollection<T> train((DevMem2D_<T>*)trainCollection.ptr(), trainCollection.cols, queryDescs.cols);
         if (maskCollection.data)
         {
             MaskCollection mask(maskCollection.data);
-            matchDispatcher< L1Dist<T> >((DevMem2D_<T>)queryDescs, train, mask, trainIdx, imgIdx, distance, cc_12);
+            matchDispatcher< L1Dist<T> >((DevMem2D_<T>)queryDescs, train, mask, trainIdx, imgIdx, distance, cc_12, stream);
         }
         else
         {
-            matchDispatcher< L1Dist<T> >((DevMem2D_<T>)queryDescs, train, WithOutMask(), trainIdx, imgIdx, distance, cc_12);
+            matchDispatcher< L1Dist<T> >((DevMem2D_<T>)queryDescs, train, WithOutMask(), trainIdx, imgIdx, distance, cc_12, stream);
         }
     }

-    template void matchCollectionL1_gpu<uchar >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12);
-    template void matchCollectionL1_gpu<schar >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12);
-    template void matchCollectionL1_gpu<ushort>(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12);
-    template void matchCollectionL1_gpu<short >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12);
-    template void matchCollectionL1_gpu<int >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12);
-    template void matchCollectionL1_gpu<float >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12);
+    template void matchCollectionL1_gpu<uchar >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12, cudaStream_t stream);
+    template void matchCollectionL1_gpu<schar >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12, cudaStream_t stream);
+    template void matchCollectionL1_gpu<ushort>(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12, cudaStream_t stream);
+    template void matchCollectionL1_gpu<short >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12, cudaStream_t stream);
+    template void matchCollectionL1_gpu<int >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12, cudaStream_t stream);
+    template void matchCollectionL1_gpu<float >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12, cudaStream_t stream);

     template <typename T>
     void matchCollectionL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainCollection,
         const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx,
-        const DevMem2Df& distance, bool cc_12)
+        const DevMem2Df& distance, bool cc_12, cudaStream_t stream)
     {
         TrainCollection<T> train((DevMem2D_<T>*)trainCollection.ptr(), trainCollection.cols, queryDescs.cols);
         if (maskCollection.data)
         {
             MaskCollection mask(maskCollection.data);
-            matchDispatcher<L2Dist>((DevMem2D_<T>)queryDescs, train, mask, trainIdx, imgIdx, distance, cc_12);
+            matchDispatcher<L2Dist>((DevMem2D_<T>)queryDescs, train, mask, trainIdx, imgIdx, distance, cc_12, stream);
         }
         else
         {
-            matchDispatcher<L2Dist>((DevMem2D_<T>)queryDescs, train, WithOutMask(), trainIdx, imgIdx, distance, cc_12);
+            matchDispatcher<L2Dist>((DevMem2D_<T>)queryDescs, train, WithOutMask(), trainIdx, imgIdx, distance, cc_12, stream);
         }
     }

-    template void matchCollectionL2_gpu<uchar >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12);
-    template void matchCollectionL2_gpu<schar >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12);
-    template void matchCollectionL2_gpu<ushort>(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12);
-    template void matchCollectionL2_gpu<short >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12);
-    template void matchCollectionL2_gpu<int >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12);
-    template void matchCollectionL2_gpu<float >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12);
+    template void matchCollectionL2_gpu<uchar >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12, cudaStream_t stream);
+    template void matchCollectionL2_gpu<schar >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12, cudaStream_t stream);
+    template void matchCollectionL2_gpu<ushort>(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12, cudaStream_t stream);
+    template void matchCollectionL2_gpu<short >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12, cudaStream_t stream);
+    template void matchCollectionL2_gpu<int >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12, cudaStream_t stream);
+    template void matchCollectionL2_gpu<float >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12, cudaStream_t stream);

     template <typename T>
     void matchCollectionHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainCollection,
         const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx,
-        const DevMem2Df& distance, bool cc_12)
+        const DevMem2Df& distance, bool cc_12, cudaStream_t stream)
     {
         TrainCollection<T> train((DevMem2D_<T>*)trainCollection.ptr(), trainCollection.cols, queryDescs.cols);
         if (maskCollection.data)
         {
             MaskCollection mask(maskCollection.data);
-            matchDispatcher<HammingDist>((DevMem2D_<T>)queryDescs, train, mask, trainIdx, imgIdx, distance, cc_12);
+            matchDispatcher<HammingDist>((DevMem2D_<T>)queryDescs, train, mask, trainIdx, imgIdx, distance, cc_12, stream);
         }
         else
         {
-            matchDispatcher<HammingDist>((DevMem2D_<T>)queryDescs, train, WithOutMask(), trainIdx, imgIdx, distance, cc_12);
+            matchDispatcher<HammingDist>((DevMem2D_<T>)queryDescs, train, WithOutMask(), trainIdx, imgIdx, distance, cc_12, stream);
         }
     }

-    template void matchCollectionHamming_gpu<uchar >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12);
-    template void matchCollectionHamming_gpu<schar >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12);
-    template void matchCollectionHamming_gpu<ushort>(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12);
-    template void matchCollectionHamming_gpu<short >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12);
-    template void matchCollectionHamming_gpu<int >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12);
+    template void matchCollectionHamming_gpu<uchar >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12, cudaStream_t stream);
+    template void matchCollectionHamming_gpu<schar >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12, cudaStream_t stream);
+    template void matchCollectionHamming_gpu<ushort>(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12, cudaStream_t stream);
+    template void matchCollectionHamming_gpu<short >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12, cudaStream_t stream);
+    template void matchCollectionHamming_gpu<int >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12, cudaStream_t stream);

     ///////////////////////////////////////////////////////////////////////////////////
     //////////////////////////////////// Knn Match ////////////////////////////////////
@@ -833,16 +833,17 @@ namespace cv { namespace gpu { namespace bfmatcher

     template <int BLOCK_DIM_X, int BLOCK_DIM_Y, typename Dist, typename T, typename Mask>
     void calcDistance_caller(const DevMem2D_<T>& queryDescs, const DevMem2D_<T>& trainDescs,
-        const Mask& mask, const DevMem2Df& distance)
+        const Mask& mask, const DevMem2Df& distance, cudaStream_t stream)
     {
         dim3 threads(BLOCK_DIM_X, BLOCK_DIM_Y, 1);
         dim3 grid(queryDescs.rows, divUp(trainDescs.rows, BLOCK_DIM_Y), 1);

-        calcDistance<BLOCK_DIM_X, BLOCK_DIM_Y, Dist, T><<<grid, threads>>>(
+        calcDistance<BLOCK_DIM_X, BLOCK_DIM_Y, Dist, T><<<grid, threads, 0, stream>>>(
             queryDescs, trainDescs, mask, distance);
         cudaSafeCall( cudaGetLastError() );

-        cudaSafeCall( cudaThreadSynchronize() );
+        if (stream == 0)
+            cudaSafeCall( cudaDeviceSynchronize() );
     }

     ///////////////////////////////////////////////////////////////////////////////
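The grid computations above rely on divUp, the usual ceiling-division helper for sizing a grid so that all work items are covered by fixed-size blocks; OpenCV defines an equivalent of:

    // Ceiling division: number of 'grain'-sized blocks needed to cover 'total'.
    static inline int divUp(int total, int grain)
    {
        return (total + grain - 1) / grain;
    }

For example, divUp(1000, 256) is 4, so a grid of four 256-thread blocks covers 1000 elements, with the out-of-range threads masked off inside the kernel by a bounds check.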
@@ -1010,105 +1011,106 @@ namespace cv { namespace gpu { namespace bfmatcher
     // find knn match kernel caller

     template <int BLOCK_SIZE>
-    void findKnnMatch_caller(int knn, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist)
+    void findKnnMatch_caller(int knn, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist, cudaStream_t stream)
     {
         dim3 threads(BLOCK_SIZE, 1, 1);
         dim3 grid(trainIdx.rows, 1, 1);

         for (int i = 0; i < knn; ++i)
         {
-            findBestMatch<BLOCK_SIZE><<<grid, threads>>>(allDist, i, trainIdx, distance);
+            findBestMatch<BLOCK_SIZE><<<grid, threads, 0, stream>>>(allDist, i, trainIdx, distance);
             cudaSafeCall( cudaGetLastError() );
         }

-        cudaSafeCall( cudaThreadSynchronize() );
+        if (stream == 0)
+            cudaSafeCall( cudaDeviceSynchronize() );
     }

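Note that the loop above issues knn dependent kernel launches back to back with no synchronization between them: launches pushed into one stream execute in submission order, so each findBestMatch pass sees the previous pass's writes, while the per-iteration cudaGetLastError() only checks the launch and never blocks. A compressed sketch of the idea (stepKernel is a hypothetical name, not from the sources):

    // Launches on a single stream are serialized by the device, so a chain of
    // dependent passes needs at most one sync at the very end.
    for (int i = 0; i < knn; ++i)
    {
        stepKernel<<<grid, threads, 0, stream>>>(i /* pass index */);
        cudaSafeCall( cudaGetLastError() );      // launch check, non-blocking
    }
    if (stream == 0)
        cudaSafeCall( cudaDeviceSynchronize() ); // only the default stream blocks here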
     ///////////////////////////////////////////////////////////////////////////////
     // knn match caller

     template <typename Dist, typename T, typename Mask>
-    void calcDistanceDispatcher(const DevMem2D_<T>& queryDescs, const DevMem2D_<T>& trainDescs, const Mask& mask, const DevMem2Df& allDist)
+    void calcDistanceDispatcher(const DevMem2D_<T>& queryDescs, const DevMem2D_<T>& trainDescs, const Mask& mask, const DevMem2Df& allDist, cudaStream_t stream)
     {
-        calcDistance_caller<16, 16, Dist>(queryDescs, trainDescs, mask, allDist);
+        calcDistance_caller<16, 16, Dist>(queryDescs, trainDescs, mask, allDist, stream);
     }

-    void findKnnMatchDispatcher(int knn, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist)
+    void findKnnMatchDispatcher(int knn, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist, cudaStream_t stream)
     {
-        findKnnMatch_caller<256>(knn, trainIdx, distance, allDist);
+        findKnnMatch_caller<256>(knn, trainIdx, distance, allDist, stream);
     }

     template <typename T>
     void knnMatchL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn,
-        const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist)
+        const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist, cudaStream_t stream)
     {
         if (mask.data)
         {
-            calcDistanceDispatcher< L1Dist<T> >((DevMem2D_<T>)queryDescs, (DevMem2D_<T>)trainDescs, SingleMask(mask), allDist);
+            calcDistanceDispatcher< L1Dist<T> >((DevMem2D_<T>)queryDescs, (DevMem2D_<T>)trainDescs, SingleMask(mask), allDist, stream);
         }
         else
         {
-            calcDistanceDispatcher< L1Dist<T> >((DevMem2D_<T>)queryDescs, (DevMem2D_<T>)trainDescs, WithOutMask(), allDist);
+            calcDistanceDispatcher< L1Dist<T> >((DevMem2D_<T>)queryDescs, (DevMem2D_<T>)trainDescs, WithOutMask(), allDist, stream);
         }

-        findKnnMatchDispatcher(knn, trainIdx, distance, allDist);
+        findKnnMatchDispatcher(knn, trainIdx, distance, allDist, stream);
     }

-    template void knnMatchL1_gpu<uchar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist);
-    template void knnMatchL1_gpu<schar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist);
-    template void knnMatchL1_gpu<ushort>(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist);
-    template void knnMatchL1_gpu<short >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist);
-    template void knnMatchL1_gpu<int >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist);
-    template void knnMatchL1_gpu<float >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist);
+    template void knnMatchL1_gpu<uchar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist, cudaStream_t stream);
+    template void knnMatchL1_gpu<schar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist, cudaStream_t stream);
+    template void knnMatchL1_gpu<ushort>(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist, cudaStream_t stream);
+    template void knnMatchL1_gpu<short >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist, cudaStream_t stream);
+    template void knnMatchL1_gpu<int >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist, cudaStream_t stream);
+    template void knnMatchL1_gpu<float >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist, cudaStream_t stream);

     template <typename T>
     void knnMatchL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn,
-        const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist)
+        const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist, cudaStream_t stream)
     {
         if (mask.data)
         {
             calcDistanceDispatcher<L2Dist>((DevMem2D_<T>)queryDescs, (DevMem2D_<T>)trainDescs,
-                SingleMask(mask), allDist);
+                SingleMask(mask), allDist, stream);
         }
         else
         {
             calcDistanceDispatcher<L2Dist>((DevMem2D_<T>)queryDescs, (DevMem2D_<T>)trainDescs,
-                WithOutMask(), allDist);
+                WithOutMask(), allDist, stream);
         }

-        findKnnMatchDispatcher(knn, trainIdx, distance, allDist);
+        findKnnMatchDispatcher(knn, trainIdx, distance, allDist, stream);
     }

-    template void knnMatchL2_gpu<uchar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist);
-    template void knnMatchL2_gpu<schar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist);
-    template void knnMatchL2_gpu<ushort>(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist);
-    template void knnMatchL2_gpu<short >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist);
-    template void knnMatchL2_gpu<int >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist);
-    template void knnMatchL2_gpu<float >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist);
+    template void knnMatchL2_gpu<uchar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist, cudaStream_t stream);
+    template void knnMatchL2_gpu<schar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist, cudaStream_t stream);
+    template void knnMatchL2_gpu<ushort>(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist, cudaStream_t stream);
+    template void knnMatchL2_gpu<short >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist, cudaStream_t stream);
+    template void knnMatchL2_gpu<int >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist, cudaStream_t stream);
+    template void knnMatchL2_gpu<float >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist, cudaStream_t stream);

     template <typename T>
     void knnMatchHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn,
-        const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist)
+        const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist, cudaStream_t stream)
     {
         if (mask.data)
         {
             calcDistanceDispatcher<HammingDist>((DevMem2D_<T>)queryDescs, (DevMem2D_<T>)trainDescs,
-                SingleMask(mask), allDist);
+                SingleMask(mask), allDist, stream);
         }
         else
         {
             calcDistanceDispatcher<HammingDist>((DevMem2D_<T>)queryDescs, (DevMem2D_<T>)trainDescs,
-                WithOutMask(), allDist);
+                WithOutMask(), allDist, stream);
         }

-        findKnnMatchDispatcher(knn, trainIdx, distance, allDist);
+        findKnnMatchDispatcher(knn, trainIdx, distance, allDist, stream);
     }

-    template void knnMatchHamming_gpu<uchar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist);
-    template void knnMatchHamming_gpu<schar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist);
-    template void knnMatchHamming_gpu<ushort>(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist);
-    template void knnMatchHamming_gpu<short >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist);
-    template void knnMatchHamming_gpu<int >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist);
+    template void knnMatchHamming_gpu<uchar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist, cudaStream_t stream);
+    template void knnMatchHamming_gpu<schar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist, cudaStream_t stream);
+    template void knnMatchHamming_gpu<ushort>(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist, cudaStream_t stream);
+    template void knnMatchHamming_gpu<short >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist, cudaStream_t stream);
+    template void knnMatchHamming_gpu<int >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist, cudaStream_t stream);

     ///////////////////////////////////////////////////////////////////////////////////
     /////////////////////////////////// Radius Match //////////////////////////////////
@@ -1166,16 +1168,17 @@ namespace cv { namespace gpu { namespace bfmatcher
     template <int BLOCK_DIM_X, int BLOCK_DIM_Y, typename Dist, typename T, typename Mask>
     void radiusMatch_caller(const DevMem2D_<T>& queryDescs, const DevMem2D_<T>& trainDescs,
         float maxDistance, const Mask& mask, const DevMem2Di& trainIdx, unsigned int* nMatches,
-        const DevMem2Df& distance)
+        const DevMem2Df& distance, cudaStream_t stream)
     {
         dim3 threads(BLOCK_DIM_X, BLOCK_DIM_Y, 1);
         dim3 grid(queryDescs.rows, divUp(trainDescs.rows, BLOCK_DIM_Y), 1);

-        radiusMatch<BLOCK_DIM_X, BLOCK_DIM_Y, Dist, T><<<grid, threads>>>(
+        radiusMatch<BLOCK_DIM_X, BLOCK_DIM_Y, Dist, T><<<grid, threads, 0, stream>>>(
             queryDescs, trainDescs, maxDistance, mask, trainIdx, nMatches, distance);
         cudaSafeCall( cudaGetLastError() );

-        cudaSafeCall( cudaThreadSynchronize() );
+        if (stream == 0)
+            cudaSafeCall( cudaDeviceSynchronize() );
     }

     ///////////////////////////////////////////////////////////////////////////////
@@ -1184,77 +1187,77 @@ namespace cv { namespace gpu { namespace bfmatcher
     template <typename Dist, typename T, typename Mask>
     void radiusMatchDispatcher(const DevMem2D_<T>& queryDescs, const DevMem2D_<T>& trainDescs,
         float maxDistance, const Mask& mask, const DevMem2Di& trainIdx, unsigned int* nMatches,
-        const DevMem2Df& distance)
+        const DevMem2Df& distance, cudaStream_t stream)
     {
         radiusMatch_caller<16, 16, Dist>(queryDescs, trainDescs, maxDistance, mask,
-            trainIdx, nMatches, distance);
+            trainIdx, nMatches, distance, stream);
     }

     template <typename T>
     void radiusMatchL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance,
-        const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance)
+        const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance, cudaStream_t stream)
     {
         if (mask.data)
         {
             radiusMatchDispatcher< L1Dist<T> >((DevMem2D_<T>)queryDescs, (DevMem2D_<T>)trainDescs,
-                maxDistance, SingleMask(mask), trainIdx, nMatches, distance);
+                maxDistance, SingleMask(mask), trainIdx, nMatches, distance, stream);
         }
         else
         {
             radiusMatchDispatcher< L1Dist<T> >((DevMem2D_<T>)queryDescs, (DevMem2D_<T>)trainDescs,
-                maxDistance, WithOutMask(), trainIdx, nMatches, distance);
+                maxDistance, WithOutMask(), trainIdx, nMatches, distance, stream);
         }
     }

-    template void radiusMatchL1_gpu<uchar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance);
-    template void radiusMatchL1_gpu<schar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance);
-    template void radiusMatchL1_gpu<ushort>(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance);
-    template void radiusMatchL1_gpu<short >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance);
-    template void radiusMatchL1_gpu<int >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance);
-    template void radiusMatchL1_gpu<float >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance);
+    template void radiusMatchL1_gpu<uchar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance, cudaStream_t stream);
+    template void radiusMatchL1_gpu<schar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance, cudaStream_t stream);
+    template void radiusMatchL1_gpu<ushort>(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance, cudaStream_t stream);
+    template void radiusMatchL1_gpu<short >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance, cudaStream_t stream);
+    template void radiusMatchL1_gpu<int >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance, cudaStream_t stream);
+    template void radiusMatchL1_gpu<float >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance, cudaStream_t stream);

     template <typename T>
     void radiusMatchL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance,
-        const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance)
+        const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance, cudaStream_t stream)
     {
         if (mask.data)
         {
             radiusMatchDispatcher<L2Dist>((DevMem2D_<T>)queryDescs, (DevMem2D_<T>)trainDescs,
-                maxDistance, SingleMask(mask), trainIdx, nMatches, distance);
+                maxDistance, SingleMask(mask), trainIdx, nMatches, distance, stream);
         }
         else
         {
             radiusMatchDispatcher<L2Dist>((DevMem2D_<T>)queryDescs, (DevMem2D_<T>)trainDescs,
-                maxDistance, WithOutMask(), trainIdx, nMatches, distance);
+                maxDistance, WithOutMask(), trainIdx, nMatches, distance, stream);
         }
     }

-    template void radiusMatchL2_gpu<uchar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance);
-    template void radiusMatchL2_gpu<schar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance);
-    template void radiusMatchL2_gpu<ushort>(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance);
-    template void radiusMatchL2_gpu<short >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance);
-    template void radiusMatchL2_gpu<int >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance);
-    template void radiusMatchL2_gpu<float >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance);
+    template void radiusMatchL2_gpu<uchar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance, cudaStream_t stream);
+    template void radiusMatchL2_gpu<schar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance, cudaStream_t stream);
+    template void radiusMatchL2_gpu<ushort>(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance, cudaStream_t stream);
+    template void radiusMatchL2_gpu<short >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance, cudaStream_t stream);
+    template void radiusMatchL2_gpu<int >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance, cudaStream_t stream);
+    template void radiusMatchL2_gpu<float >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance, cudaStream_t stream);

     template <typename T>
     void radiusMatchHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance,
-        const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance)
+        const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance, cudaStream_t stream)
     {
         if (mask.data)
         {
             radiusMatchDispatcher<HammingDist>((DevMem2D_<T>)queryDescs, (DevMem2D_<T>)trainDescs,
-                maxDistance, SingleMask(mask), trainIdx, nMatches, distance);
+                maxDistance, SingleMask(mask), trainIdx, nMatches, distance, stream);
         }
         else
         {
             radiusMatchDispatcher<HammingDist>((DevMem2D_<T>)queryDescs, (DevMem2D_<T>)trainDescs,
-                maxDistance, WithOutMask(), trainIdx, nMatches, distance);
+                maxDistance, WithOutMask(), trainIdx, nMatches, distance, stream);
         }
     }

-    template void radiusMatchHamming_gpu<uchar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance);
-    template void radiusMatchHamming_gpu<schar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance);
-    template void radiusMatchHamming_gpu<ushort>(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance);
-    template void radiusMatchHamming_gpu<short >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance);
-    template void radiusMatchHamming_gpu<int >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance);
+    template void radiusMatchHamming_gpu<uchar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance, cudaStream_t stream);
+    template void radiusMatchHamming_gpu<schar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance, cudaStream_t stream);
+    template void radiusMatchHamming_gpu<ushort>(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance, cudaStream_t stream);
+    template void radiusMatchHamming_gpu<short >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance, cudaStream_t stream);
+    template void radiusMatchHamming_gpu<int >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance, cudaStream_t stream);
 }}}

@@ -184,7 +184,9 @@ namespace cv { namespace gpu

         computeHypothesisScoresKernel<<<grid, threads, smem_size>>>(
                 num_points, object, image, dist_threshold, hypothesis_scores);
-        cudaSafeCall(cudaThreadSynchronize());
+        cudaSafeCall( cudaGetLastError() );
+
+        cudaSafeCall( cudaDeviceSynchronize() );
     }
 } // namespace solvepnp_ransac

@@ -64,19 +64,19 @@ namespace cv { namespace gpu { namespace mathfunc
|
||||
};
|
||||
|
||||
template <typename T1, typename T2>
|
||||
inline void compare_ne(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst)
|
||||
inline void compare_ne(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, cudaStream_t stream)
|
||||
{
|
||||
NotEqual<T1, T2> op;
|
||||
transform(static_cast< DevMem2D_<T1> >(src1), static_cast< DevMem2D_<T2> >(src2), dst, op, 0);
|
||||
transform(static_cast< DevMem2D_<T1> >(src1), static_cast< DevMem2D_<T2> >(src2), dst, op, stream);
|
||||
}
|
||||
|
||||
void compare_ne_8uc4(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst)
|
||||
void compare_ne_8uc4(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, cudaStream_t stream)
|
||||
{
|
||||
compare_ne<uint, uint>(src1, src2, dst);
|
||||
compare_ne<uint, uint>(src1, src2, dst, stream);
|
||||
}
|
||||
void compare_ne_32f(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst)
|
||||
void compare_ne_32f(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, cudaStream_t stream)
|
||||
{
|
||||
compare_ne<float, float>(src1, src2, dst);
|
||||
compare_ne<float, float>(src1, src2, dst, stream);
|
||||
}
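
compare_ne_8uc4 reinterprets each 8UC4 pixel as a single 32-bit uint, so one NotEqual<uint, uint> evaluation covers all four channels at once. For reference, a hedged sketch of what such an element-wise functor looks like (the actual definition lives elsewhere in mathfunc and may differ):

    template <typename T1, typename T2>
    struct NotEqual
    {
        __device__ __forceinline__ uchar operator()(T1 a, T2 b) const
        {
            return static_cast<uchar>(-(a != b));   // 0xFF where values differ, 0x00 where equal
        }
    };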


@@ -133,7 +133,7 @@ namespace cv { namespace gpu { namespace mathfunc
        cudaSafeCall( cudaGetLastError() );

        if (stream == 0)
            cudaSafeCall(cudaThreadSynchronize());
            cudaSafeCall( cudaDeviceSynchronize() );
    }


@@ -165,7 +165,7 @@ namespace cv { namespace gpu { namespace mathfunc
        cudaSafeCall( cudaGetLastError() );

        if (stream == 0)
            cudaSafeCall(cudaThreadSynchronize());
            cudaSafeCall( cudaDeviceSynchronize() );
    }


@@ -256,7 +256,7 @@ namespace cv { namespace gpu { namespace mathfunc
        cudaSafeCall( cudaGetLastError() );

        if (stream == 0)
            cudaSafeCall(cudaThreadSynchronize());
            cudaSafeCall( cudaDeviceSynchronize() );
    }


@@ -290,7 +290,7 @@ namespace cv { namespace gpu { namespace mathfunc
        cudaSafeCall( cudaGetLastError() );

        if (stream == 0)
            cudaSafeCall(cudaThreadSynchronize());
            cudaSafeCall( cudaDeviceSynchronize() );
    }


@@ -93,9 +93,9 @@ namespace filter_krnls
        typedef typename SmemType<T>::smem_t smem_t;

        __shared__ smem_t smem[BLOCK_DIM_Y * BLOCK_DIM_X * 3];

        const int x = BLOCK_DIM_X * blockIdx.x + threadIdx.x;
        const int y = BLOCK_DIM_Y * blockIdx.y + threadIdx.y;

        const int x = BLOCK_DIM_X * blockIdx.x + threadIdx.x;
        const int y = BLOCK_DIM_Y * blockIdx.y + threadIdx.y;

        smem_t* sDataRow = smem + threadIdx.y * BLOCK_DIM_X * 3;

@@ -129,7 +129,7 @@ namespace filter_krnls
namespace cv { namespace gpu { namespace filters
{
    template <int ksize, typename T, typename D, template<typename> class B>
    void linearRowFilter_caller(const DevMem2D_<T>& src, const DevMem2D_<D>& dst, int anchor)
    void linearRowFilter_caller(const DevMem2D_<T>& src, const DevMem2D_<D>& dst, int anchor, cudaStream_t stream)
    {
        dim3 threads(BLOCK_DIM_X, BLOCK_DIM_Y);
        dim3 grid(divUp(src.cols, BLOCK_DIM_X), divUp(src.rows, BLOCK_DIM_Y));
@@ -143,16 +143,17 @@ namespace cv { namespace gpu { namespace filters
                "try bigger image or another border extrapolation mode", __FILE__, __LINE__);
        }

        filter_krnls::linearRowFilter<ksize, T, D><<<grid, threads>>>(src, dst, anchor, b);
        filter_krnls::linearRowFilter<ksize, T, D><<<grid, threads, 0, stream>>>(src, dst, anchor, b);
        cudaSafeCall( cudaGetLastError() );

        cudaSafeCall( cudaThreadSynchronize() );
        if (stream == 0)
            cudaSafeCall( cudaDeviceSynchronize() );
    }

    template <typename T, typename D>
    void linearRowFilter_gpu(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type)
    void linearRowFilter_gpu(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream)
    {
        typedef void (*caller_t)(const DevMem2D_<T>& src, const DevMem2D_<D>& dst, int anchor);
        typedef void (*caller_t)(const DevMem2D_<T>& src, const DevMem2D_<D>& dst, int anchor, cudaStream_t stream);
        static const caller_t callers[3][17] =
        {
            {
@@ -173,7 +174,7 @@ namespace cv { namespace gpu { namespace filters
                linearRowFilter_caller<14, T, D, BrdRowReflect101>,
                linearRowFilter_caller<15, T, D, BrdRowReflect101>,
                linearRowFilter_caller<16, T, D, BrdRowReflect101>,
            },
            },
            {
                0,
                linearRowFilter_caller<1 , T, D, BrdRowReplicate>,
@@ -192,7 +193,7 @@ namespace cv { namespace gpu { namespace filters
                linearRowFilter_caller<14, T, D, BrdRowReplicate>,
                linearRowFilter_caller<15, T, D, BrdRowReplicate>,
                linearRowFilter_caller<16, T, D, BrdRowReplicate>,
            },
            },
            {
                0,
                linearRowFilter_caller<1 , T, D, BrdRowConstant>,
@@ -216,15 +217,15 @@ namespace cv { namespace gpu { namespace filters

        loadLinearKernel(kernel, ksize);

        callers[brd_type][ksize]((DevMem2D_<T>)src, (DevMem2D_<D>)dst, anchor);
        callers[brd_type][ksize]((DevMem2D_<T>)src, (DevMem2D_<D>)dst, anchor, stream);
    }

    template void linearRowFilter_gpu<uchar , float >(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type);
    template void linearRowFilter_gpu<uchar4, float4>(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type);
    template void linearRowFilter_gpu<short , float >(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type);
    template void linearRowFilter_gpu<short2, float2>(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type);
    template void linearRowFilter_gpu<int   , float >(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type);
    template void linearRowFilter_gpu<float , float >(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type);
    template void linearRowFilter_gpu<uchar , float >(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
    template void linearRowFilter_gpu<uchar4, float4>(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
    template void linearRowFilter_gpu<short , float >(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
    template void linearRowFilter_gpu<short2, float2>(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
    template void linearRowFilter_gpu<int   , float >(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
    template void linearRowFilter_gpu<float , float >(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
}}}
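
The caller table is indexed first by border mode and then by kernel size, so dispatch is a constant-time lookup and every (ksize, border) pair gets a fully specialized kernel. A hedged call-site sketch, with the border indices inferred from the table order above (0 = Reflect101, 1 = Replicate, 2 = Constant):

    // Hypothetical call site: 5-tap row filter, replicate border, default (blocking) stream.
    linearRowFilter_gpu<uchar, float>(src, dst, kernel, /*ksize=*/5, /*anchor=*/2, /*brd_type=*/1, /*stream=*/0);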

namespace filter_krnls
@@ -233,9 +234,9 @@ namespace filter_krnls
    __global__ void linearColumnFilter(const DevMem2D_<T> src, PtrStep_<D> dst, int anchor, const B b)
    {
        __shared__ T smem[BLOCK_DIM_Y * BLOCK_DIM_X * 3];

        const int x = BLOCK_DIM_X * blockIdx.x + threadIdx.x;
        const int y = BLOCK_DIM_Y * blockIdx.y + threadIdx.y;

        const int x = BLOCK_DIM_X * blockIdx.x + threadIdx.x;
        const int y = BLOCK_DIM_Y * blockIdx.y + threadIdx.y;

        T* sDataColumn = smem + threadIdx.x;

@@ -269,7 +270,7 @@ namespace filter_krnls
namespace cv { namespace gpu { namespace filters
{
    template <int ksize, typename T, typename D, template<typename> class B>
    void linearColumnFilter_caller(const DevMem2D_<T>& src, const DevMem2D_<D>& dst, int anchor)
    void linearColumnFilter_caller(const DevMem2D_<T>& src, const DevMem2D_<D>& dst, int anchor, cudaStream_t stream)
    {
        dim3 threads(BLOCK_DIM_X, BLOCK_DIM_Y);
        dim3 grid(divUp(src.cols, BLOCK_DIM_X), divUp(src.rows, BLOCK_DIM_Y));
@@ -282,16 +283,17 @@ namespace cv { namespace gpu { namespace filters
                "try bigger image or another border extrapolation mode", __FILE__, __LINE__);
        }

        filter_krnls::linearColumnFilter<ksize, T, D><<<grid, threads>>>(src, dst, anchor, b);
        filter_krnls::linearColumnFilter<ksize, T, D><<<grid, threads, 0, stream>>>(src, dst, anchor, b);
        cudaSafeCall( cudaGetLastError() );

        cudaSafeCall( cudaThreadSynchronize() );
        if (stream == 0)
            cudaSafeCall( cudaDeviceSynchronize() );
    }

    template <typename T, typename D>
    void linearColumnFilter_gpu(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type)
    void linearColumnFilter_gpu(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream)
    {
        typedef void (*caller_t)(const DevMem2D_<T>& src, const DevMem2D_<D>& dst, int anchor);
        typedef void (*caller_t)(const DevMem2D_<T>& src, const DevMem2D_<D>& dst, int anchor, cudaStream_t stream);
        static const caller_t callers[3][17] =
        {
            {
@@ -312,7 +314,7 @@ namespace cv { namespace gpu { namespace filters
                linearColumnFilter_caller<14, T, D, BrdColReflect101>,
                linearColumnFilter_caller<15, T, D, BrdColReflect101>,
                linearColumnFilter_caller<16, T, D, BrdColReflect101>,
            },
            },
            {
                0,
                linearColumnFilter_caller<1 , T, D, BrdColReplicate>,
@@ -331,7 +333,7 @@ namespace cv { namespace gpu { namespace filters
                linearColumnFilter_caller<14, T, D, BrdColReplicate>,
                linearColumnFilter_caller<15, T, D, BrdColReplicate>,
                linearColumnFilter_caller<16, T, D, BrdColReplicate>,
            },
            },
            {
                0,
                linearColumnFilter_caller<1 , T, D, BrdColConstant>,
@@ -355,15 +357,15 @@ namespace cv { namespace gpu { namespace filters

        loadLinearKernel(kernel, ksize);

        callers[brd_type][ksize]((DevMem2D_<T>)src, (DevMem2D_<D>)dst, anchor);
        callers[brd_type][ksize]((DevMem2D_<T>)src, (DevMem2D_<D>)dst, anchor, stream);
    }

    template void linearColumnFilter_gpu<float , uchar >(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type);
    template void linearColumnFilter_gpu<float4, uchar4>(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type);
    template void linearColumnFilter_gpu<float , short >(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type);
    template void linearColumnFilter_gpu<float2, short2>(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type);
    template void linearColumnFilter_gpu<float , int   >(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type);
    template void linearColumnFilter_gpu<float , float >(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type);
    template void linearColumnFilter_gpu<float , uchar >(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
    template void linearColumnFilter_gpu<float4, uchar4>(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
    template void linearColumnFilter_gpu<float , short >(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
    template void linearColumnFilter_gpu<float2, short2>(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
    template void linearColumnFilter_gpu<float , int   >(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
    template void linearColumnFilter_gpu<float , float >(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
}}}
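
Row and column passes compose into a separable 2-D convolution, and because both now accept a stream they can be chained without an implicit sync between passes. A hedged sketch under that assumption, using only template combinations instantiated above (assumes we are inside cv::gpu::filters):

    // Hypothetical separable filter: horizontal pass into a float buffer,
    // vertical pass back to uchar, both queued on the same user stream.
    void separableFilterSketch(const DevMem2D& src, const DevMem2D& buf_f32, const DevMem2D& dst,
                               const float krow[], const float kcol[], int ksize, cudaStream_t stream)
    {
        int anchor = ksize / 2;
        linearRowFilter_gpu<uchar, float>(src, buf_f32, krow, ksize, anchor, /*brd_type=*/0, stream);
        linearColumnFilter_gpu<float, uchar>(buf_f32, dst, kcol, ksize, anchor, /*brd_type=*/0, stream);
    }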

/////////////////////////////////////////////////////////////////////////////////////////////////
@@ -390,10 +392,10 @@ namespace cv { namespace gpu { namespace bf
        cudaSafeCall( cudaMemcpyToSymbol(bf_krnls::ctable_space, &table_space.data, sizeof(table_space.data)) );
        size_t table_space_step = table_space.step / sizeof(float);
        cudaSafeCall( cudaMemcpyToSymbol(bf_krnls::ctable_space_step, &table_space_step, sizeof(size_t)) );


        cudaSafeCall( cudaMemcpyToSymbol(bf_krnls::cndisp, &ndisp, sizeof(int)) );
        cudaSafeCall( cudaMemcpyToSymbol(bf_krnls::cradius, &radius, sizeof(int)) );


        cudaSafeCall( cudaMemcpyToSymbol(bf_krnls::cedge_disc, &edge_disc, sizeof(short)) );
        cudaSafeCall( cudaMemcpyToSymbol(bf_krnls::cmax_disc, &max_disc, sizeof(short)) );
    }
@@ -538,10 +540,10 @@ namespace cv { namespace gpu { namespace bf
                break;
            default:
                cv::gpu::error("Unsupported channels count", __FILE__, __LINE__);
            }
        }

        if (stream != 0)
            cudaSafeCall( cudaThreadSynchronize() );
            cudaSafeCall( cudaDeviceSynchronize() );
    }

    void bilateral_filter_gpu(const DevMem2D& disp, const DevMem2D& img, int channels, int iters, cudaStream_t stream)

@@ -220,7 +220,7 @@ void compute_hists(int nbins, int block_stride_x, int block_stride_y,
        img_block_width, grad, qangle, scale, block_hists);
    cudaSafeCall( cudaGetLastError() );

    cudaSafeCall(cudaThreadSynchronize());
    cudaSafeCall( cudaDeviceSynchronize() );
}


@@ -324,7 +324,7 @@ void normalize_hists(int nbins, int block_stride_x, int block_stride_y,

    cudaSafeCall( cudaGetLastError() );

    cudaSafeCall(cudaThreadSynchronize());
    cudaSafeCall( cudaDeviceSynchronize() );
}


@@ -418,7 +418,7 @@ void classify_hists(int win_height, int win_width, int block_stride_y, int block
        block_hists, coefs, free_coef, threshold, labels);
    cudaSafeCall( cudaGetLastError() );

    cudaSafeCall(cudaThreadSynchronize());
    cudaSafeCall( cudaDeviceSynchronize() );
}

//----------------------------------------------------------------------------
@@ -463,7 +463,7 @@ void extract_descrs_by_rows(int win_height, int win_width, int block_stride_y, i
        img_block_width, win_block_stride_x, win_block_stride_y, block_hists, descriptors);
    cudaSafeCall( cudaGetLastError() );

    cudaSafeCall(cudaThreadSynchronize());
    cudaSafeCall( cudaDeviceSynchronize() );
}


@@ -512,7 +512,7 @@ void extract_descrs_by_cols(int win_height, int win_width, int block_stride_y, i
        img_block_width, win_block_stride_x, win_block_stride_y, block_hists, descriptors);
    cudaSafeCall( cudaGetLastError() );

    cudaSafeCall(cudaThreadSynchronize());
    cudaSafeCall( cudaDeviceSynchronize() );
}

//----------------------------------------------------------------------------
@@ -636,7 +636,8 @@ void compute_gradients_8UC4(int nbins, int height, int width, const DevMem2D& im
        compute_gradients_8UC4_kernel<nthreads, 0><<<gdim, bdim>>>(height, width, img, angle_scale, grad, qangle);

    cudaSafeCall( cudaGetLastError() );
    cudaSafeCall(cudaThreadSynchronize());

    cudaSafeCall( cudaDeviceSynchronize() );
}

template <int nthreads, int correct_gamma>
@@ -707,7 +708,8 @@ void compute_gradients_8UC1(int nbins, int height, int width, const DevMem2D& im
        compute_gradients_8UC1_kernel<nthreads, 0><<<gdim, bdim>>>(height, width, img, angle_scale, grad, qangle);

    cudaSafeCall( cudaGetLastError() );
    cudaSafeCall(cudaThreadSynchronize());

    cudaSafeCall( cudaDeviceSynchronize() );
}


@@ -765,7 +767,9 @@ static void resize_for_hog(const DevMem2D& src, DevMem2D dst, TEX& tex)

    resize_for_hog_kernel<<<grid, threads>>>(sx, sy, (DevMem2D_<T>)dst, colOfs);
    cudaSafeCall( cudaGetLastError() );
    cudaSafeCall( cudaThreadSynchronize() );

    cudaSafeCall( cudaDeviceSynchronize() );

    cudaSafeCall( cudaUnbindTexture(tex) );
}

@@ -139,7 +139,7 @@ namespace cv { namespace gpu { namespace imgproc
        remap_1c<<<grid, threads>>>(xmap.data, ymap.data, xmap.step, dst.data, dst.step, dst.cols, dst.rows);
        cudaSafeCall( cudaGetLastError() );

        cudaSafeCall( cudaThreadSynchronize() );
        cudaSafeCall( cudaDeviceSynchronize() );
        cudaSafeCall( cudaUnbindTexture(tex_remap) );
    }

@@ -153,7 +153,7 @@ namespace cv { namespace gpu { namespace imgproc
        remap_3c<<<grid, threads>>>(src.data, src.step, xmap.data, ymap.data, xmap.step, dst.data, dst.step, dst.cols, dst.rows);
        cudaSafeCall( cudaGetLastError() );

        cudaSafeCall( cudaThreadSynchronize() );
        cudaSafeCall( cudaDeviceSynchronize() );
    }

/////////////////////////////////// MeanShiftfiltering ///////////////////////////////////////////////
@@ -263,7 +263,7 @@ namespace cv { namespace gpu { namespace imgproc
        meanshift_kernel<<< grid, threads >>>( dst.data, dst.step, dst.cols, dst.rows, sp, sr, maxIter, eps );
        cudaSafeCall( cudaGetLastError() );

        cudaSafeCall( cudaThreadSynchronize() );
        cudaSafeCall( cudaDeviceSynchronize() );
        cudaSafeCall( cudaUnbindTexture( tex_meanshift ) );
    }
    extern "C" void meanShiftProc_gpu(const DevMem2D& src, DevMem2D dstr, DevMem2D dstsp, int sp, int sr, int maxIter, float eps)
@@ -279,7 +279,7 @@ namespace cv { namespace gpu { namespace imgproc
        meanshiftproc_kernel<<< grid, threads >>>( dstr.data, dstr.step, dstsp.data, dstsp.step, dstr.cols, dstr.rows, sp, sr, maxIter, eps );
        cudaSafeCall( cudaGetLastError() );

        cudaSafeCall( cudaThreadSynchronize() );
        cudaSafeCall( cudaDeviceSynchronize() );
        cudaSafeCall( cudaUnbindTexture( tex_meanshift ) );
    }

@@ -397,7 +397,7 @@ namespace cv { namespace gpu { namespace imgproc
        cudaSafeCall( cudaGetLastError() );

        if (stream == 0)
            cudaSafeCall( cudaThreadSynchronize() );
            cudaSafeCall( cudaDeviceSynchronize() );
    }

    void drawColorDisp_gpu(const DevMem2D_<short>& src, const DevMem2D& dst, int ndisp, const cudaStream_t& stream)
@@ -411,7 +411,7 @@ namespace cv { namespace gpu { namespace imgproc
        cudaSafeCall( cudaGetLastError() );

        if (stream == 0)
            cudaSafeCall( cudaThreadSynchronize() );
            cudaSafeCall( cudaDeviceSynchronize() );
    }

/////////////////////////////////// reprojectImageTo3D ///////////////////////////////////////////////
@@ -462,7 +462,7 @@ namespace cv { namespace gpu { namespace imgproc
        cudaSafeCall( cudaGetLastError() );

        if (stream == 0)
            cudaSafeCall( cudaThreadSynchronize() );
            cudaSafeCall( cudaDeviceSynchronize() );
    }

    void reprojectImageTo3D_gpu(const DevMem2D& disp, const DevMem2Df& xyzw, const float* q, const cudaStream_t& stream)
@@ -502,7 +502,7 @@ namespace cv { namespace gpu { namespace imgproc
        extractCovData_kernel<<<grid, threads>>>(Dx.cols, Dx.rows, Dx, Dy, dst);
        cudaSafeCall( cudaGetLastError() );

        cudaSafeCall(cudaThreadSynchronize());
        cudaSafeCall( cudaDeviceSynchronize() );
    }

/////////////////////////////////////////// Corner Harris /////////////////////////////////////////////////
@@ -611,7 +611,8 @@ namespace cv { namespace gpu { namespace imgproc

        cudaSafeCall( cudaGetLastError() );

        cudaSafeCall(cudaThreadSynchronize());
        cudaSafeCall( cudaDeviceSynchronize() );

        cudaSafeCall(cudaUnbindTexture(harrisDxTex));
        cudaSafeCall(cudaUnbindTexture(harrisDyTex));
    }
@@ -727,7 +728,8 @@ namespace cv { namespace gpu { namespace imgproc

        cudaSafeCall( cudaGetLastError() );

        cudaSafeCall(cudaThreadSynchronize());
        cudaSafeCall(cudaDeviceSynchronize());

        cudaSafeCall(cudaUnbindTexture(minEigenValDxTex));
        cudaSafeCall(cudaUnbindTexture(minEigenValDyTex));
    }
@@ -763,7 +765,7 @@ namespace cv { namespace gpu { namespace imgproc
        column_sumKernel_32F<<<grid, threads>>>(src.cols, src.rows, src, dst);
        cudaSafeCall( cudaGetLastError() );

        cudaSafeCall(cudaThreadSynchronize());
        cudaSafeCall( cudaDeviceSynchronize() );
    }

//////////////////////////////////////////////////////////////////////////
@@ -791,7 +793,7 @@ namespace cv { namespace gpu { namespace imgproc
        mulSpectrumsKernel<<<grid, threads>>>(a, b, c);
        cudaSafeCall( cudaGetLastError() );

        cudaSafeCall(cudaThreadSynchronize());
        cudaSafeCall( cudaDeviceSynchronize() );
    }

//////////////////////////////////////////////////////////////////////////
@@ -820,7 +822,7 @@ namespace cv { namespace gpu { namespace imgproc
        mulSpectrumsKernel_CONJ<<<grid, threads>>>(a, b, c);
        cudaSafeCall( cudaGetLastError() );

        cudaSafeCall(cudaThreadSynchronize());
        cudaSafeCall( cudaDeviceSynchronize() );
    }

//////////////////////////////////////////////////////////////////////////
@@ -850,7 +852,7 @@ namespace cv { namespace gpu { namespace imgproc
        mulAndScaleSpectrumsKernel<<<grid, threads>>>(a, b, scale, c);
        cudaSafeCall( cudaGetLastError() );

        cudaSafeCall(cudaThreadSynchronize());
        cudaSafeCall( cudaDeviceSynchronize() );
    }

//////////////////////////////////////////////////////////////////////////
@@ -880,7 +882,7 @@ namespace cv { namespace gpu { namespace imgproc
        mulAndScaleSpectrumsKernel_CONJ<<<grid, threads>>>(a, b, scale, c);
        cudaSafeCall( cudaGetLastError() );

        cudaSafeCall(cudaThreadSynchronize());
        cudaSafeCall( cudaDeviceSynchronize() );
    }

/////////////////////////////////////////////////////////////////////////
@@ -904,7 +906,9 @@ namespace cv { namespace gpu { namespace imgproc
        dim3 grid(divUp(cols, threads.x), divUp(rows, threads.y));

        downsampleKernel<<<grid, threads>>>(src, rows, cols, k, dst);
        cudaSafeCall(cudaThreadSynchronize());
        cudaSafeCall( cudaGetLastError() );

        cudaSafeCall( cudaDeviceSynchronize() );
    }

    template void downsampleCaller(const PtrStep src, int rows, int cols, int k, PtrStep dst);

@@ -46,6 +46,8 @@
#include "opencv2/gpu/devmem2d.hpp"
#include "safe_call.hpp"
#include "cuda_runtime.h"
#include "npp.h"
#include "NPP_staging.hpp"

namespace cv
{
@@ -106,6 +108,41 @@ namespace cv
        cudaSafeCall( cudaGetTextureReference(&tex, name) );
        cudaSafeCall( cudaUnbindTexture(tex) );
    }

    class NppStreamHandler
    {
    public:
        inline explicit NppStreamHandler(cudaStream_t newStream = 0)
        {
            oldStream = nppGetStream();
            nppSetStream(newStream);
        }

        inline ~NppStreamHandler()
        {
            nppSetStream(oldStream);
        }

    private:
        cudaStream_t oldStream;
    };

    class NppStStreamHandler
    {
    public:
        inline explicit NppStStreamHandler(cudaStream_t newStream = 0)
        {
            oldStream = nppStSetActiveCUDAstream(newStream);
        }

        inline ~NppStStreamHandler()
        {
            nppStSetActiveCUDAstream(oldStream);
        }

    private:
        cudaStream_t oldStream;
    };
}
}
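
The two RAII handlers above pin NPP (and NPP_staging) calls to a chosen stream and restore the previous stream when the scope ends, mirroring what the fourth launch parameter does for plain kernels. A hedged usage sketch; the commented-out primitive stands in for any real NPP call:

    void nppOnStreamSketch(const DevMem2D& src, DevMem2D dst, cudaStream_t stream)
    {
        NppiSize sz;
        sz.width  = src.cols;
        sz.height = src.rows;

        NppStreamHandler h(stream);   // nppSetStream(stream); destructor restores the old stream

        // Any NPP primitive issued in this scope runs on 'stream', e.g.:
        // nppSafeCall( nppiCopy_8u_C1R(src.data, src.step, dst.data, dst.step, sz) );

        if (stream == 0)
            cudaSafeCall( cudaDeviceSynchronize() );
    }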

@@ -134,7 +134,7 @@ void matchTemplateNaive_CCORR_32F(const DevMem2D image, const DevMem2D templ,
    }
    cudaSafeCall( cudaGetLastError() );

    cudaSafeCall(cudaThreadSynchronize());
    cudaSafeCall( cudaDeviceSynchronize() );
}


@@ -165,7 +165,7 @@ void matchTemplateNaive_CCORR_8U(const DevMem2D image, const DevMem2D templ,
    }
    cudaSafeCall( cudaGetLastError() );

    cudaSafeCall(cudaThreadSynchronize());
    cudaSafeCall( cudaDeviceSynchronize() );
}


@@ -228,7 +228,7 @@ void matchTemplateNaive_SQDIFF_32F(const DevMem2D image, const DevMem2D templ,
    }
    cudaSafeCall( cudaGetLastError() );

    cudaSafeCall(cudaThreadSynchronize());
    cudaSafeCall( cudaDeviceSynchronize() );
}


@@ -259,7 +259,7 @@ void matchTemplateNaive_SQDIFF_8U(const DevMem2D image, const DevMem2D templ,
    }
    cudaSafeCall( cudaGetLastError() );

    cudaSafeCall(cudaThreadSynchronize());
    cudaSafeCall( cudaDeviceSynchronize() );
}


@@ -309,7 +309,7 @@ void matchTemplatePrepared_SQDIFF_8U(
    }
    cudaSafeCall( cudaGetLastError() );

    cudaSafeCall(cudaThreadSynchronize());
    cudaSafeCall( cudaDeviceSynchronize() );
}


@@ -360,7 +360,7 @@ void matchTemplatePrepared_SQDIFF_NORMED_8U(
    }
    cudaSafeCall( cudaGetLastError() );

    cudaSafeCall(cudaThreadSynchronize());
    cudaSafeCall( cudaDeviceSynchronize() );
}


@@ -392,7 +392,7 @@ void matchTemplatePrepared_CCOFF_8U(
        w, h, (float)templ_sum / (w * h), image_sum, result);
    cudaSafeCall( cudaGetLastError() );

    cudaSafeCall(cudaThreadSynchronize());
    cudaSafeCall( cudaDeviceSynchronize() );
}


@@ -434,7 +434,7 @@ void matchTemplatePrepared_CCOFF_8UC2(
        image_sum_r, image_sum_g, result);
    cudaSafeCall( cudaGetLastError() );

    cudaSafeCall(cudaThreadSynchronize());
    cudaSafeCall( cudaDeviceSynchronize() );
}


@@ -490,7 +490,7 @@ void matchTemplatePrepared_CCOFF_8UC3(
        image_sum_r, image_sum_g, image_sum_b, result);
    cudaSafeCall( cudaGetLastError() );

    cudaSafeCall(cudaThreadSynchronize());
    cudaSafeCall( cudaDeviceSynchronize() );
}


@@ -556,7 +556,7 @@ void matchTemplatePrepared_CCOFF_8UC4(
        result);
    cudaSafeCall( cudaGetLastError() );

    cudaSafeCall(cudaThreadSynchronize());
    cudaSafeCall( cudaDeviceSynchronize() );
}


@@ -602,7 +602,7 @@ void matchTemplatePrepared_CCOFF_NORMED_8U(
        image_sum, image_sqsum, result);
    cudaSafeCall( cudaGetLastError() );

    cudaSafeCall(cudaThreadSynchronize());
    cudaSafeCall( cudaDeviceSynchronize() );
}


@@ -665,7 +665,7 @@ void matchTemplatePrepared_CCOFF_NORMED_8UC2(
        result);
    cudaSafeCall( cudaGetLastError() );

    cudaSafeCall(cudaThreadSynchronize());
    cudaSafeCall( cudaDeviceSynchronize() );
}


@@ -742,7 +742,7 @@ void matchTemplatePrepared_CCOFF_NORMED_8UC3(
        result);
    cudaSafeCall( cudaGetLastError() );

    cudaSafeCall(cudaThreadSynchronize());
    cudaSafeCall( cudaDeviceSynchronize() );
}


@@ -833,7 +833,7 @@ void matchTemplatePrepared_CCOFF_NORMED_8UC4(
        result);
    cudaSafeCall( cudaGetLastError() );

    cudaSafeCall(cudaThreadSynchronize());
    cudaSafeCall( cudaDeviceSynchronize() );
}


@@ -877,7 +877,7 @@ void normalize_8U(int w, int h, const DevMem2D_<unsigned long long> image_sqsum,
    }
    cudaSafeCall( cudaGetLastError() );

    cudaSafeCall(cudaThreadSynchronize());
    cudaSafeCall( cudaDeviceSynchronize() );
}


@@ -919,7 +919,7 @@ void extractFirstChannel_32F(const DevMem2D image, DevMem2Df result, int cn)
    }
    cudaSafeCall( cudaGetLastError() );

    cudaSafeCall(cudaThreadSynchronize());
    cudaSafeCall( cudaDeviceSynchronize() );
}

@@ -153,7 +153,7 @@ namespace cv { namespace gpu { namespace mathfunc
        cudaSafeCall( cudaGetLastError() );

        if (stream == 0)
            cudaSafeCall( cudaThreadSynchronize() );
            cudaSafeCall( cudaDeviceSynchronize() );
    }

    void cartToPolar_gpu(const DevMem2Df& x, const DevMem2Df& y, const DevMem2Df& mag, bool magSqr, const DevMem2Df& angle, bool angleInDegrees, cudaStream_t stream)
@@ -202,7 +202,7 @@ namespace cv { namespace gpu { namespace mathfunc
        cudaSafeCall( cudaGetLastError() );

        if (stream == 0)
            cudaSafeCall( cudaThreadSynchronize() );
            cudaSafeCall( cudaDeviceSynchronize() );
    }

    void polarToCart_gpu(const DevMem2Df& mag, const DevMem2Df& angle, const DevMem2Df& x, const DevMem2Df& y, bool angleInDegrees, cudaStream_t stream)

@@ -87,7 +87,7 @@ namespace cv { namespace gpu { namespace matrix_operations {
        cudaSafeCall( cudaGetLastError() );

        if (stream == 0)
            cudaSafeCall ( cudaThreadSynchronize() );
            cudaSafeCall ( cudaDeviceSynchronize() );
    }

    void copy_to_with_mask(const DevMem2D& mat_src, DevMem2D mat_dst, int depth, const DevMem2D& mask, int channels, const cudaStream_t & stream)
@@ -199,7 +199,7 @@ namespace cv { namespace gpu { namespace matrix_operations {
        cudaSafeCall( cudaGetLastError() );

        if (stream == 0)
            cudaSafeCall ( cudaThreadSynchronize() );
            cudaSafeCall ( cudaDeviceSynchronize() );
    }

    template void set_to_gpu<uchar >(const DevMem2D& mat, const uchar* scalar, const DevMem2D& mask, int channels, cudaStream_t stream);
@@ -222,7 +222,7 @@ namespace cv { namespace gpu { namespace matrix_operations {
        cudaSafeCall( cudaGetLastError() );

        if (stream == 0)
            cudaSafeCall ( cudaThreadSynchronize() );
            cudaSafeCall ( cudaDeviceSynchronize() );
    }

    template void set_to_gpu<uchar >(const DevMem2D& mat, const uchar* scalar, int channels, cudaStream_t stream);

@@ -275,11 +275,11 @@ namespace cv { namespace gpu { namespace mathfunc
        minMaxKernel<256, T, Mask8U><<<grid, threads>>>(src, Mask8U(mask), minval_buf, maxval_buf);
        cudaSafeCall( cudaGetLastError() );

        cudaSafeCall(cudaThreadSynchronize());
        cudaSafeCall( cudaDeviceSynchronize() );

        T minval_, maxval_;
        cudaSafeCall(cudaMemcpy(&minval_, minval_buf, sizeof(T), cudaMemcpyDeviceToHost));
        cudaSafeCall(cudaMemcpy(&maxval_, maxval_buf, sizeof(T), cudaMemcpyDeviceToHost));
        cudaSafeCall( cudaMemcpy(&minval_, minval_buf, sizeof(T), cudaMemcpyDeviceToHost) );
        cudaSafeCall( cudaMemcpy(&maxval_, maxval_buf, sizeof(T), cudaMemcpyDeviceToHost) );
        *minval = minval_;
        *maxval = maxval_;
    }
@@ -306,11 +306,11 @@ namespace cv { namespace gpu { namespace mathfunc
        minMaxKernel<256, T, MaskTrue><<<grid, threads>>>(src, MaskTrue(), minval_buf, maxval_buf);
        cudaSafeCall( cudaGetLastError() );

        cudaSafeCall(cudaThreadSynchronize());
        cudaSafeCall( cudaDeviceSynchronize() );

        T minval_, maxval_;
        cudaSafeCall(cudaMemcpy(&minval_, minval_buf, sizeof(T), cudaMemcpyDeviceToHost));
        cudaSafeCall(cudaMemcpy(&maxval_, maxval_buf, sizeof(T), cudaMemcpyDeviceToHost));
        cudaSafeCall( cudaMemcpy(&minval_, minval_buf, sizeof(T), cudaMemcpyDeviceToHost) );
        cudaSafeCall( cudaMemcpy(&maxval_, maxval_buf, sizeof(T), cudaMemcpyDeviceToHost) );
        *minval = minval_;
        *maxval = maxval_;
    }
@@ -363,11 +363,11 @@ namespace cv { namespace gpu { namespace mathfunc
        minMaxPass2Kernel<256, T><<<1, 256>>>(minval_buf, maxval_buf, grid.x * grid.y);
        cudaSafeCall( cudaGetLastError() );

        cudaSafeCall(cudaThreadSynchronize());
        cudaSafeCall(cudaDeviceSynchronize());

        T minval_, maxval_;
        cudaSafeCall(cudaMemcpy(&minval_, minval_buf, sizeof(T), cudaMemcpyDeviceToHost));
        cudaSafeCall(cudaMemcpy(&maxval_, maxval_buf, sizeof(T), cudaMemcpyDeviceToHost));
        cudaSafeCall( cudaMemcpy(&minval_, minval_buf, sizeof(T), cudaMemcpyDeviceToHost) );
        cudaSafeCall( cudaMemcpy(&maxval_, maxval_buf, sizeof(T), cudaMemcpyDeviceToHost) );
        *minval = minval_;
        *maxval = maxval_;
    }
@@ -395,11 +395,11 @@ namespace cv { namespace gpu { namespace mathfunc
        minMaxPass2Kernel<256, T><<<1, 256>>>(minval_buf, maxval_buf, grid.x * grid.y);
        cudaSafeCall( cudaGetLastError() );

        cudaSafeCall(cudaThreadSynchronize());
        cudaSafeCall( cudaDeviceSynchronize() );

        T minval_, maxval_;
        cudaSafeCall(cudaMemcpy(&minval_, minval_buf, sizeof(T), cudaMemcpyDeviceToHost));
        cudaSafeCall(cudaMemcpy(&maxval_, maxval_buf, sizeof(T), cudaMemcpyDeviceToHost));
        cudaSafeCall( cudaMemcpy(&minval_, minval_buf, sizeof(T), cudaMemcpyDeviceToHost) );
        cudaSafeCall( cudaMemcpy(&maxval_, maxval_buf, sizeof(T), cudaMemcpyDeviceToHost) );
        *minval = minval_;
        *maxval = maxval_;
    }
@@ -609,17 +609,17 @@ namespace cv { namespace gpu { namespace mathfunc
            minloc_buf, maxloc_buf);
        cudaSafeCall( cudaGetLastError() );

        cudaSafeCall(cudaThreadSynchronize());
        cudaSafeCall( cudaDeviceSynchronize() );

        T minval_, maxval_;
        cudaSafeCall(cudaMemcpy(&minval_, minval_buf, sizeof(T), cudaMemcpyDeviceToHost));
        cudaSafeCall(cudaMemcpy(&maxval_, maxval_buf, sizeof(T), cudaMemcpyDeviceToHost));
        cudaSafeCall( cudaMemcpy(&minval_, minval_buf, sizeof(T), cudaMemcpyDeviceToHost) );
        cudaSafeCall( cudaMemcpy(&maxval_, maxval_buf, sizeof(T), cudaMemcpyDeviceToHost) );
        *minval = minval_;
        *maxval = maxval_;

        uint minloc_, maxloc_;
        cudaSafeCall(cudaMemcpy(&minloc_, minloc_buf, sizeof(int), cudaMemcpyDeviceToHost));
        cudaSafeCall(cudaMemcpy(&maxloc_, maxloc_buf, sizeof(int), cudaMemcpyDeviceToHost));
        cudaSafeCall( cudaMemcpy(&minloc_, minloc_buf, sizeof(int), cudaMemcpyDeviceToHost) );
        cudaSafeCall( cudaMemcpy(&maxloc_, maxloc_buf, sizeof(int), cudaMemcpyDeviceToHost) );
        minloc[1] = minloc_ / src.cols; minloc[0] = minloc_ - minloc[1] * src.cols;
        maxloc[1] = maxloc_ / src.cols; maxloc[0] = maxloc_ - maxloc[1] * src.cols;
    }
@@ -650,7 +650,7 @@ namespace cv { namespace gpu { namespace mathfunc
            minloc_buf, maxloc_buf);
        cudaSafeCall( cudaGetLastError() );

        cudaSafeCall(cudaThreadSynchronize());
        cudaSafeCall( cudaDeviceSynchronize() );

        T minval_, maxval_;
        cudaSafeCall(cudaMemcpy(&minval_, minval_buf, sizeof(T), cudaMemcpyDeviceToHost));
@@ -724,7 +724,7 @@ namespace cv { namespace gpu { namespace mathfunc
        minMaxLocPass2Kernel<256, T><<<1, 256>>>(minval_buf, maxval_buf, minloc_buf, maxloc_buf, grid.x * grid.y);
        cudaSafeCall( cudaGetLastError() );

        cudaSafeCall(cudaThreadSynchronize());
        cudaSafeCall( cudaDeviceSynchronize() );

        T minval_, maxval_;
        cudaSafeCall(cudaMemcpy(&minval_, minval_buf, sizeof(T), cudaMemcpyDeviceToHost));
@@ -766,7 +766,7 @@ namespace cv { namespace gpu { namespace mathfunc
        minMaxLocPass2Kernel<256, T><<<1, 256>>>(minval_buf, maxval_buf, minloc_buf, maxloc_buf, grid.x * grid.y);
        cudaSafeCall( cudaGetLastError() );

        cudaSafeCall(cudaThreadSynchronize());
        cudaSafeCall( cudaDeviceSynchronize() );

        T minval_, maxval_;
        cudaSafeCall(cudaMemcpy(&minval_, minval_buf, sizeof(T), cudaMemcpyDeviceToHost));
@@ -895,7 +895,7 @@ namespace cv { namespace gpu { namespace mathfunc
        countNonZeroKernel<256, T><<<grid, threads>>>(src, count_buf);
        cudaSafeCall( cudaGetLastError() );

        cudaSafeCall(cudaThreadSynchronize());
        cudaSafeCall( cudaDeviceSynchronize() );

        uint count;
        cudaSafeCall(cudaMemcpy(&count, count_buf, sizeof(int), cudaMemcpyDeviceToHost));
@@ -942,7 +942,7 @@ namespace cv { namespace gpu { namespace mathfunc
        countNonZeroPass2Kernel<256, T><<<1, 256>>>(count_buf, grid.x * grid.y);
        cudaSafeCall( cudaGetLastError() );

        cudaSafeCall(cudaThreadSynchronize());
        cudaSafeCall( cudaDeviceSynchronize() );

        uint count;
        cudaSafeCall(cudaMemcpy(&count, count_buf, sizeof(int), cudaMemcpyDeviceToHost));
@@ -1493,7 +1493,7 @@ namespace cv { namespace gpu { namespace mathfunc

            break;
        }
        cudaSafeCall(cudaThreadSynchronize());
        cudaSafeCall( cudaDeviceSynchronize() );

        R result[4] = {0, 0, 0, 0};
        cudaSafeCall(cudaMemcpy(&result, buf.ptr(0), sizeof(R) * cn, cudaMemcpyDeviceToHost));
@@ -1543,7 +1543,7 @@ namespace cv { namespace gpu { namespace mathfunc
        }
        cudaSafeCall( cudaGetLastError() );

        cudaSafeCall(cudaThreadSynchronize());
        cudaSafeCall( cudaDeviceSynchronize() );

        R result[4] = {0, 0, 0, 0};
        cudaSafeCall(cudaMemcpy(&result, buf.ptr(0), sizeof(R) * cn, cudaMemcpyDeviceToHost));
@@ -1615,7 +1615,7 @@ namespace cv { namespace gpu { namespace mathfunc

            break;
        }
        cudaSafeCall(cudaThreadSynchronize());
        cudaSafeCall( cudaDeviceSynchronize() );

        R result[4] = {0, 0, 0, 0};
        cudaSafeCall(cudaMemcpy(result, buf.ptr(0), sizeof(R) * cn, cudaMemcpyDeviceToHost));
@@ -1665,7 +1665,7 @@ namespace cv { namespace gpu { namespace mathfunc
        }
        cudaSafeCall( cudaGetLastError() );

        cudaSafeCall(cudaThreadSynchronize());
        cudaSafeCall( cudaDeviceSynchronize() );

        R result[4] = {0, 0, 0, 0};
        cudaSafeCall(cudaMemcpy(result, buf.ptr(0), sizeof(R) * cn, cudaMemcpyDeviceToHost));
@@ -1737,7 +1737,7 @@ namespace cv { namespace gpu { namespace mathfunc

            break;
        }
        cudaSafeCall(cudaThreadSynchronize());
        cudaSafeCall( cudaDeviceSynchronize() );

        R result[4] = {0, 0, 0, 0};
        cudaSafeCall(cudaMemcpy(result, buf.ptr(0), sizeof(R) * cn, cudaMemcpyDeviceToHost));
@@ -1787,7 +1787,7 @@ namespace cv { namespace gpu { namespace mathfunc
        }
        cudaSafeCall( cudaGetLastError() );

        cudaSafeCall(cudaThreadSynchronize());
        cudaSafeCall( cudaDeviceSynchronize() );

        R result[4] = {0, 0, 0, 0};
        cudaSafeCall(cudaMemcpy(result, buf.ptr(0), sizeof(R) * cn, cudaMemcpyDeviceToHost));

@@ -236,7 +236,7 @@ namespace cv { namespace gpu { namespace split_merge {
        cudaSafeCall( cudaGetLastError() );

        if (stream == 0)
            cudaSafeCall(cudaThreadSynchronize());
            cudaSafeCall(cudaDeviceSynchronize());
    }


@@ -253,7 +253,7 @@ namespace cv { namespace gpu { namespace split_merge {
        cudaSafeCall( cudaGetLastError() );

        if (stream == 0)
            cudaSafeCall(cudaThreadSynchronize());
            cudaSafeCall(cudaDeviceSynchronize());
    }


@@ -271,7 +271,7 @@ namespace cv { namespace gpu { namespace split_merge {
        cudaSafeCall( cudaGetLastError() );

        if (stream == 0)
            cudaSafeCall(cudaThreadSynchronize());
            cudaSafeCall(cudaDeviceSynchronize());
    }


@@ -445,7 +445,7 @@ namespace cv { namespace gpu { namespace split_merge {
        cudaSafeCall( cudaGetLastError() );

        if (stream == 0)
            cudaSafeCall(cudaThreadSynchronize());
            cudaSafeCall(cudaDeviceSynchronize());
    }


@@ -462,7 +462,7 @@ namespace cv { namespace gpu { namespace split_merge {
        cudaSafeCall( cudaGetLastError() );

        if (stream == 0)
            cudaSafeCall(cudaThreadSynchronize());
            cudaSafeCall(cudaDeviceSynchronize());
    }


@@ -480,7 +480,7 @@ namespace cv { namespace gpu { namespace split_merge {
        cudaSafeCall( cudaGetLastError() );

        if (stream == 0)
            cudaSafeCall(cudaThreadSynchronize());
            cudaSafeCall(cudaDeviceSynchronize());
    }


@@ -102,19 +102,19 @@ __device__ uint2 MinSSD(volatile unsigned int *col_ssd_cache, volatile unsigned

    //See above: #define COL_SSD_SIZE (BLOCK_W + 2 * RADIUS)
    ssd[0] = CalcSSD<RADIUS>(col_ssd_cache, col_ssd + 0 * (BLOCK_W + 2 * RADIUS));
    __syncthreads();
    __syncthreads();
    ssd[1] = CalcSSD<RADIUS>(col_ssd_cache, col_ssd + 1 * (BLOCK_W + 2 * RADIUS));
    __syncthreads();
    __syncthreads();
    ssd[2] = CalcSSD<RADIUS>(col_ssd_cache, col_ssd + 2 * (BLOCK_W + 2 * RADIUS));
    __syncthreads();
    __syncthreads();
    ssd[3] = CalcSSD<RADIUS>(col_ssd_cache, col_ssd + 3 * (BLOCK_W + 2 * RADIUS));
    __syncthreads();
    __syncthreads();
    ssd[4] = CalcSSD<RADIUS>(col_ssd_cache, col_ssd + 4 * (BLOCK_W + 2 * RADIUS));
    __syncthreads();
    __syncthreads();
    ssd[5] = CalcSSD<RADIUS>(col_ssd_cache, col_ssd + 5 * (BLOCK_W + 2 * RADIUS));
    __syncthreads();
    __syncthreads();
    ssd[6] = CalcSSD<RADIUS>(col_ssd_cache, col_ssd + 6 * (BLOCK_W + 2 * RADIUS));
    __syncthreads();
    __syncthreads();
    ssd[7] = CalcSSD<RADIUS>(col_ssd_cache, col_ssd + 7 * (BLOCK_W + 2 * RADIUS));

    int mssd = min(min(min(ssd[0], ssd[1]), min(ssd[4], ssd[5])), min(min(ssd[2], ssd[3]), min(ssd[6], ssd[7])));
@@ -327,8 +327,8 @@ template<int RADIUS> void kernel_caller(const DevMem2D& left, const DevMem2D& ri
    stereoKernel<RADIUS><<<grid, threads, smem_size, stream>>>(left.data, right.data, left.step, disp, maxdisp);
    cudaSafeCall( cudaGetLastError() );

    if (stream == 0)
        cudaSafeCall( cudaThreadSynchronize() );
    if (stream == 0)
        cudaSafeCall( cudaDeviceSynchronize() );
};

typedef void (*kernel_caller_t)(const DevMem2D& left, const DevMem2D& right, const DevMem2D& disp, int maxdisp, cudaStream_t & stream);
@@ -407,7 +407,7 @@ extern "C" void prefilter_xsobel(const DevMem2D& input, const DevMem2D& output,
    cudaSafeCall( cudaGetLastError() );

    if (stream == 0)
        cudaSafeCall( cudaThreadSynchronize() );
        cudaSafeCall( cudaDeviceSynchronize() );

    cudaSafeCall( cudaUnbindTexture (texForSobel ) );
}
@@ -531,10 +531,10 @@ extern "C" void postfilter_textureness(const DevMem2D& input, int winsz, float a
    textureness_kernel<<<grid, threads, smem_size, stream>>>(disp, winsz, avgTexturenessThreshold);
    cudaSafeCall( cudaGetLastError() );

    if (stream == 0)
        cudaSafeCall( cudaThreadSynchronize() );
    cudaSafeCall( cudaUnbindTexture (texForTF) );
    if (stream == 0)
        cudaSafeCall( cudaDeviceSynchronize() );

    cudaSafeCall( cudaUnbindTexture (texForTF) );
}

}}}
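
Note the second half of the postfilter_textureness hunk: besides the rename, the cudaUnbindTexture call moves below the conditional synchronization, so the texture stays bound until the kernel has actually finished on the default stream. Read without the diff interleaving, the new tail of the function is:

    textureness_kernel<<<grid, threads, smem_size, stream>>>(disp, winsz, avgTexturenessThreshold);
    cudaSafeCall( cudaGetLastError() );

    if (stream == 0)
        cudaSafeCall( cudaDeviceSynchronize() );

    cudaSafeCall( cudaUnbindTexture (texForTF) );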

@@ -175,7 +175,7 @@ namespace cv { namespace gpu { namespace bp
        cudaSafeCall( cudaGetLastError() );

        if (stream == 0)
            cudaSafeCall( cudaThreadSynchronize() );
            cudaSafeCall( cudaDeviceSynchronize() );
    }
    template <> void comp_data_gpu<uchar, float>(const DevMem2D& left, const DevMem2D& right, const DevMem2D& data, cudaStream_t stream)
    {
@@ -189,7 +189,7 @@ namespace cv { namespace gpu { namespace bp
        cudaSafeCall( cudaGetLastError() );

        if (stream == 0)
            cudaSafeCall( cudaThreadSynchronize() );
            cudaSafeCall( cudaDeviceSynchronize() );
    }

    template <> void comp_data_gpu<uchar3, short>(const DevMem2D& left, const DevMem2D& right, const DevMem2D& data, cudaStream_t stream)
@@ -204,7 +204,7 @@ namespace cv { namespace gpu { namespace bp
        cudaSafeCall( cudaGetLastError() );

        if (stream == 0)
            cudaSafeCall( cudaThreadSynchronize() );
            cudaSafeCall( cudaDeviceSynchronize() );
    }
    template <> void comp_data_gpu<uchar3, float>(const DevMem2D& left, const DevMem2D& right, const DevMem2D& data, cudaStream_t stream)
    {
@@ -218,7 +218,7 @@ namespace cv { namespace gpu { namespace bp
        cudaSafeCall( cudaGetLastError() );

        if (stream == 0)
            cudaSafeCall( cudaThreadSynchronize() );
            cudaSafeCall( cudaDeviceSynchronize() );
    }

    template <> void comp_data_gpu<uchar4, short>(const DevMem2D& left, const DevMem2D& right, const DevMem2D& data, cudaStream_t stream)
@@ -233,7 +233,7 @@ namespace cv { namespace gpu { namespace bp
        cudaSafeCall( cudaGetLastError() );

        if (stream == 0)
            cudaSafeCall( cudaThreadSynchronize() );
            cudaSafeCall( cudaDeviceSynchronize() );
    }
    template <> void comp_data_gpu<uchar4, float>(const DevMem2D& left, const DevMem2D& right, const DevMem2D& data, cudaStream_t stream)
    {
@@ -247,7 +247,7 @@ namespace cv { namespace gpu { namespace bp
        cudaSafeCall( cudaGetLastError() );

        if (stream == 0)
            cudaSafeCall( cudaThreadSynchronize() );
            cudaSafeCall( cudaDeviceSynchronize() );
    }

///////////////////////////////////////////////////////////////
@@ -287,7 +287,7 @@ namespace cv { namespace gpu { namespace bp
        cudaSafeCall( cudaGetLastError() );

        if (stream == 0)
            cudaSafeCall( cudaThreadSynchronize() );
            cudaSafeCall( cudaDeviceSynchronize() );
    }

    template void data_step_down_gpu<short>(int dst_cols, int dst_rows, int src_rows, const DevMem2D& src, const DevMem2D& dst, cudaStream_t stream);
@@ -337,7 +337,7 @@ namespace cv { namespace gpu { namespace bp
        cudaSafeCall( cudaGetLastError() );

        if (stream == 0)
            cudaSafeCall( cudaThreadSynchronize() );
            cudaSafeCall( cudaDeviceSynchronize() );
    }

    template void level_up_messages_gpu<short>(int dst_idx, int dst_cols, int dst_rows, int src_rows, DevMem2D* mus, DevMem2D* mds, DevMem2D* mls, DevMem2D* mrs, cudaStream_t stream);
@@ -457,7 +457,7 @@ namespace cv { namespace gpu { namespace bp
        cudaSafeCall( cudaGetLastError() );

        if (stream == 0)
            cudaSafeCall( cudaThreadSynchronize() );
            cudaSafeCall( cudaDeviceSynchronize() );
    }
}

@@ -520,7 +520,7 @@ namespace cv { namespace gpu { namespace bp
        cudaSafeCall( cudaGetLastError() );

        if (stream == 0)
            cudaSafeCall( cudaThreadSynchronize() );
            cudaSafeCall( cudaDeviceSynchronize() );
    }

    template void output_gpu<short>(const DevMem2D& u, const DevMem2D& d, const DevMem2D& l, const DevMem2D& r, const DevMem2D& data, const DevMem2D_<short>& disp, cudaStream_t stream);

@@ -385,7 +385,7 @@ namespace cv { namespace gpu { namespace csbp
        cudaSafeCall( cudaGetLastError() );

        if (stream == 0)
            cudaSafeCall( cudaThreadSynchronize() );
            cudaSafeCall( cudaDeviceSynchronize() );

        dim3 threads(32, 8, 1);
        dim3 grid(1, 1, 1);
@@ -401,7 +401,7 @@ namespace cv { namespace gpu { namespace csbp
        cudaSafeCall( cudaGetLastError() );

        if (stream == 0)
            cudaSafeCall( cudaThreadSynchronize() );
            cudaSafeCall( cudaDeviceSynchronize() );
    }

    template void init_data_cost(int rows, int cols, short* disp_selected_pyr, short* data_cost_selected, size_t msg_step,
@@ -586,7 +586,7 @@ namespace cv { namespace gpu { namespace csbp
        cudaSafeCall( cudaGetLastError() );

        if (stream == 0)
            cudaSafeCall( cudaThreadSynchronize() );
            cudaSafeCall( cudaDeviceSynchronize() );
    }

    template void compute_data_cost(const short* disp_selected_pyr, short* data_cost, size_t msg_step1, size_t msg_step2,
@@ -713,7 +713,7 @@ namespace cv { namespace gpu { namespace csbp
        cudaSafeCall( cudaGetLastError() );

        if (stream == 0)
            cudaSafeCall( cudaThreadSynchronize() );
            cudaSafeCall( cudaDeviceSynchronize() );
    }


@@ -815,7 +815,7 @@ namespace cv { namespace gpu { namespace csbp
        cudaSafeCall( cudaGetLastError() );

        if (stream == 0)
            cudaSafeCall( cudaThreadSynchronize() );
            cudaSafeCall( cudaDeviceSynchronize() );
    }
};

@@ -885,7 +885,7 @@ namespace cv { namespace gpu { namespace csbp
        cudaSafeCall( cudaGetLastError() );

        if (stream == 0)
            cudaSafeCall( cudaThreadSynchronize() );
            cudaSafeCall( cudaDeviceSynchronize() );
    }

    template void compute_disp(const short* u, const short* d, const short* l, const short* r, const short* data_cost_selected, const short* disp_selected, size_t msg_step,

@@ -181,7 +181,7 @@ namespace cv { namespace gpu { namespace surf
        icvCalcLayerDetAndTrace<<<grid, threads>>>(det, trace);
        cudaSafeCall( cudaGetLastError() );

        cudaSafeCall( cudaThreadSynchronize() );
        cudaSafeCall( cudaDeviceSynchronize() );
    }

    ////////////////////////////////////////////////////////////////////////
@@ -338,7 +338,7 @@ namespace cv { namespace gpu { namespace surf

        cudaSafeCall( cudaGetLastError() );

        cudaSafeCall( cudaThreadSynchronize() );
        cudaSafeCall( cudaDeviceSynchronize() );
    }

    ////////////////////////////////////////////////////////////////////////
@@ -483,7 +483,7 @@ namespace cv { namespace gpu { namespace surf
        icvInterpolateKeypoint<<<grid, threads>>>(det, maxPosBuffer, featureX, featureY, featureLaplacian, featureSize, featureHessian, featureCounter);
        cudaSafeCall( cudaGetLastError() );

        cudaSafeCall( cudaThreadSynchronize() );
        cudaSafeCall( cudaDeviceSynchronize() );
    }

    ////////////////////////////////////////////////////////////////////////
@@ -674,7 +674,7 @@ namespace cv { namespace gpu { namespace surf
        icvCalcOrientation<<<grid, threads>>>(featureX, featureY, featureSize, featureDir);
        cudaSafeCall( cudaGetLastError() );

        cudaSafeCall( cudaThreadSynchronize() );
        cudaSafeCall( cudaDeviceSynchronize() );
    }

    ////////////////////////////////////////////////////////////////////////
@@ -986,24 +986,24 @@ namespace cv { namespace gpu { namespace surf
            compute_descriptors64<<<dim3(nFeatures, 16, 1), dim3(6, 6, 1)>>>(descriptors, featureX, featureY, featureSize, featureDir);
            cudaSafeCall( cudaGetLastError() );

            cudaSafeCall( cudaThreadSynchronize() );
            cudaSafeCall( cudaDeviceSynchronize() );

            normalize_descriptors<64><<<dim3(nFeatures, 1, 1), dim3(64, 1, 1)>>>(descriptors);
            cudaSafeCall( cudaGetLastError() );

            cudaSafeCall( cudaThreadSynchronize() );
            cudaSafeCall( cudaDeviceSynchronize() );
        }
        else
        {
            compute_descriptors128<<<dim3(nFeatures, 16, 1), dim3(6, 6, 1)>>>(descriptors, featureX, featureY, featureSize, featureDir);
            cudaSafeCall( cudaGetLastError() );

            cudaSafeCall( cudaThreadSynchronize() );
            cudaSafeCall( cudaDeviceSynchronize() );

            normalize_descriptors<128><<<dim3(nFeatures, 1, 1), dim3(128, 1, 1)>>>(descriptors);
            cudaSafeCall( cudaGetLastError() );

            cudaSafeCall( cudaThreadSynchronize() );
            cudaSafeCall( cudaDeviceSynchronize() );
        }
    }
}}}
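
Since cudaThreadSynchronize() maps one-to-one onto cudaDeviceSynchronize(), the whole commit is mechanical. Code that must still build against pre-4.0 toolkits could hide the rename behind the standard CUDART_VERSION macro; a hedged compatibility shim, not part of this patch:

    // Hypothetical shim: prefer the CUDA 4.0 name when the runtime provides it.
    #if CUDART_VERSION >= 4000
        #define deviceSyncCompat() cudaDeviceSynchronize()
    #else
        #define deviceSyncCompat() cudaThreadSynchronize()
    #endif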