minor gpu module refactoring: split large .cu files, disabled unnecessary template instantiations
parent d99f4a2beb
commit be8e31f14d
@@ -756,12 +756,6 @@ namespace cv
 //! computes the proximity map for the raster template and the image where the template is searched for
 CV_EXPORTS void matchTemplate(const GpuMat& image, const GpuMat& templ, GpuMat& result, int method);
 
-//! downsamples image
-CV_EXPORTS void downsample(const GpuMat& src, GpuMat& dst, Stream& stream = Stream::Null());
-
-//! upsamples image
-CV_EXPORTS void upsample(const GpuMat& src, GpuMat &dst, Stream& stream = Stream::Null());
-
 //! smoothes the source image and downsamples it
 CV_EXPORTS void pyrDown(const GpuMat& src, GpuMat& dst, int borderType = BORDER_DEFAULT, Stream& stream = Stream::Null());
 
@@ -3,7 +3,7 @@
 PERF_TEST_P(DevInfo_Size_MatType_KernelSize, boxFilter, testing::Combine(testing::ValuesIn(devices()),
     testing::Values(GPU_TYPICAL_MAT_SIZES),
     testing::Values(CV_8UC1, CV_8UC4),
-    testing::Values(3, 5, 7)))
+    testing::Values(3, 5)))
 {
     DeviceInfo devInfo = std::tr1::get<0>(GetParam());
     Size size = std::tr1::get<1>(GetParam());
@@ -37,7 +37,7 @@ PERF_TEST_P(DevInfo_Size_MatType_MorphOp_KernelSize, morphologyFilter, testing::
     testing::Values(GPU_TYPICAL_MAT_SIZES),
     testing::Values(CV_8UC1, CV_8UC4),
     testing::Values((int)MORPH_ERODE, (int)MORPH_DILATE),
-    testing::Values(3, 5, 7)))
+    testing::Values(3, 5)))
 {
     DeviceInfo devInfo = std::tr1::get<0>(GetParam());
     Size size = std::tr1::get<1>(GetParam());
@@ -71,7 +71,7 @@ PERF_TEST_P(DevInfo_Size_MatType_MorphOp_KernelSize, morphologyFilter, testing::
 PERF_TEST_P(DevInfo_Size_MatType_KernelSize, linearFilter, testing::Combine(testing::ValuesIn(devices()),
     testing::Values(GPU_TYPICAL_MAT_SIZES),
     testing::Values(CV_8UC1, CV_8UC4),
-    testing::Values(3, 5, 7)))
+    testing::Values(3, 5)))
 {
     DeviceInfo devInfo = std::tr1::get<0>(GetParam());
     Size size = std::tr1::get<1>(GetParam());
@@ -103,8 +103,8 @@ PERF_TEST_P(DevInfo_Size_MatType_KernelSize, linearFilter, testing::Combine(test
 
 PERF_TEST_P(DevInfo_Size_MatType_KernelSize_BorderMode, separableLinearFilter, testing::Combine(testing::ValuesIn(devices()),
     testing::Values(GPU_TYPICAL_MAT_SIZES),
-    testing::Values(CV_8UC1, CV_8UC4, CV_16SC1, CV_16SC3, CV_32FC1),
-    testing::Values(3, 5, 7),
+    testing::Values(CV_8UC1, CV_8UC4, CV_16SC3, CV_32FC1),
+    testing::Values(3, 5),
     testing::Values((int)BORDER_REFLECT101, (int)BORDER_CONSTANT)))
 {
     DeviceInfo devInfo = std::tr1::get<0>(GetParam());
@@ -244,8 +244,8 @@ PERF_TEST_P(DevInfo_Size_MatType, threshold, testing::Combine(testing::ValuesIn(
 }
 
 PERF_TEST_P(DevInfo_Size_MatType_Interpolation_SizeCoeff, resize, testing::Combine(testing::ValuesIn(devices()),
-    testing::Values(GPU_TYPICAL_MAT_SIZES),
-    testing::Values(CV_8UC1, CV_8UC3, CV_8UC4, CV_16UC1, CV_16UC3, CV_16UC4, CV_32FC1, CV_32FC3, CV_32FC4),
+    testing::Values(szSXGA, sz1080p),
+    testing::Values(CV_8UC1, CV_8UC4, CV_16UC1, CV_32FC1),
     testing::Values((int)INTER_NEAREST, (int)INTER_LINEAR, (int)INTER_CUBIC),
     testing::Values(0.5, 2.0)))
 {
@@ -53,7 +53,8 @@ typedef TestBaseWithParam< std::tr1::tuple<DeviceInfo, int, int> > DevInfo_K_Des
 const cv::Size sz1800x1500 = cv::Size(1800, 1500);
 const cv::Size sz4700x3000 = cv::Size(4700, 3000);
 
-#define GPU_TYPICAL_MAT_SIZES szXGA, szSXGA, sz720p, sz1080p, sz1800x1500, sz4700x3000
+//#define GPU_TYPICAL_MAT_SIZES szXGA, szSXGA, sz720p, sz1080p, sz1800x1500, sz4700x3000
+#define GPU_TYPICAL_MAT_SIZES szSXGA, sz1080p, sz4700x3000
 
 //! read image from testdata folder.
 Mat readImage(const string& fileName, int flags = CV_LOAD_IMAGE_COLOR);
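
Since GPU_TYPICAL_MAT_SIZES expands textually inside each perf test's testing::Values(...), trimming the macro shrinks every test's parameter grid in one place. An illustration of the expansion after this change (not itself part of the commit):

    testing::Values(GPU_TYPICAL_MAT_SIZES)
    // now expands to:
    testing::Values(szSXGA, sz1080p, sz4700x3000)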
@@ -179,18 +179,18 @@ void cv::gpu::BruteForceMatcher_GPU_base::matchSingle(const GpuMat& queryDescs,
     static const match_caller_t match_callers[3][8] =
     {
         {
-            matchSingleL1_gpu<unsigned char>, matchSingleL1_gpu<signed char>,
+            matchSingleL1_gpu<unsigned char>, 0/*matchSingleL1_gpu<signed char>*/,
             matchSingleL1_gpu<unsigned short>, matchSingleL1_gpu<short>,
             matchSingleL1_gpu<int>, matchSingleL1_gpu<float>, 0, 0
         },
         {
-            matchSingleL2_gpu<unsigned char>, matchSingleL2_gpu<signed char>,
-            matchSingleL2_gpu<unsigned short>, matchSingleL2_gpu<short>,
-            matchSingleL2_gpu<int>, matchSingleL2_gpu<float>, 0, 0
+            0/*matchSingleL2_gpu<unsigned char>*/, 0/*matchSingleL2_gpu<signed char>*/,
+            0/*matchSingleL2_gpu<unsigned short>*/, 0/*matchSingleL2_gpu<short>*/,
+            0/*matchSingleL2_gpu<int>*/, matchSingleL2_gpu<float>, 0, 0
         },
         {
-            matchSingleHamming_gpu<unsigned char>, matchSingleHamming_gpu<signed char>,
-            matchSingleHamming_gpu<unsigned short>, matchSingleHamming_gpu<short>,
+            matchSingleHamming_gpu<unsigned char>, 0/*matchSingleHamming_gpu<signed char>*/,
+            matchSingleHamming_gpu<unsigned short>, 0/*matchSingleHamming_gpu<short>*/,
             matchSingleHamming_gpu<int>, 0, 0, 0
         }
     };
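
Each table above is indexed by distance type (L1/L2/Hamming) and descriptor depth; a 0 entry marks a combination whose template instantiation has been disabled. A minimal sketch of how such a table is typically consumed (hypothetical signature, not the commit's code; the real callers take many more arguments):

    typedef void (*match_caller_t)(const GpuMat& query, const GpuMat& train);

    void dispatchMatch(const match_caller_t callers[3][8], int distType,
                       const GpuMat& query, const GpuMat& train)
    {
        // depth() maps CV_8U..CV_64F to 0..7, matching the 8 table columns
        match_caller_t func = callers[distType][query.depth()];
        CV_Assert(func != 0); // 0 entry: instantiation disabled for this type
        func(query, train);
    }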
@@ -318,18 +318,18 @@ void cv::gpu::BruteForceMatcher_GPU_base::matchCollection(const GpuMat& queryDes
     static const match_caller_t match_callers[3][8] =
     {
         {
-            matchCollectionL1_gpu<unsigned char>, matchCollectionL1_gpu<signed char>,
+            matchCollectionL1_gpu<unsigned char>, 0/*matchCollectionL1_gpu<signed char>*/,
             matchCollectionL1_gpu<unsigned short>, matchCollectionL1_gpu<short>,
             matchCollectionL1_gpu<int>, matchCollectionL1_gpu<float>, 0, 0
         },
         {
-            matchCollectionL2_gpu<unsigned char>, matchCollectionL2_gpu<signed char>,
-            matchCollectionL2_gpu<unsigned short>, matchCollectionL2_gpu<short>,
-            matchCollectionL2_gpu<int>, matchCollectionL2_gpu<float>, 0, 0
+            0/*matchCollectionL2_gpu<unsigned char>*/, 0/*matchCollectionL2_gpu<signed char>*/,
+            0/*matchCollectionL2_gpu<unsigned short>*/, 0/*matchCollectionL2_gpu<short>*/,
+            0/*matchCollectionL2_gpu<int>*/, matchCollectionL2_gpu<float>, 0, 0
         },
         {
-            matchCollectionHamming_gpu<unsigned char>, matchCollectionHamming_gpu<signed char>,
-            matchCollectionHamming_gpu<unsigned short>, matchCollectionHamming_gpu<short>,
+            matchCollectionHamming_gpu<unsigned char>, 0/*matchCollectionHamming_gpu<signed char>*/,
+            matchCollectionHamming_gpu<unsigned short>, 0/*matchCollectionHamming_gpu<short>*/,
             matchCollectionHamming_gpu<int>, 0, 0, 0
         }
     };
@@ -427,16 +427,16 @@ void cv::gpu::BruteForceMatcher_GPU_base::knnMatch(const GpuMat& queryDescs, con
     static const match_caller_t match_callers[3][8] =
     {
         {
-            knnMatchL1_gpu<unsigned char>, knnMatchL1_gpu<signed char>, knnMatchL1_gpu<unsigned short>,
+            knnMatchL1_gpu<unsigned char>, 0/*knnMatchL1_gpu<signed char>*/, knnMatchL1_gpu<unsigned short>,
             knnMatchL1_gpu<short>, knnMatchL1_gpu<int>, knnMatchL1_gpu<float>, 0, 0
         },
         {
-            knnMatchL2_gpu<unsigned char>, knnMatchL2_gpu<signed char>, knnMatchL2_gpu<unsigned short>,
-            knnMatchL2_gpu<short>, knnMatchL2_gpu<int>, knnMatchL2_gpu<float>, 0, 0
+            0/*knnMatchL2_gpu<unsigned char>*/, 0/*knnMatchL2_gpu<signed char>*/, 0/*knnMatchL2_gpu<unsigned short>*/,
+            0/*knnMatchL2_gpu<short>*/, 0/*knnMatchL2_gpu<int>*/, knnMatchL2_gpu<float>, 0, 0
         },
         {
-            knnMatchHamming_gpu<unsigned char>, knnMatchHamming_gpu<signed char>, knnMatchHamming_gpu<unsigned short>,
-            knnMatchHamming_gpu<short>, knnMatchHamming_gpu<int>, 0, 0, 0
+            knnMatchHamming_gpu<unsigned char>, 0/*knnMatchHamming_gpu<signed char>*/, knnMatchHamming_gpu<unsigned short>,
+            0/*knnMatchHamming_gpu<short>*/, knnMatchHamming_gpu<int>, 0, 0, 0
         }
     };
@@ -605,16 +605,16 @@ void cv::gpu::BruteForceMatcher_GPU_base::radiusMatch(const GpuMat& queryDescs,
     static const radiusMatch_caller_t radiusMatch_callers[3][8] =
     {
         {
-            radiusMatchL1_gpu<unsigned char>, radiusMatchL1_gpu<signed char>, radiusMatchL1_gpu<unsigned short>,
+            radiusMatchL1_gpu<unsigned char>, 0/*radiusMatchL1_gpu<signed char>*/, radiusMatchL1_gpu<unsigned short>,
             radiusMatchL1_gpu<short>, radiusMatchL1_gpu<int>, radiusMatchL1_gpu<float>, 0, 0
         },
         {
-            radiusMatchL2_gpu<unsigned char>, radiusMatchL2_gpu<signed char>, radiusMatchL2_gpu<unsigned short>,
-            radiusMatchL2_gpu<short>, radiusMatchL2_gpu<int>, radiusMatchL2_gpu<float>, 0, 0
+            0/*radiusMatchL2_gpu<unsigned char>*/, 0/*radiusMatchL2_gpu<signed char>*/, 0/*radiusMatchL2_gpu<unsigned short>*/,
+            0/*radiusMatchL2_gpu<short>*/, 0/*radiusMatchL2_gpu<int>*/, radiusMatchL2_gpu<float>, 0, 0
         },
         {
-            radiusMatchHamming_gpu<unsigned char>, radiusMatchHamming_gpu<signed char>, radiusMatchHamming_gpu<unsigned short>,
-            radiusMatchHamming_gpu<short>, radiusMatchHamming_gpu<int>, 0, 0, 0
+            radiusMatchHamming_gpu<unsigned char>, 0/*radiusMatchHamming_gpu<signed char>*/, radiusMatchHamming_gpu<unsigned short>,
+            0/*radiusMatchHamming_gpu<short>*/, radiusMatchHamming_gpu<int>, 0, 0, 0
         }
     };
modules/gpu/src/cuda/bf_knnmatch.cu (new file, 464 lines)
@@ -0,0 +1,464 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                           License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistribution's of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistribution's in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/

#include "internal_shared.hpp"
|
||||
#include "opencv2/gpu/device/limits.hpp"
|
||||
#include "opencv2/gpu/device/vec_distance.hpp"
|
||||
|
||||
using namespace cv::gpu;
|
||||
using namespace cv::gpu::device;
|
||||
|
||||
namespace cv { namespace gpu { namespace bfmatcher
|
||||
{
|
||||
template <typename VecDiff, typename Dist, typename T, typename Mask>
|
||||
__device__ void distanceCalcLoop(const PtrStep_<T>& query, const DevMem2D_<T>& train, const Mask& m, int queryIdx,
|
||||
typename Dist::result_type& distMin1, typename Dist::result_type& distMin2, int& bestTrainIdx1, int& bestTrainIdx2,
|
||||
typename Dist::result_type* smem)
|
||||
{
|
||||
const VecDiff vecDiff(query.ptr(queryIdx), train.cols, (typename Dist::value_type*)smem, threadIdx.y * blockDim.x + threadIdx.x, threadIdx.x);
|
||||
|
||||
typename Dist::result_type* sdiffRow = smem + blockDim.x * threadIdx.y;
|
||||
|
||||
distMin1 = numeric_limits<typename Dist::result_type>::max();
|
||||
distMin2 = numeric_limits<typename Dist::result_type>::max();
|
||||
|
||||
bestTrainIdx1 = -1;
|
||||
bestTrainIdx2 = -1;
|
||||
|
||||
for (int trainIdx = threadIdx.y; trainIdx < train.rows; trainIdx += blockDim.y)
|
||||
{
|
||||
if (m(queryIdx, trainIdx))
|
||||
{
|
||||
Dist dist;
|
||||
|
||||
const T* trainRow = train.ptr(trainIdx);
|
||||
|
||||
vecDiff.calc(trainRow, train.cols, dist, sdiffRow, threadIdx.x);
|
||||
|
||||
const typename Dist::result_type val = dist;
|
||||
|
||||
if (val < distMin1)
|
||||
{
|
||||
distMin1 = val;
|
||||
bestTrainIdx1 = trainIdx;
|
||||
}
|
||||
else if (val < distMin2)
|
||||
{
|
||||
distMin2 = val;
|
||||
bestTrainIdx2 = trainIdx;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <int BLOCK_DIM_X, int BLOCK_DIM_Y, typename VecDiff, typename Dist, typename T, typename Mask>
|
||||
__global__ void knnMatch2(const PtrStep_<T> query, const DevMem2D_<T> train, const Mask m, int2* trainIdx, float2* distance)
|
||||
{
|
||||
typedef typename Dist::result_type result_type;
|
||||
typedef typename Dist::value_type value_type;
|
||||
|
||||
__shared__ result_type smem[BLOCK_DIM_X * BLOCK_DIM_Y];
|
||||
|
||||
const int queryIdx = blockIdx.x;
|
||||
|
||||
result_type distMin1;
|
||||
result_type distMin2;
|
||||
|
||||
int bestTrainIdx1;
|
||||
int bestTrainIdx2;
|
||||
|
||||
distanceCalcLoop<VecDiff, Dist>(query, train, m, queryIdx, distMin1, distMin2, bestTrainIdx1, bestTrainIdx2, smem);
|
||||
__syncthreads();
|
||||
|
||||
volatile result_type* sdistMinRow = smem;
|
||||
volatile int* sbestTrainIdxRow = (int*)(sdistMinRow + 2 * BLOCK_DIM_Y);
|
||||
|
||||
if (threadIdx.x == 0)
|
||||
{
|
||||
sdistMinRow[threadIdx.y] = distMin1;
|
||||
sdistMinRow[threadIdx.y + BLOCK_DIM_Y] = distMin2;
|
||||
|
||||
sbestTrainIdxRow[threadIdx.y] = bestTrainIdx1;
|
||||
sbestTrainIdxRow[threadIdx.y + BLOCK_DIM_Y] = bestTrainIdx2;
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
if (threadIdx.x == 0 && threadIdx.y == 0)
|
||||
{
|
||||
distMin1 = numeric_limits<result_type>::max();
|
||||
distMin2 = numeric_limits<result_type>::max();
|
||||
|
||||
bestTrainIdx1 = -1;
|
||||
bestTrainIdx2 = -1;
|
||||
|
||||
#pragma unroll
|
||||
for (int i = 0; i < BLOCK_DIM_Y; ++i)
|
||||
{
|
||||
result_type val = sdistMinRow[i];
|
||||
|
||||
if (val < distMin1)
|
||||
{
|
||||
distMin1 = val;
|
||||
bestTrainIdx1 = sbestTrainIdxRow[i];
|
||||
}
|
||||
else if (val < distMin2)
|
||||
{
|
||||
distMin2 = val;
|
||||
bestTrainIdx2 = sbestTrainIdxRow[i];
|
||||
}
|
||||
}
|
||||
|
||||
#pragma unroll
|
||||
for (int i = BLOCK_DIM_Y; i < 2 * BLOCK_DIM_Y; ++i)
|
||||
{
|
||||
result_type val = sdistMinRow[i];
|
||||
|
||||
if (val < distMin2)
|
||||
{
|
||||
distMin2 = val;
|
||||
bestTrainIdx2 = sbestTrainIdxRow[i];
|
||||
}
|
||||
}
|
||||
|
||||
trainIdx[queryIdx] = make_int2(bestTrainIdx1, bestTrainIdx2);
|
||||
distance[queryIdx] = make_float2(distMin1, distMin2);
|
||||
}
|
||||
}
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
// Knn 2 Match kernel caller
|
||||
|
||||
template <int BLOCK_DIM_X, int BLOCK_DIM_Y, typename Dist, typename T, typename Mask>
|
||||
void knnMatch2Simple_caller(const DevMem2D_<T>& query, const DevMem2D_<T>& train, const Mask& mask,
|
||||
const DevMem2D_<int2>& trainIdx, const DevMem2D_<float2>& distance,
|
||||
cudaStream_t stream)
|
||||
{
|
||||
const dim3 grid(query.rows, 1, 1);
|
||||
const dim3 threads(BLOCK_DIM_X, BLOCK_DIM_Y, 1);
|
||||
|
||||
knnMatch2<BLOCK_DIM_X, BLOCK_DIM_Y, VecDiffGlobal<BLOCK_DIM_X, T>, Dist, T>
|
||||
<<<grid, threads, 0, stream>>>(query, train, mask, trainIdx, distance);
|
||||
cudaSafeCall( cudaGetLastError() );
|
||||
|
||||
if (stream == 0)
|
||||
cudaSafeCall( cudaDeviceSynchronize() );
|
||||
}
|
||||
|
||||
template <int BLOCK_DIM_X, int BLOCK_DIM_Y, int MAX_LEN, bool LEN_EQ_MAX_LEN, typename Dist, typename T, typename Mask>
|
||||
void knnMatch2Cached_caller(const DevMem2D_<T>& query, const DevMem2D_<T>& train, const Mask& mask,
|
||||
const DevMem2D_<int2>& trainIdx, const DevMem2D_<float2>& distance,
|
||||
cudaStream_t stream)
|
||||
{
|
||||
StaticAssert<BLOCK_DIM_X * BLOCK_DIM_Y >= MAX_LEN>::check(); // block size must be greter than descriptors length
|
||||
StaticAssert<MAX_LEN % BLOCK_DIM_X == 0>::check(); // max descriptors length must divide to blockDimX
|
||||
|
||||
const dim3 grid(query.rows, 1, 1);
|
||||
const dim3 threads(BLOCK_DIM_X, BLOCK_DIM_Y, 1);
|
||||
|
||||
knnMatch2<BLOCK_DIM_X, BLOCK_DIM_Y, VecDiffCachedRegister<BLOCK_DIM_X, MAX_LEN, LEN_EQ_MAX_LEN, typename Dist::value_type>, Dist, T>
|
||||
<<<grid, threads, 0, stream>>>(query, train, mask, trainIdx.data, distance.data);
|
||||
cudaSafeCall( cudaGetLastError() );
|
||||
|
||||
if (stream == 0)
|
||||
cudaSafeCall( cudaDeviceSynchronize() );
|
||||
}
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
// Knn 2 Match Dispatcher
|
||||
|
||||
template <typename Dist, typename T, typename Mask>
|
||||
void knnMatch2Dispatcher(const DevMem2D_<T>& query, const DevMem2D_<T>& train, const Mask& mask,
|
||||
const DevMem2D& trainIdx, const DevMem2D& distance,
|
||||
int cc, cudaStream_t stream)
|
||||
{
|
||||
if (query.cols < 64)
|
||||
{
|
||||
knnMatch2Cached_caller<16, 16, 64, false, Dist>(
|
||||
query, train, mask,
|
||||
static_cast< DevMem2D_<int2> >(trainIdx), static_cast< DevMem2D_<float2> >(distance),
|
||||
stream);
|
||||
}
|
||||
else if (query.cols == 64)
|
||||
{
|
||||
knnMatch2Cached_caller<16, 16, 64, true, Dist>(
|
||||
query, train, mask,
|
||||
static_cast< DevMem2D_<int2> >(trainIdx), static_cast< DevMem2D_<float2> >(distance),
|
||||
stream);
|
||||
}
|
||||
else if (query.cols < 128)
|
||||
{
|
||||
knnMatch2Cached_caller<16, 16, 128, false, Dist>(
|
||||
query, train, mask,
|
||||
static_cast< DevMem2D_<int2> >(trainIdx), static_cast< DevMem2D_<float2> >(distance),
|
||||
stream);
|
||||
}
|
||||
else if (query.cols == 128 && cc >= 12)
|
||||
{
|
||||
knnMatch2Cached_caller<16, 16, 128, true, Dist>(
|
||||
query, train, mask,
|
||||
static_cast< DevMem2D_<int2> >(trainIdx), static_cast< DevMem2D_<float2> >(distance),
|
||||
stream);
|
||||
}
|
||||
else if (query.cols < 256 && cc >= 12)
|
||||
{
|
||||
knnMatch2Cached_caller<16, 16, 256, false, Dist>(
|
||||
query, train, mask,
|
||||
static_cast< DevMem2D_<int2> >(trainIdx), static_cast< DevMem2D_<float2> >(distance),
|
||||
stream);
|
||||
}
|
||||
else if (query.cols == 256 && cc >= 12)
|
||||
{
|
||||
knnMatch2Cached_caller<16, 16, 256, true, Dist>(
|
||||
query, train, mask,
|
||||
static_cast< DevMem2D_<int2> >(trainIdx), static_cast< DevMem2D_<float2> >(distance),
|
||||
stream);
|
||||
}
|
||||
else
|
||||
{
|
||||
knnMatch2Simple_caller<16, 16, Dist>(
|
||||
query, train, mask,
|
||||
static_cast< DevMem2D_<int2> >(trainIdx), static_cast< DevMem2D_<float2> >(distance),
|
||||
stream);
|
||||
}
|
||||
}
|
||||
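
    // [Editorial note, not part of the original file] The dispatcher above picks a
    // compile-time descriptor-length bucket (64/128/256 elements) for the
    // register-cached kernel; LEN_EQ_MAX_LEN == true marks an exact-length match,
    // which presumably lets the cached vector-difference loop skip its tail
    // handling. The buckets guarded by cc >= 12 (apparently compute capability
    // 1.2+) need the larger per-thread register budget of those devices; longer
    // descriptors fall back to knnMatch2Simple_caller, which streams the query
    // from global memory instead.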

    ///////////////////////////////////////////////////////////////////////////////
    // Calc distance kernel

    template <int BLOCK_DIM_X, int BLOCK_DIM_Y, typename Dist, typename T, typename Mask>
    __global__ void calcDistance(const PtrStep_<T> query, const DevMem2D_<T> train, const Mask mask, PtrStepf distance)
    {
        __shared__ typename Dist::result_type sdiff[BLOCK_DIM_X * BLOCK_DIM_Y];

        typename Dist::result_type* sdiff_row = sdiff + BLOCK_DIM_X * threadIdx.y;

        const int queryIdx = blockIdx.x;
        const T* queryDescs = query.ptr(queryIdx);

        const int trainIdx = blockIdx.y * BLOCK_DIM_Y + threadIdx.y;

        if (trainIdx < train.rows)
        {
            const T* trainDescs = train.ptr(trainIdx);

            typename Dist::result_type myDist = numeric_limits<typename Dist::result_type>::max();

            if (mask(queryIdx, trainIdx))
            {
                Dist dist;

                calcVecDiffGlobal<BLOCK_DIM_X>(queryDescs, trainDescs, train.cols, dist, sdiff_row, threadIdx.x);

                myDist = dist;
            }

            if (threadIdx.x == 0)
                distance.ptr(queryIdx)[trainIdx] = myDist;
        }
    }

    ///////////////////////////////////////////////////////////////////////////////
    // Calc distance kernel caller

    template <int BLOCK_DIM_X, int BLOCK_DIM_Y, typename Dist, typename T, typename Mask>
    void calcDistance_caller(const DevMem2D_<T>& query, const DevMem2D_<T>& train, const Mask& mask, const DevMem2Df& distance, cudaStream_t stream)
    {
        const dim3 threads(BLOCK_DIM_X, BLOCK_DIM_Y, 1);
        const dim3 grid(query.rows, divUp(train.rows, BLOCK_DIM_Y), 1);

        calcDistance<BLOCK_DIM_X, BLOCK_DIM_Y, Dist, T><<<grid, threads, 0, stream>>>(query, train, mask, distance);
        cudaSafeCall( cudaGetLastError() );

        if (stream == 0)
            cudaSafeCall( cudaDeviceSynchronize() );
    }

    template <typename Dist, typename T, typename Mask>
    void calcDistanceDispatcher(const DevMem2D_<T>& query, const DevMem2D_<T>& train, const Mask& mask, const DevMem2D& allDist, cudaStream_t stream)
    {
        calcDistance_caller<16, 16, Dist>(query, train, mask, static_cast<DevMem2Df>(allDist), stream);
    }

    ///////////////////////////////////////////////////////////////////////////////
    // find knn match kernel

    template <int BLOCK_SIZE> __global__ void findBestMatch(DevMem2Df allDist_, int i, PtrStepi trainIdx_, PtrStepf distance_)
    {
        const int SMEM_SIZE = BLOCK_SIZE > 64 ? BLOCK_SIZE : 64;
        __shared__ float sdist[SMEM_SIZE];
        __shared__ int strainIdx[SMEM_SIZE];

        const int queryIdx = blockIdx.x;

        float* allDist = allDist_.ptr(queryIdx);
        int* trainIdx = trainIdx_.ptr(queryIdx);
        float* distance = distance_.ptr(queryIdx);

        float dist = numeric_limits<float>::max();
        int bestIdx = -1;

        for (int i = threadIdx.x; i < allDist_.cols; i += BLOCK_SIZE)
        {
            float reg = allDist[i];
            if (reg < dist)
            {
                dist = reg;
                bestIdx = i;
            }
        }

        sdist[threadIdx.x] = dist;
        strainIdx[threadIdx.x] = bestIdx;
        __syncthreads();

        reducePredVal<BLOCK_SIZE>(sdist, dist, strainIdx, bestIdx, threadIdx.x, less<volatile float>());

        if (threadIdx.x == 0)
        {
            if (dist < numeric_limits<float>::max())
            {
                allDist[bestIdx] = numeric_limits<float>::max();
                trainIdx[i] = bestIdx;
                distance[i] = dist;
            }
        }
    }

    ///////////////////////////////////////////////////////////////////////////////
    // find knn match kernel caller

    template <int BLOCK_SIZE> void findKnnMatch_caller(int k, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist, cudaStream_t stream)
    {
        const dim3 threads(BLOCK_SIZE, 1, 1);
        const dim3 grid(trainIdx.rows, 1, 1);

        for (int i = 0; i < k; ++i)
        {
            findBestMatch<BLOCK_SIZE><<<grid, threads, 0, stream>>>(allDist, i, trainIdx, distance);
            cudaSafeCall( cudaGetLastError() );
        }

        if (stream == 0)
            cudaSafeCall( cudaDeviceSynchronize() );
    }

    void findKnnMatchDispatcher(int k, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, cudaStream_t stream)
    {
        findKnnMatch_caller<256>(k, static_cast<DevMem2Di>(trainIdx), static_cast<DevMem2Df>(distance), static_cast<DevMem2Df>(allDist), stream);
    }

    ///////////////////////////////////////////////////////////////////////////////
    // knn match Dispatcher

    template <typename Dist, typename T>
    void knnMatchDispatcher(const DevMem2D_<T>& query, const DevMem2D_<T>& train, int k, const DevMem2D& mask,
        const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist,
        int cc, cudaStream_t stream)
    {
        if (mask.data)
        {
            if (k == 2)
            {
                knnMatch2Dispatcher<Dist>(query, train, SingleMask(mask), trainIdx, distance, cc, stream);
                return;
            }

            calcDistanceDispatcher<Dist>(query, train, SingleMask(mask), allDist, stream);
        }
        else
        {
            if (k == 2)
            {
                knnMatch2Dispatcher<Dist>(query, train, WithOutMask(), trainIdx, distance, cc, stream);
                return;
            }

            calcDistanceDispatcher<Dist>(query, train, WithOutMask(), allDist, stream);
        }

        findKnnMatchDispatcher(k, trainIdx, distance, allDist, stream);
    }

    ///////////////////////////////////////////////////////////////////////////////
    // knn match caller

    template <typename T> void knnMatchL1_gpu(const DevMem2D& query, const DevMem2D& train, int k, const DevMem2D& mask,
        const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist,
        int cc, cudaStream_t stream)
    {
        knnMatchDispatcher< L1Dist<T> >(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), k, mask, trainIdx, distance, allDist, cc, stream);
    }

    template void knnMatchL1_gpu<uchar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, int cc, cudaStream_t stream);
    //template void knnMatchL1_gpu<schar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, int cc, cudaStream_t stream);
    template void knnMatchL1_gpu<ushort>(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, int cc, cudaStream_t stream);
    template void knnMatchL1_gpu<short >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, int cc, cudaStream_t stream);
    template void knnMatchL1_gpu<int >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, int cc, cudaStream_t stream);
    template void knnMatchL1_gpu<float >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, int cc, cudaStream_t stream);

    template <typename T> void knnMatchL2_gpu(const DevMem2D& query, const DevMem2D& train, int k, const DevMem2D& mask,
        const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist,
        int cc, cudaStream_t stream)
    {
        knnMatchDispatcher<L2Dist>(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), k, mask, trainIdx, distance, allDist, cc, stream);
    }

    //template void knnMatchL2_gpu<uchar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, int cc, cudaStream_t stream);
    //template void knnMatchL2_gpu<schar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, int cc, cudaStream_t stream);
    //template void knnMatchL2_gpu<ushort>(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, int cc, cudaStream_t stream);
    //template void knnMatchL2_gpu<short >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, int cc, cudaStream_t stream);
    //template void knnMatchL2_gpu<int >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, int cc, cudaStream_t stream);
    template void knnMatchL2_gpu<float >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, int cc, cudaStream_t stream);

    template <typename T> void knnMatchHamming_gpu(const DevMem2D& query, const DevMem2D& train, int k, const DevMem2D& mask,
        const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist,
        int cc, cudaStream_t stream)
    {
        knnMatchDispatcher<HammingDist>(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), k, mask, trainIdx, distance, allDist, cc, stream);
    }

    template void knnMatchHamming_gpu<uchar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, int cc, cudaStream_t stream);
    //template void knnMatchHamming_gpu<schar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, int cc, cudaStream_t stream);
    template void knnMatchHamming_gpu<ushort>(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, int cc, cudaStream_t stream);
    //template void knnMatchHamming_gpu<short >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, int cc, cudaStream_t stream);
    template void knnMatchHamming_gpu<int >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, int cc, cudaStream_t stream);
}}}
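
For k > 2 the kNN path above is a plain selection loop: calcDistance fills a full query-by-train distance matrix (allDist), then findBestMatch is launched k times; pass i takes the current minimum of each row, records it in trainIdx/distance at position i, and overwrites the winning cell with FLT_MAX so the next pass finds the next-best neighbor. A host-side sketch of the same idea (illustrative only, not the commit's code):

    #include <vector>
    #include <limits>

    // Sketch: extract the k smallest entries of one distance row, in ascending
    // order, by repeatedly taking the minimum and masking it out -- the same
    // scheme findBestMatch applies on the GPU, one kernel launch per pass.
    // trainIdx and distance are assumed to hold at least k elements.
    void findKnnSketch(std::vector<float>& row, int k,
                       std::vector<int>& trainIdx, std::vector<float>& distance)
    {
        const float inf = std::numeric_limits<float>::max();
        for (int pass = 0; pass < k; ++pass)
        {
            int bestIdx = -1;
            float best = inf;
            for (int j = 0; j < (int)row.size(); ++j)
                if (row[j] < best) { best = row[j]; bestIdx = j; }
            if (bestIdx < 0)
                break;              // fewer than k valid candidates
            trainIdx[pass] = bestIdx;
            distance[pass] = best;
            row[bestIdx] = inf;     // mask the winner out for the next pass
        }
    }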
modules/gpu/src/cuda/bf_match.cu (new file, 403 lines)
@@ -0,0 +1,403 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                           License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistribution's of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistribution's in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/

#include "internal_shared.hpp"
|
||||
#include "opencv2/gpu/device/limits.hpp"
|
||||
#include "opencv2/gpu/device/vec_distance.hpp"
|
||||
|
||||
using namespace cv::gpu;
|
||||
using namespace cv::gpu::device;
|
||||
|
||||
namespace cv { namespace gpu { namespace bfmatcher
|
||||
{
|
||||
template <int BLOCK_DIM_Y, typename T>
|
||||
__device__ void findBestMatch(T& myDist, int2& myIdx, T* smin, int2* sIdx)
|
||||
{
|
||||
if (threadIdx.x == 0)
|
||||
{
|
||||
smin[threadIdx.y] = myDist;
|
||||
sIdx[threadIdx.y] = myIdx;
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
reducePredVal<BLOCK_DIM_Y>(smin, myDist, sIdx, myIdx, threadIdx.y * blockDim.x + threadIdx.x, less<volatile T>());
|
||||
}
|
||||
|
||||
template <typename Dist, typename VecDiff, typename T, typename Mask>
|
||||
__device__ void matchDescs(int queryIdx, int imgIdx, const DevMem2D_<T>& train, const Mask& m, const VecDiff& vecDiff,
|
||||
typename Dist::result_type& myDist, int2& myIdx, typename Dist::result_type* sdiff_row)
|
||||
{
|
||||
for (int trainIdx = threadIdx.y; trainIdx < train.rows; trainIdx += blockDim.y)
|
||||
{
|
||||
if (m(queryIdx, trainIdx))
|
||||
{
|
||||
const T* trainDescs = train.ptr(trainIdx);
|
||||
|
||||
Dist dist;
|
||||
|
||||
vecDiff.calc(trainDescs, train.cols, dist, sdiff_row, threadIdx.x);
|
||||
|
||||
const typename Dist::result_type res = dist;
|
||||
|
||||
if (res < myDist)
|
||||
{
|
||||
myDist = res;
|
||||
myIdx.x = trainIdx;
|
||||
myIdx.y = imgIdx;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T> struct SingleTrain
|
||||
{
|
||||
explicit SingleTrain(const DevMem2D_<T>& train_) : train(train_)
|
||||
{
|
||||
}
|
||||
|
||||
template <typename Dist, typename VecDiff, typename Mask>
|
||||
__device__ __forceinline__ void loop(int queryIdx, Mask& m, const VecDiff& vecDiff,
|
||||
typename Dist::result_type& myDist, int2& myIdx, typename Dist::result_type* sdiff_row) const
|
||||
{
|
||||
matchDescs<Dist>(queryIdx, 0, train, m, vecDiff, myDist, myIdx, sdiff_row);
|
||||
}
|
||||
|
||||
__device__ __forceinline__ int desc_len() const
|
||||
{
|
||||
return train.cols;
|
||||
}
|
||||
|
||||
static __device__ __forceinline__ void storeResult(float* distance, int* trainIdx, int* imgIdx,
|
||||
float myDist, const int2& myIdx, int queryIdx)
|
||||
{
|
||||
trainIdx[queryIdx] = myIdx.x;
|
||||
distance[queryIdx] = myDist;
|
||||
}
|
||||
|
||||
const DevMem2D_<T> train;
|
||||
};
|
||||
|
||||
template <typename T> struct TrainCollection
|
||||
{
|
||||
TrainCollection(const DevMem2D_<T>* trainCollection_, int nImg_, int desclen_) :
|
||||
trainCollection(trainCollection_), nImg(nImg_), desclen(desclen_)
|
||||
{
|
||||
}
|
||||
|
||||
template <typename Dist, typename VecDiff, typename Mask>
|
||||
__device__ void loop(int queryIdx, Mask& m, const VecDiff& vecDiff,
|
||||
typename Dist::result_type& myDist, int2& myIdx, typename Dist::result_type* sdiff_row) const
|
||||
{
|
||||
for (int imgIdx = 0; imgIdx < nImg; ++imgIdx)
|
||||
{
|
||||
const DevMem2D_<T> train = trainCollection[imgIdx];
|
||||
m.next();
|
||||
matchDescs<Dist>(queryIdx, imgIdx, train, m, vecDiff, myDist, myIdx, sdiff_row);
|
||||
}
|
||||
}
|
||||
|
||||
__device__ __forceinline__ int desc_len() const
|
||||
{
|
||||
return desclen;
|
||||
}
|
||||
|
||||
static __device__ __forceinline__ void storeResult(float* distance, int* trainIdx, int* imgIdx,
|
||||
float myDist, const int2& myIdx, int queryIdx)
|
||||
{
|
||||
trainIdx[queryIdx] = myIdx.x;
|
||||
imgIdx[queryIdx] = myIdx.y;
|
||||
distance[queryIdx] = myDist;
|
||||
}
|
||||
|
||||
const DevMem2D_<T>* trainCollection;
|
||||
const int nImg;
|
||||
const int desclen;
|
||||
};
|
||||
|
||||
template <typename VecDiff, typename Dist, typename T, typename Train, typename Mask>
|
||||
__device__ void distanceCalcLoop(const PtrStep_<T>& query, const Train& train, const Mask& mask, int queryIdx,
|
||||
typename Dist::result_type& myDist, int2& myIdx, typename Dist::result_type* smem)
|
||||
{
|
||||
const VecDiff vecDiff(query.ptr(queryIdx), train.desc_len(), (typename Dist::value_type*)smem, threadIdx.y * blockDim.x + threadIdx.x, threadIdx.x);
|
||||
|
||||
typename Dist::result_type* sdiff_row = smem + blockDim.x * threadIdx.y;
|
||||
|
||||
Mask m = mask;
|
||||
|
||||
myIdx.x = -1;
|
||||
myIdx.y = -1;
|
||||
myDist = numeric_limits<typename Dist::result_type>::max();
|
||||
|
||||
train.template loop<Dist>(queryIdx, m, vecDiff, myDist, myIdx, sdiff_row);
|
||||
}
|
||||
|
||||
template <int BLOCK_DIM_X, int BLOCK_DIM_Y, typename VecDiff, typename Dist, typename T, typename Train, typename Mask>
|
||||
__global__ void match(const PtrStep_<T> query, const Train train, const Mask mask, int* trainIdx, int* imgIdx, float* distance)
|
||||
{
|
||||
__shared__ typename Dist::result_type smem[BLOCK_DIM_X * BLOCK_DIM_Y];
|
||||
|
||||
const int queryIdx = blockIdx.x;
|
||||
|
||||
int2 myIdx;
|
||||
typename Dist::result_type myDist;
|
||||
|
||||
distanceCalcLoop<VecDiff, Dist>(query, train, mask, queryIdx, myDist, myIdx, smem);
|
||||
__syncthreads();
|
||||
|
||||
typename Dist::result_type* smin = smem;
|
||||
int2* sIdx = (int2*)(smin + BLOCK_DIM_Y);
|
||||
|
||||
findBestMatch<BLOCK_DIM_Y>(myDist, myIdx, smin, sIdx);
|
||||
|
||||
if (threadIdx.x == 0 && threadIdx.y == 0)
|
||||
Train::storeResult(distance, trainIdx, imgIdx, myDist, myIdx, queryIdx);
|
||||
}
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
// Match kernel caller
|
||||
|
||||
template <int BLOCK_DIM_X, int BLOCK_DIM_Y, typename Dist, typename T, typename Train, typename Mask>
|
||||
void matchSimple_caller(const DevMem2D_<T>& query, const Train& train, const Mask& mask,
|
||||
const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance,
|
||||
cudaStream_t stream)
|
||||
{
|
||||
StaticAssert<BLOCK_DIM_Y <= 64>::check(); // blockDimY vals must reduce by warp
|
||||
|
||||
const dim3 grid(query.rows, 1, 1);
|
||||
const dim3 threads(BLOCK_DIM_X, BLOCK_DIM_Y, 1);
|
||||
|
||||
match<BLOCK_DIM_X, BLOCK_DIM_Y, VecDiffGlobal<BLOCK_DIM_X, T>, Dist, T>
|
||||
<<<grid, threads, 0, stream>>>(query, train, mask, trainIdx.data, imgIdx.data, distance.data);
|
||||
cudaSafeCall( cudaGetLastError() );
|
||||
|
||||
if (stream == 0)
|
||||
cudaSafeCall( cudaDeviceSynchronize() );
|
||||
}
|
||||
|
||||
template <int BLOCK_DIM_X, int BLOCK_DIM_Y, int MAX_LEN, bool LEN_EQ_MAX_LEN, typename Dist, typename T, typename Train, typename Mask>
|
||||
void matchCached_caller(const DevMem2D_<T>& query, const Train& train, const Mask& mask,
|
||||
const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance,
|
||||
cudaStream_t stream)
|
||||
{
|
||||
StaticAssert<BLOCK_DIM_Y <= 64>::check(); // blockDimY vals must reduce by warp
|
||||
StaticAssert<BLOCK_DIM_X * BLOCK_DIM_Y >= MAX_LEN>::check(); // block size must be greter than descriptors length
|
||||
StaticAssert<MAX_LEN % BLOCK_DIM_X == 0>::check(); // max descriptors length must divide to blockDimX
|
||||
|
||||
const dim3 grid(query.rows, 1, 1);
|
||||
const dim3 threads(BLOCK_DIM_X, BLOCK_DIM_Y, 1);
|
||||
|
||||
match<BLOCK_DIM_X, BLOCK_DIM_Y, VecDiffCachedRegister<BLOCK_DIM_X, MAX_LEN, LEN_EQ_MAX_LEN, typename Dist::value_type>, Dist, T>
|
||||
<<<grid, threads, 0, stream>>>(query, train, mask, trainIdx.data, imgIdx.data, distance.data);
|
||||
cudaSafeCall( cudaGetLastError() );
|
||||
|
||||
if (stream == 0)
|
||||
cudaSafeCall( cudaDeviceSynchronize() );
|
||||
}
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
// Match Dispatcher
|
||||
|
||||
template <typename Dist, typename T, typename Train, typename Mask>
|
||||
void matchDispatcher(const DevMem2D_<T>& query, const Train& train, const Mask& mask,
|
||||
const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance,
|
||||
int cc, cudaStream_t stream)
|
||||
{
|
||||
if (query.cols < 64)
|
||||
{
|
||||
matchCached_caller<16, 16, 64, false, Dist>(
|
||||
query, train, mask,
|
||||
static_cast<DevMem2Di>(trainIdx), static_cast<DevMem2Di>(imgIdx), static_cast<DevMem2Df>(distance),
|
||||
stream);
|
||||
}
|
||||
else if (query.cols == 64)
|
||||
{
|
||||
matchCached_caller<16, 16, 64, true, Dist>(
|
||||
query, train, mask,
|
||||
static_cast<DevMem2Di>(trainIdx), static_cast<DevMem2Di>(imgIdx), static_cast<DevMem2Df>(distance),
|
||||
stream);
|
||||
}
|
||||
else if (query.cols < 128)
|
||||
{
|
||||
matchCached_caller<16, 16, 128, false, Dist>(
|
||||
query, train, mask,
|
||||
static_cast<DevMem2Di>(trainIdx), static_cast<DevMem2Di>(imgIdx), static_cast<DevMem2Df>(distance),
|
||||
stream);
|
||||
}
|
||||
else if (query.cols == 128 && cc >= 12)
|
||||
{
|
||||
matchCached_caller<16, 16, 128, true, Dist>(
|
||||
query, train, mask,
|
||||
static_cast<DevMem2Di>(trainIdx), static_cast<DevMem2Di>(imgIdx), static_cast<DevMem2Df>(distance),
|
||||
stream);
|
||||
}
|
||||
else if (query.cols < 256 && cc >= 12)
|
||||
{
|
||||
matchCached_caller<16, 16, 256, false, Dist>(
|
||||
query, train, mask,
|
||||
static_cast<DevMem2Di>(trainIdx), static_cast<DevMem2Di>(imgIdx), static_cast<DevMem2Df>(distance),
|
||||
stream);
|
||||
}
|
||||
else if (query.cols == 256 && cc >= 12)
|
||||
{
|
||||
matchCached_caller<16, 16, 256, true, Dist>(
|
||||
query, train, mask,
|
||||
static_cast<DevMem2Di>(trainIdx), static_cast<DevMem2Di>(imgIdx), static_cast<DevMem2Df>(distance),
|
||||
stream);
|
||||
}
|
||||
else
|
||||
{
|
||||
matchSimple_caller<16, 16, Dist>(
|
||||
query, train, mask,
|
||||
static_cast<DevMem2Di>(trainIdx), static_cast<DevMem2Di>(imgIdx), static_cast<DevMem2Df>(distance),
|
||||
stream);
|
||||
}
|
||||
}
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
// Match caller
|
||||
|
||||
template <typename T> void matchSingleL1_gpu(const DevMem2D& query, const DevMem2D& train_, const DevMem2D& mask,
|
||||
const DevMem2D& trainIdx, const DevMem2D& distance,
|
||||
int cc, cudaStream_t stream)
|
||||
{
|
||||
SingleTrain<T> train(static_cast< DevMem2D_<T> >(train_));
|
||||
if (mask.data)
|
||||
matchDispatcher< L1Dist<T> >(static_cast< DevMem2D_<T> >(query), train, SingleMask(mask), trainIdx, DevMem2D(), distance, cc, stream);
|
||||
else
|
||||
matchDispatcher< L1Dist<T> >(static_cast< DevMem2D_<T> >(query), train, WithOutMask(), trainIdx, DevMem2D(), distance, cc, stream);
|
||||
}
|
||||
|
||||
template void matchSingleL1_gpu<uchar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, int cc, cudaStream_t stream);
|
||||
//template void matchSingleL1_gpu<schar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, int cc, cudaStream_t stream);
|
||||
template void matchSingleL1_gpu<ushort>(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, int cc, cudaStream_t stream);
|
||||
template void matchSingleL1_gpu<short >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, int cc, cudaStream_t stream);
|
||||
template void matchSingleL1_gpu<int >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, int cc, cudaStream_t stream);
|
||||
template void matchSingleL1_gpu<float >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, int cc, cudaStream_t stream);
|
||||
|
||||
template <typename T> void matchSingleL2_gpu(const DevMem2D& query, const DevMem2D& train_, const DevMem2D& mask,
|
||||
const DevMem2D& trainIdx, const DevMem2D& distance,
|
||||
int cc, cudaStream_t stream)
|
||||
{
|
||||
SingleTrain<T> train(static_cast< DevMem2D_<T> >(train_));
|
||||
if (mask.data)
|
||||
matchDispatcher<L2Dist>(static_cast< DevMem2D_<T> >(query), train, SingleMask(mask), trainIdx, DevMem2D(), distance, cc, stream);
|
||||
else
|
||||
matchDispatcher<L2Dist>(static_cast< DevMem2D_<T> >(query), train, WithOutMask(), trainIdx, DevMem2D(), distance, cc, stream);
|
||||
}
|
||||
|
||||
//template void matchSingleL2_gpu<uchar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, int cc, cudaStream_t stream);
|
||||
//template void matchSingleL2_gpu<schar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, int cc, cudaStream_t stream);
|
||||
//template void matchSingleL2_gpu<ushort>(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, int cc, cudaStream_t stream);
|
||||
//template void matchSingleL2_gpu<short >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, int cc, cudaStream_t stream);
|
||||
//template void matchSingleL2_gpu<int >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, int cc, cudaStream_t stream);
|
||||
template void matchSingleL2_gpu<float >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, int cc, cudaStream_t stream);
|
||||
|
||||
template <typename T> void matchSingleHamming_gpu(const DevMem2D& query, const DevMem2D& train_, const DevMem2D& mask,
|
||||
const DevMem2D& trainIdx, const DevMem2D& distance,
|
||||
int cc, cudaStream_t stream)
|
||||
{
|
||||
SingleTrain<T> train(static_cast< DevMem2D_<T> >(train_));
|
||||
if (mask.data)
|
||||
matchDispatcher<HammingDist>(static_cast< DevMem2D_<T> >(query), train, SingleMask(mask), trainIdx, DevMem2D(), distance, cc, stream);
|
||||
else
|
||||
matchDispatcher<HammingDist>(static_cast< DevMem2D_<T> >(query), train, WithOutMask(), trainIdx, DevMem2D(), distance, cc, stream);
|
||||
}
|
||||
|
||||
template void matchSingleHamming_gpu<uchar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, int cc, cudaStream_t stream);
|
||||
//template void matchSingleHamming_gpu<schar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, int cc, cudaStream_t stream);
|
||||
template void matchSingleHamming_gpu<ushort>(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, int cc, cudaStream_t stream);
|
||||
//template void matchSingleHamming_gpu<short >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, int cc, cudaStream_t stream);
|
||||
template void matchSingleHamming_gpu<int >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, int cc, cudaStream_t stream);
|
||||
|
||||
template <typename T> void matchCollectionL1_gpu(const DevMem2D& query, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection,
|
||||
const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance,
|
||||
int cc, cudaStream_t stream)
|
||||
{
|
||||
TrainCollection<T> train((DevMem2D_<T>*)trainCollection.ptr(), trainCollection.cols, query.cols);
|
||||
if (maskCollection.data)
|
||||
matchDispatcher< L1Dist<T> >(static_cast< DevMem2D_<T> >(query), train, MaskCollection(maskCollection.data), trainIdx, imgIdx, distance, cc, stream);
|
||||
else
|
||||
matchDispatcher< L1Dist<T> >(static_cast< DevMem2D_<T> >(query), train, WithOutMask(), trainIdx, imgIdx, distance, cc, stream);
|
||||
}
|
||||
|
||||
template void matchCollectionL1_gpu<uchar >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, int cc, cudaStream_t stream);
|
||||
//template void matchCollectionL1_gpu<schar >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, int cc, cudaStream_t stream);
|
||||
template void matchCollectionL1_gpu<ushort>(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, int cc, cudaStream_t stream);
|
||||
template void matchCollectionL1_gpu<short >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, int cc, cudaStream_t stream);
|
||||
template void matchCollectionL1_gpu<int >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, int cc, cudaStream_t stream);
|
||||
template void matchCollectionL1_gpu<float >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, int cc, cudaStream_t stream);
|
||||
|
||||
template <typename T> void matchCollectionL2_gpu(const DevMem2D& query, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection,
|
||||
const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance,
|
||||
int cc, cudaStream_t stream)
|
||||
{
|
||||
TrainCollection<T> train((DevMem2D_<T>*)trainCollection.ptr(), trainCollection.cols, query.cols);
|
||||
if (maskCollection.data)
|
||||
matchDispatcher<L2Dist>(static_cast< DevMem2D_<T> >(query), train, MaskCollection(maskCollection.data), trainIdx, imgIdx, distance, cc, stream);
|
||||
else
|
||||
matchDispatcher<L2Dist>(static_cast< DevMem2D_<T> >(query), train, WithOutMask(), trainIdx, imgIdx, distance, cc, stream);
|
||||
}
|
||||
|
||||
//template void matchCollectionL2_gpu<uchar >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, int cc, cudaStream_t stream);
|
||||
//template void matchCollectionL2_gpu<schar >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, int cc, cudaStream_t stream);
|
||||
//template void matchCollectionL2_gpu<ushort>(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, int cc, cudaStream_t stream);
|
||||
//template void matchCollectionL2_gpu<short >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, int cc, cudaStream_t stream);
|
||||
//template void matchCollectionL2_gpu<int >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, int cc, cudaStream_t stream);
|
||||
template void matchCollectionL2_gpu<float >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, int cc, cudaStream_t stream);
|
||||
|
||||
template <typename T> void matchCollectionHamming_gpu(const DevMem2D& query, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection,
|
||||
const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance,
|
||||
int cc, cudaStream_t stream)
|
||||
{
|
||||
TrainCollection<T> train((DevMem2D_<T>*)trainCollection.ptr(), trainCollection.cols, query.cols);
|
||||
if (maskCollection.data)
|
||||
matchDispatcher<HammingDist>(static_cast< DevMem2D_<T> >(query), train, MaskCollection(maskCollection.data), trainIdx, imgIdx, distance, cc, stream);
|
||||
else
|
||||
matchDispatcher<HammingDist>(static_cast< DevMem2D_<T> >(query), train, WithOutMask(), trainIdx, imgIdx, distance, cc, stream);
|
||||
}
|
||||
|
||||
template void matchCollectionHamming_gpu<uchar >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, int cc, cudaStream_t stream);
|
||||
//template void matchCollectionHamming_gpu<schar >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, int cc, cudaStream_t stream);
|
||||
template void matchCollectionHamming_gpu<ushort>(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, int cc, cudaStream_t stream);
|
||||
//template void matchCollectionHamming_gpu<short >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, int cc, cudaStream_t stream);
|
||||
template void matchCollectionHamming_gpu<int >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, int cc, cudaStream_t stream);
|
||||
}}}
|
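
bf_match.cu keys one generic match kernel on a small "train source" policy: SingleTrain iterates a single descriptor matrix and stores only trainIdx/distance, while TrainCollection iterates an array of matrices and additionally records imgIdx; the kernel receives the policy by value as a template parameter and calls loop()/desc_len()/storeResult() with no virtual dispatch. A host-side sketch of the pattern (illustrative only, simplified from the device code above):

    // Sketch: compile-time "train source" policies consumed by one generic matcher.
    struct SingleTrainSketch
    {
        int rows;                          // one descriptor set
        template <typename Visit> void loop(Visit v) const { v(/*imgIdx*/ 0, rows); }
    };

    struct TrainCollectionSketch
    {
        const int* rowsPerImage; int nImg; // many descriptor sets
        template <typename Visit> void loop(Visit v) const
        {
            for (int imgIdx = 0; imgIdx < nImg; ++imgIdx)
                v(imgIdx, rowsPerImage[imgIdx]);
        }
    };

    template <typename Train> void matchSketch(const Train& train)
    {
        train.loop([](int imgIdx, int rows)
        {
            // visit all candidate descriptors of image imgIdx here ...
        });
    }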
modules/gpu/src/cuda/bf_radius_match.cu (new file, 202 lines)
@@ -0,0 +1,202 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                           License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistribution's of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistribution's in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/

#include "internal_shared.hpp"
|
||||
#include "opencv2/gpu/device/limits.hpp"
|
||||
#include "opencv2/gpu/device/vec_distance.hpp"
|
||||
|
||||
using namespace cv::gpu;
|
||||
using namespace cv::gpu::device;
|
||||
|
||||
namespace cv { namespace gpu { namespace bfmatcher
{
    template <int BLOCK_DIM_X, int BLOCK_DIM_Y, typename Dist, typename T, typename Mask>
    __global__ void radiusMatch(const PtrStep_<T> query, const DevMem2D_<T> train, float maxDistance, const Mask mask,
        DevMem2Di trainIdx_, unsigned int* nMatches, PtrStepf distance)
    {
    #if __CUDA_ARCH__ >= 110

        __shared__ typename Dist::result_type smem[BLOCK_DIM_X * BLOCK_DIM_Y];

        typename Dist::result_type* sdiff_row = smem + BLOCK_DIM_X * threadIdx.y;

        const int queryIdx = blockIdx.x;
        const T* queryDescs = query.ptr(queryIdx);

        const int trainIdx = blockIdx.y * BLOCK_DIM_Y + threadIdx.y;

        if (trainIdx < train.rows)
        {
            const T* trainDescs = train.ptr(trainIdx);

            if (mask(queryIdx, trainIdx))
            {
                Dist dist;

                calcVecDiffGlobal<BLOCK_DIM_X>(queryDescs, trainDescs, train.cols, dist, sdiff_row, threadIdx.x);

                if (threadIdx.x == 0)
                {
                    if (dist < maxDistance)
                    {
                        unsigned int i = atomicInc(nMatches + queryIdx, (unsigned int) -1);
                        if (i < trainIdx_.cols)
                        {
                            distance.ptr(queryIdx)[i] = dist;
                            trainIdx_.ptr(queryIdx)[i] = trainIdx;
                        }
                    }
                }
            }
        }

    #endif
    }
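
The kernel above claims output slots with atomicInc: the per-query counter is always incremented for an accepted match, but the write happens only while a slot below the row capacity is free. A minimal self-contained sketch of the same idiom (all names here are illustrative, not part of the module; `count` must be zeroed before launch):

    __global__ void collectBelowThreshold(const float* dist, int n, float maxDistance,
                                          int* outIdx, float* outDist,
                                          unsigned int* count, unsigned int capacity)
    {
        const int i = blockIdx.x * blockDim.x + threadIdx.x;
        if (i < n && dist[i] < maxDistance)
        {
            // atomicInc with (unsigned int)-1 acts as a plain wrapping increment and returns the old value
            const unsigned int slot = atomicInc(count, (unsigned int)-1);
            if (slot < capacity) // matches past the buffer capacity are counted but dropped
            {
                outIdx[slot]  = i;
                outDist[slot] = dist[i];
            }
        }
    }
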

    ///////////////////////////////////////////////////////////////////////////////
    // Radius Match kernel caller

    template <int BLOCK_DIM_X, int BLOCK_DIM_Y, typename Dist, typename T, typename Mask>
    void radiusMatch_caller(const DevMem2D_<T>& query, const DevMem2D_<T>& train, float maxDistance, const Mask& mask,
        const DevMem2Di& trainIdx, const DevMem2D_<unsigned int>& nMatches, const DevMem2Df& distance,
        cudaStream_t stream)
    {
        const dim3 threads(BLOCK_DIM_X, BLOCK_DIM_Y, 1);
        const dim3 grid(query.rows, divUp(train.rows, BLOCK_DIM_Y), 1);

        radiusMatch<BLOCK_DIM_X, BLOCK_DIM_Y, Dist, T><<<grid, threads, 0, stream>>>(query, train, maxDistance, mask, trainIdx, nMatches.data, distance);
        cudaSafeCall( cudaGetLastError() );

        if (stream == 0)
            cudaSafeCall( cudaDeviceSynchronize() );
    }

    ///////////////////////////////////////////////////////////////////////////////
    // Radius Match Dispatcher

    template <typename Dist, typename T, typename Mask>
    void radiusMatchDispatcher(const DevMem2D_<T>& query, const DevMem2D_<T>& train, float maxDistance, const Mask& mask,
        const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance,
        cudaStream_t stream)
    {
        radiusMatch_caller<16, 16, Dist>(query, train, maxDistance, mask,
            static_cast<DevMem2Di>(trainIdx), static_cast< const DevMem2D_<unsigned int> >(nMatches), static_cast<DevMem2Df>(distance),
            stream);
    }

    ///////////////////////////////////////////////////////////////////////////////
    // Radius Match caller

    template <typename T> void radiusMatchL1_gpu(const DevMem2D& query, const DevMem2D& train, float maxDistance, const DevMem2D& mask,
        const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance,
        cudaStream_t stream)
    {
        if (mask.data)
        {
            radiusMatchDispatcher< L1Dist<T> >(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), maxDistance, SingleMask(mask),
                trainIdx, nMatches, distance,
                stream);
        }
        else
        {
            radiusMatchDispatcher< L1Dist<T> >(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), maxDistance, WithOutMask(),
                trainIdx, nMatches, distance,
                stream);
        }
    }

    template void radiusMatchL1_gpu<uchar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance, cudaStream_t stream);
    //template void radiusMatchL1_gpu<schar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance, cudaStream_t stream);
    template void radiusMatchL1_gpu<ushort>(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance, cudaStream_t stream);
    template void radiusMatchL1_gpu<short >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance, cudaStream_t stream);
    template void radiusMatchL1_gpu<int   >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance, cudaStream_t stream);
    template void radiusMatchL1_gpu<float >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance, cudaStream_t stream);

    template <typename T> void radiusMatchL2_gpu(const DevMem2D& query, const DevMem2D& train, float maxDistance, const DevMem2D& mask,
        const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance,
        cudaStream_t stream)
    {
        if (mask.data)
        {
            radiusMatchDispatcher<L2Dist>(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), maxDistance, SingleMask(mask),
                trainIdx, nMatches, distance,
                stream);
        }
        else
        {
            radiusMatchDispatcher<L2Dist>(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), maxDistance, WithOutMask(),
                trainIdx, nMatches, distance,
                stream);
        }
    }

    //template void radiusMatchL2_gpu<uchar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance, cudaStream_t stream);
    //template void radiusMatchL2_gpu<schar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance, cudaStream_t stream);
    //template void radiusMatchL2_gpu<ushort>(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance, cudaStream_t stream);
    //template void radiusMatchL2_gpu<short >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance, cudaStream_t stream);
    //template void radiusMatchL2_gpu<int   >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance, cudaStream_t stream);
    template void radiusMatchL2_gpu<float >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance, cudaStream_t stream);

    template <typename T> void radiusMatchHamming_gpu(const DevMem2D& query, const DevMem2D& train, float maxDistance, const DevMem2D& mask,
        const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance,
        cudaStream_t stream)
    {
        if (mask.data)
        {
            radiusMatchDispatcher<HammingDist>(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), maxDistance, SingleMask(mask),
                trainIdx, nMatches, distance,
                stream);
        }
        else
        {
            radiusMatchDispatcher<HammingDist>(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), maxDistance, WithOutMask(),
                trainIdx, nMatches, distance,
                stream);
        }
    }

    template void radiusMatchHamming_gpu<uchar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance, cudaStream_t stream);
    //template void radiusMatchHamming_gpu<schar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance, cudaStream_t stream);
    template void radiusMatchHamming_gpu<ushort>(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance, cudaStream_t stream);
    //template void radiusMatchHamming_gpu<short >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance, cudaStream_t stream);
    template void radiusMatchHamming_gpu<int   >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance, cudaStream_t stream);
}}}
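
Because the kernel counts every accepted match but only stores the first trainIdx_.cols of them, nMatches can exceed the row capacity; a host-side consumer has to clamp before reading. A hedged sketch in plain C++ (nQuery, matchCapacity, the flat row-major buffers, and handleMatch are all illustrative placeholders, assuming <algorithm> is included):

    for (int q = 0; q < nQuery; ++q)
    {
        // nMatches may overshoot the number of stored slots, so clamp to the row capacity
        const int n = std::min(static_cast<int>(nMatches[q]), matchCapacity);
        for (int j = 0; j < n; ++j)
            handleMatch(q, trainIdx[q * matchCapacity + j], distance[q * matchCapacity + j]);
    }
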
233 modules/gpu/src/cuda/bilateral_filter.cu Normal file
@ -0,0 +1,233 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                          License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistribution's of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistribution's in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/

#include "internal_shared.hpp"
#include "opencv2/gpu/device/limits.hpp"

using namespace cv::gpu;
using namespace cv::gpu::device;

namespace bf_krnls
{
    __constant__ float* ctable_color;
    __constant__ float* ctable_space;
    __constant__ size_t ctable_space_step;

    __constant__ int cndisp;
    __constant__ int cradius;

    __constant__ short cedge_disc;
    __constant__ short cmax_disc;
}

namespace cv { namespace gpu { namespace bf
{
    void load_constants(float* table_color, const DevMem2Df& table_space, int ndisp, int radius, short edge_disc, short max_disc)
    {
        cudaSafeCall( cudaMemcpyToSymbol(bf_krnls::ctable_color, &table_color, sizeof(table_color)) );
        cudaSafeCall( cudaMemcpyToSymbol(bf_krnls::ctable_space, &table_space.data, sizeof(table_space.data)) );
        size_t table_space_step = table_space.step / sizeof(float);
        cudaSafeCall( cudaMemcpyToSymbol(bf_krnls::ctable_space_step, &table_space_step, sizeof(size_t)) );

        cudaSafeCall( cudaMemcpyToSymbol(bf_krnls::cndisp, &ndisp, sizeof(int)) );
        cudaSafeCall( cudaMemcpyToSymbol(bf_krnls::cradius, &radius, sizeof(int)) );

        cudaSafeCall( cudaMemcpyToSymbol(bf_krnls::cedge_disc, &edge_disc, sizeof(short)) );
        cudaSafeCall( cudaMemcpyToSymbol(bf_krnls::cmax_disc, &max_disc, sizeof(short)) );
    }
}}}
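
Note that load_constants copies the pointer values themselves into __constant__ memory; the lookup tables stay in global memory and only their addresses are cached. A standalone sketch of this idiom, with illustrative names:

    __constant__ float* c_table;            // holds a device pointer, not the table data

    // host side: d_table is a device pointer previously returned by cudaMalloc
    void uploadTablePointer(float* d_table)
    {
        // copies sizeof(float*) bytes: only the address goes into constant memory
        cudaMemcpyToSymbol(c_table, &d_table, sizeof(d_table));
    }
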

namespace bf_krnls
{
    template <int channels>
    struct DistRgbMax
    {
        static __device__ __forceinline__ uchar calc(const uchar* a, const uchar* b)
        {
            uchar x = abs(a[0] - b[0]);
            uchar y = abs(a[1] - b[1]);
            uchar z = abs(a[2] - b[2]);
            return (max(max(x, y), z));
        }
    };

    template <>
    struct DistRgbMax<1>
    {
        static __device__ __forceinline__ uchar calc(const uchar* a, const uchar* b)
        {
            return abs(a[0] - b[0]);
        }
    };

    template <int channels, typename T>
    __global__ void bilateral_filter(int t, T* disp, size_t disp_step, const uchar* img, size_t img_step, int h, int w)
    {
        const int y = blockIdx.y * blockDim.y + threadIdx.y;
        const int x = ((blockIdx.x * blockDim.x + threadIdx.x) << 1) + ((y + t) & 1);

        T dp[5];

        if (y > 0 && y < h - 1 && x > 0 && x < w - 1)
        {
            dp[0] = *(disp + (y  ) * disp_step + x + 0);
            dp[1] = *(disp + (y-1) * disp_step + x + 0);
            dp[2] = *(disp + (y  ) * disp_step + x - 1);
            dp[3] = *(disp + (y+1) * disp_step + x + 0);
            dp[4] = *(disp + (y  ) * disp_step + x + 1);

            if(abs(dp[1] - dp[0]) >= cedge_disc || abs(dp[2] - dp[0]) >= cedge_disc || abs(dp[3] - dp[0]) >= cedge_disc || abs(dp[4] - dp[0]) >= cedge_disc)
            {
                const int ymin = max(0, y - cradius);
                const int xmin = max(0, x - cradius);
                const int ymax = min(h - 1, y + cradius);
                const int xmax = min(w - 1, x + cradius);

                float cost[] = {0.0f, 0.0f, 0.0f, 0.0f, 0.0f};

                const uchar* ic = img + y * img_step + channels * x;

                for(int yi = ymin; yi <= ymax; yi++)
                {
                    const T* disp_y = disp + yi * disp_step;

                    for(int xi = xmin; xi <= xmax; xi++)
                    {
                        const uchar* in = img + yi * img_step + channels * xi;

                        uchar dist_rgb = DistRgbMax<channels>::calc(in, ic);

                        const float weight = ctable_color[dist_rgb] * (ctable_space + abs(y-yi)* ctable_space_step)[abs(x-xi)];

                        const T disp_reg = disp_y[xi];

                        cost[0] += min(cmax_disc, abs(disp_reg - dp[0])) * weight;
                        cost[1] += min(cmax_disc, abs(disp_reg - dp[1])) * weight;
                        cost[2] += min(cmax_disc, abs(disp_reg - dp[2])) * weight;
                        cost[3] += min(cmax_disc, abs(disp_reg - dp[3])) * weight;
                        cost[4] += min(cmax_disc, abs(disp_reg - dp[4])) * weight;
                    }
                }

                float minimum = numeric_limits<float>::max();
                int id = 0;

                if (cost[0] < minimum)
                {
                    minimum = cost[0];
                    id = 0;
                }
                if (cost[1] < minimum)
                {
                    minimum = cost[1];
                    id = 1;
                }
                if (cost[2] < minimum)
                {
                    minimum = cost[2];
                    id = 2;
                }
                if (cost[3] < minimum)
                {
                    minimum = cost[3];
                    id = 3;
                }
                if (cost[4] < minimum)
                {
                    minimum = cost[4];
                    id = 4;
                }

                *(disp + y * disp_step + x) = dp[id];
            }
        }
    }
}
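
The x index above implements a checkerboard (red-black) schedule: each thread covers every other column, and the ((y + t) & 1) term flips the starting parity per row, so the two launches with t = 0 and t = 1 together touch every pixel exactly once per iteration. A minimal sketch of the same indexing in isolation (names are illustrative):

    // each launch updates one color of the checkerboard; launch twice (t = 0, then t = 1) per iteration
    __global__ void checkerboardPass(int t, float* data, int w, int h)
    {
        const int y = blockIdx.y * blockDim.y + threadIdx.y;
        const int x = ((blockIdx.x * blockDim.x + threadIdx.x) << 1) + ((y + t) & 1);
        if (x < w && y < h)
            data[y * w + x] += 1.0f; // stand-in for the real relaxation step
    }
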

namespace cv { namespace gpu { namespace bf
{
    template <typename T>
    void bilateral_filter_caller(const DevMem2D_<T>& disp, const DevMem2D& img, int channels, int iters, cudaStream_t stream)
    {
        dim3 threads(32, 8, 1);
        dim3 grid(1, 1, 1);
        grid.x = divUp(disp.cols, threads.x << 1);
        grid.y = divUp(disp.rows, threads.y);

        switch (channels)
        {
        case 1:
            for (int i = 0; i < iters; ++i)
            {
                bf_krnls::bilateral_filter<1><<<grid, threads, 0, stream>>>(0, disp.data, disp.step/sizeof(T), img.data, img.step, disp.rows, disp.cols);
                cudaSafeCall( cudaGetLastError() );
                bf_krnls::bilateral_filter<1><<<grid, threads, 0, stream>>>(1, disp.data, disp.step/sizeof(T), img.data, img.step, disp.rows, disp.cols);
                cudaSafeCall( cudaGetLastError() );
            }
            break;
        case 3:
            for (int i = 0; i < iters; ++i)
            {
                bf_krnls::bilateral_filter<3><<<grid, threads, 0, stream>>>(0, disp.data, disp.step/sizeof(T), img.data, img.step, disp.rows, disp.cols);
                cudaSafeCall( cudaGetLastError() );
                bf_krnls::bilateral_filter<3><<<grid, threads, 0, stream>>>(1, disp.data, disp.step/sizeof(T), img.data, img.step, disp.rows, disp.cols);
                cudaSafeCall( cudaGetLastError() );
            }
            break;
        default:
            cv::gpu::error("Unsupported channels count", __FILE__, __LINE__);
        }

        if (stream == 0) // synchronize only for the default stream, consistent with the other kernel callers
            cudaSafeCall( cudaDeviceSynchronize() );
    }

    void bilateral_filter_gpu(const DevMem2D& disp, const DevMem2D& img, int channels, int iters, cudaStream_t stream)
    {
        bilateral_filter_caller(disp, img, channels, iters, stream);
    }

    void bilateral_filter_gpu(const DevMem2D_<short>& disp, const DevMem2D& img, int channels, int iters, cudaStream_t stream)
    {
        bilateral_filter_caller(disp, img, channels, iters, stream);
    }
}}}
@ -1,980 +0,0 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                          License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistribution's of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistribution's in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/

#include "internal_shared.hpp"
|
||||
#include "opencv2/gpu/device/limits.hpp"
|
||||
#include "opencv2/gpu/device/vec_distance.hpp"
|
||||
|
||||
using namespace cv::gpu;
|
||||
using namespace cv::gpu::device;
|
||||
|
||||
namespace cv { namespace gpu { namespace bfmatcher
|
||||
{
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////////
|
||||
////////////////////////////////////// Match //////////////////////////////////////
|
||||
///////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
template <int BLOCK_DIM_Y, typename T>
|
||||
__device__ void findBestMatch(T& myDist, int2& myIdx, T* smin, int2* sIdx)
|
||||
{
|
||||
if (threadIdx.x == 0)
|
||||
{
|
||||
smin[threadIdx.y] = myDist;
|
||||
sIdx[threadIdx.y] = myIdx;
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
reducePredVal<BLOCK_DIM_Y>(smin, myDist, sIdx, myIdx, threadIdx.y * blockDim.x + threadIdx.x, less<volatile T>());
|
||||
}
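
reducePredVal is an internal helper that reduces the per-row minima and their indices across the block. For reference, a self-contained block-wide argmin of the same shape, written as a plain shared-memory tree reduction (BLOCK_SIZE must be a power of two; all names are illustrative, not the module's helper):

    #include <cfloat>

    template <int BLOCK_SIZE>
    __global__ void blockArgMin(const float* vals, int n, float* outVal, int* outIdx)
    {
        __shared__ float sval[BLOCK_SIZE];
        __shared__ int   sidx[BLOCK_SIZE];

        float best = FLT_MAX;
        int bestIdx = -1;
        for (int i = threadIdx.x; i < n; i += BLOCK_SIZE)   // stride over the input row
            if (vals[i] < best) { best = vals[i]; bestIdx = i; }

        sval[threadIdx.x] = best;
        sidx[threadIdx.x] = bestIdx;
        __syncthreads();

        for (int s = BLOCK_SIZE / 2; s > 0; s >>= 1)        // tree reduction keeping value and index paired
        {
            if (threadIdx.x < s && sval[threadIdx.x + s] < sval[threadIdx.x])
            {
                sval[threadIdx.x] = sval[threadIdx.x + s];
                sidx[threadIdx.x] = sidx[threadIdx.x + s];
            }
            __syncthreads();
        }

        if (threadIdx.x == 0) { *outVal = sval[0]; *outIdx = sidx[0]; }
    }
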

    template <typename Dist, typename VecDiff, typename T, typename Mask>
    __device__ void matchDescs(int queryIdx, int imgIdx, const DevMem2D_<T>& train, const Mask& m, const VecDiff& vecDiff,
        typename Dist::result_type& myDist, int2& myIdx, typename Dist::result_type* sdiff_row)
    {
        for (int trainIdx = threadIdx.y; trainIdx < train.rows; trainIdx += blockDim.y)
        {
            if (m(queryIdx, trainIdx))
            {
                const T* trainDescs = train.ptr(trainIdx);

                Dist dist;

                vecDiff.calc(trainDescs, train.cols, dist, sdiff_row, threadIdx.x);

                const typename Dist::result_type res = dist;

                if (res < myDist)
                {
                    myDist = res;
                    myIdx.x = trainIdx;
                    myIdx.y = imgIdx;
                }
            }
        }
    }

    template <typename T> struct SingleTrain
    {
        explicit SingleTrain(const DevMem2D_<T>& train_) : train(train_)
        {
        }

        template <typename Dist, typename VecDiff, typename Mask>
        __device__ __forceinline__ void loop(int queryIdx, Mask& m, const VecDiff& vecDiff,
            typename Dist::result_type& myDist, int2& myIdx, typename Dist::result_type* sdiff_row) const
        {
            matchDescs<Dist>(queryIdx, 0, train, m, vecDiff, myDist, myIdx, sdiff_row);
        }

        __device__ __forceinline__ int desc_len() const
        {
            return train.cols;
        }

        static __device__ __forceinline__ void storeResult(float* distance, int* trainIdx, int* imgIdx,
            float myDist, const int2& myIdx, int queryIdx)
        {
            trainIdx[queryIdx] = myIdx.x;
            distance[queryIdx] = myDist;
        }

        const DevMem2D_<T> train;
    };

    template <typename T> struct TrainCollection
    {
        TrainCollection(const DevMem2D_<T>* trainCollection_, int nImg_, int desclen_) :
            trainCollection(trainCollection_), nImg(nImg_), desclen(desclen_)
        {
        }

        template <typename Dist, typename VecDiff, typename Mask>
        __device__ void loop(int queryIdx, Mask& m, const VecDiff& vecDiff,
            typename Dist::result_type& myDist, int2& myIdx, typename Dist::result_type* sdiff_row) const
        {
            for (int imgIdx = 0; imgIdx < nImg; ++imgIdx)
            {
                const DevMem2D_<T> train = trainCollection[imgIdx];
                m.next();
                matchDescs<Dist>(queryIdx, imgIdx, train, m, vecDiff, myDist, myIdx, sdiff_row);
            }
        }

        __device__ __forceinline__ int desc_len() const
        {
            return desclen;
        }

        static __device__ __forceinline__ void storeResult(float* distance, int* trainIdx, int* imgIdx,
            float myDist, const int2& myIdx, int queryIdx)
        {
            trainIdx[queryIdx] = myIdx.x;
            imgIdx[queryIdx] = myIdx.y;
            distance[queryIdx] = myDist;
        }

        const DevMem2D_<T>* trainCollection;
        const int nImg;
        const int desclen;
    };

    template <typename VecDiff, typename Dist, typename T, typename Train, typename Mask>
    __device__ void distanceCalcLoop(const PtrStep_<T>& query, const Train& train, const Mask& mask, int queryIdx,
        typename Dist::result_type& myDist, int2& myIdx, typename Dist::result_type* smem)
    {
        const VecDiff vecDiff(query.ptr(queryIdx), train.desc_len(), (typename Dist::value_type*)smem, threadIdx.y * blockDim.x + threadIdx.x, threadIdx.x);

        typename Dist::result_type* sdiff_row = smem + blockDim.x * threadIdx.y;

        Mask m = mask;

        myIdx.x = -1;
        myIdx.y = -1;
        myDist = numeric_limits<typename Dist::result_type>::max();

        train.template loop<Dist>(queryIdx, m, vecDiff, myDist, myIdx, sdiff_row);
    }

    template <int BLOCK_DIM_X, int BLOCK_DIM_Y, typename VecDiff, typename Dist, typename T, typename Train, typename Mask>
    __global__ void match(const PtrStep_<T> query, const Train train, const Mask mask, int* trainIdx, int* imgIdx, float* distance)
    {
        __shared__ typename Dist::result_type smem[BLOCK_DIM_X * BLOCK_DIM_Y];

        const int queryIdx = blockIdx.x;

        int2 myIdx;
        typename Dist::result_type myDist;

        distanceCalcLoop<VecDiff, Dist>(query, train, mask, queryIdx, myDist, myIdx, smem);
        __syncthreads();

        typename Dist::result_type* smin = smem;
        int2* sIdx = (int2*)(smin + BLOCK_DIM_Y);

        findBestMatch<BLOCK_DIM_Y>(myDist, myIdx, smin, sIdx);

        if (threadIdx.x == 0 && threadIdx.y == 0)
            Train::storeResult(distance, trainIdx, imgIdx, myDist, myIdx, queryIdx);
    }

    ///////////////////////////////////////////////////////////////////////////////
    // Match kernel caller

    template <int BLOCK_DIM_X, int BLOCK_DIM_Y, typename Dist, typename T, typename Train, typename Mask>
    void matchSimple_caller(const DevMem2D_<T>& query, const Train& train, const Mask& mask,
        const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance,
        cudaStream_t stream)
    {
        StaticAssert<BLOCK_DIM_Y <= 64>::check(); // BLOCK_DIM_Y values must be reducible by a single warp

        const dim3 grid(query.rows, 1, 1);
        const dim3 threads(BLOCK_DIM_X, BLOCK_DIM_Y, 1);

        match<BLOCK_DIM_X, BLOCK_DIM_Y, VecDiffGlobal<BLOCK_DIM_X, T>, Dist, T>
            <<<grid, threads, 0, stream>>>(query, train, mask, trainIdx.data, imgIdx.data, distance.data);
        cudaSafeCall( cudaGetLastError() );

        if (stream == 0)
            cudaSafeCall( cudaDeviceSynchronize() );
    }

    template <int BLOCK_DIM_X, int BLOCK_DIM_Y, int MAX_LEN, bool LEN_EQ_MAX_LEN, typename Dist, typename T, typename Train, typename Mask>
    void matchCached_caller(const DevMem2D_<T>& query, const Train& train, const Mask& mask,
        const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance,
        cudaStream_t stream)
    {
        StaticAssert<BLOCK_DIM_Y <= 64>::check();                    // BLOCK_DIM_Y values must be reducible by a single warp
        StaticAssert<BLOCK_DIM_X * BLOCK_DIM_Y >= MAX_LEN>::check(); // block size must be greater than or equal to the descriptor length
        StaticAssert<MAX_LEN % BLOCK_DIM_X == 0>::check();           // max descriptor length must be divisible by BLOCK_DIM_X

        const dim3 grid(query.rows, 1, 1);
        const dim3 threads(BLOCK_DIM_X, BLOCK_DIM_Y, 1);

        match<BLOCK_DIM_X, BLOCK_DIM_Y, VecDiffCachedRegister<BLOCK_DIM_X, MAX_LEN, LEN_EQ_MAX_LEN, typename Dist::value_type>, Dist, T>
            <<<grid, threads, 0, stream>>>(query, train, mask, trainIdx.data, imgIdx.data, distance.data);
        cudaSafeCall( cudaGetLastError() );

        if (stream == 0)
            cudaSafeCall( cudaDeviceSynchronize() );
    }

    ///////////////////////////////////////////////////////////////////////////////
    // Match Dispatcher

    template <typename Dist, typename T, typename Train, typename Mask>
    void matchDispatcher(const DevMem2D_<T>& query, const Train& train, const Mask& mask,
        const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance,
        int cc, cudaStream_t stream)
    {
        if (query.cols < 64)
        {
            matchCached_caller<16, 16, 64, false, Dist>(
                query, train, mask,
                static_cast<DevMem2Di>(trainIdx), static_cast<DevMem2Di>(imgIdx), static_cast<DevMem2Df>(distance),
                stream);
        }
        else if (query.cols == 64)
        {
            matchCached_caller<16, 16, 64, true, Dist>(
                query, train, mask,
                static_cast<DevMem2Di>(trainIdx), static_cast<DevMem2Di>(imgIdx), static_cast<DevMem2Df>(distance),
                stream);
        }
        else if (query.cols < 128)
        {
            matchCached_caller<16, 16, 128, false, Dist>(
                query, train, mask,
                static_cast<DevMem2Di>(trainIdx), static_cast<DevMem2Di>(imgIdx), static_cast<DevMem2Df>(distance),
                stream);
        }
        else if (query.cols == 128 && cc >= 12)
        {
            matchCached_caller<16, 16, 128, true, Dist>(
                query, train, mask,
                static_cast<DevMem2Di>(trainIdx), static_cast<DevMem2Di>(imgIdx), static_cast<DevMem2Df>(distance),
                stream);
        }
        else if (query.cols < 256 && cc >= 12)
        {
            matchCached_caller<16, 16, 256, false, Dist>(
                query, train, mask,
                static_cast<DevMem2Di>(trainIdx), static_cast<DevMem2Di>(imgIdx), static_cast<DevMem2Df>(distance),
                stream);
        }
        else if (query.cols == 256 && cc >= 12)
        {
            matchCached_caller<16, 16, 256, true, Dist>(
                query, train, mask,
                static_cast<DevMem2Di>(trainIdx), static_cast<DevMem2Di>(imgIdx), static_cast<DevMem2Df>(distance),
                stream);
        }
        else
        {
            matchSimple_caller<16, 16, Dist>(
                query, train, mask,
                static_cast<DevMem2Di>(trainIdx), static_cast<DevMem2Di>(imgIdx), static_cast<DevMem2Df>(distance),
                stream);
        }
    }
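
The dispatcher selects a register-cached kernel by descriptor length and reserves the larger cached paths (128 and 256 elements) for devices of compute capability 1.2 and above, which the cc argument encodes as major*10 + minor. The value is supplied by the host wrappers; a plausible way to derive it (an assumption for illustration, not the module's actual code path):

    // hypothetical host-side helper: encode compute capability as major*10 + minor (e.g. 12 for sm_12)
    int computeCapability(int deviceId)
    {
        cudaDeviceProp prop;
        cudaGetDeviceProperties(&prop, deviceId);
        return prop.major * 10 + prop.minor;
    }
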

    ///////////////////////////////////////////////////////////////////////////////
    // Match caller

    template <typename T> void matchSingleL1_gpu(const DevMem2D& query, const DevMem2D& train_, const DevMem2D& mask,
        const DevMem2D& trainIdx, const DevMem2D& distance,
        int cc, cudaStream_t stream)
    {
        SingleTrain<T> train(static_cast< DevMem2D_<T> >(train_));
        if (mask.data)
            matchDispatcher< L1Dist<T> >(static_cast< DevMem2D_<T> >(query), train, SingleMask(mask), trainIdx, DevMem2D(), distance, cc, stream);
        else
            matchDispatcher< L1Dist<T> >(static_cast< DevMem2D_<T> >(query), train, WithOutMask(), trainIdx, DevMem2D(), distance, cc, stream);
    }

    template void matchSingleL1_gpu<uchar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, int cc, cudaStream_t stream);
    template void matchSingleL1_gpu<schar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, int cc, cudaStream_t stream);
    template void matchSingleL1_gpu<ushort>(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, int cc, cudaStream_t stream);
    template void matchSingleL1_gpu<short >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, int cc, cudaStream_t stream);
    template void matchSingleL1_gpu<int   >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, int cc, cudaStream_t stream);
    template void matchSingleL1_gpu<float >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, int cc, cudaStream_t stream);

    template <typename T> void matchSingleL2_gpu(const DevMem2D& query, const DevMem2D& train_, const DevMem2D& mask,
        const DevMem2D& trainIdx, const DevMem2D& distance,
        int cc, cudaStream_t stream)
    {
        SingleTrain<T> train(static_cast< DevMem2D_<T> >(train_));
        if (mask.data)
            matchDispatcher<L2Dist>(static_cast< DevMem2D_<T> >(query), train, SingleMask(mask), trainIdx, DevMem2D(), distance, cc, stream);
        else
            matchDispatcher<L2Dist>(static_cast< DevMem2D_<T> >(query), train, WithOutMask(), trainIdx, DevMem2D(), distance, cc, stream);
    }

    template void matchSingleL2_gpu<uchar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, int cc, cudaStream_t stream);
    template void matchSingleL2_gpu<schar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, int cc, cudaStream_t stream);
    template void matchSingleL2_gpu<ushort>(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, int cc, cudaStream_t stream);
    template void matchSingleL2_gpu<short >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, int cc, cudaStream_t stream);
    template void matchSingleL2_gpu<int   >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, int cc, cudaStream_t stream);
    template void matchSingleL2_gpu<float >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, int cc, cudaStream_t stream);

    template <typename T> void matchSingleHamming_gpu(const DevMem2D& query, const DevMem2D& train_, const DevMem2D& mask,
        const DevMem2D& trainIdx, const DevMem2D& distance,
        int cc, cudaStream_t stream)
    {
        SingleTrain<T> train(static_cast< DevMem2D_<T> >(train_));
        if (mask.data)
            matchDispatcher<HammingDist>(static_cast< DevMem2D_<T> >(query), train, SingleMask(mask), trainIdx, DevMem2D(), distance, cc, stream);
        else
            matchDispatcher<HammingDist>(static_cast< DevMem2D_<T> >(query), train, WithOutMask(), trainIdx, DevMem2D(), distance, cc, stream);
    }

    template void matchSingleHamming_gpu<uchar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, int cc, cudaStream_t stream);
    template void matchSingleHamming_gpu<schar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, int cc, cudaStream_t stream);
    template void matchSingleHamming_gpu<ushort>(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, int cc, cudaStream_t stream);
    template void matchSingleHamming_gpu<short >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, int cc, cudaStream_t stream);
    template void matchSingleHamming_gpu<int   >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, int cc, cudaStream_t stream);

    template <typename T> void matchCollectionL1_gpu(const DevMem2D& query, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection,
        const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance,
        int cc, cudaStream_t stream)
    {
        TrainCollection<T> train((DevMem2D_<T>*)trainCollection.ptr(), trainCollection.cols, query.cols);
        if (maskCollection.data)
            matchDispatcher< L1Dist<T> >(static_cast< DevMem2D_<T> >(query), train, MaskCollection(maskCollection.data), trainIdx, imgIdx, distance, cc, stream);
        else
            matchDispatcher< L1Dist<T> >(static_cast< DevMem2D_<T> >(query), train, WithOutMask(), trainIdx, imgIdx, distance, cc, stream);
    }

    template void matchCollectionL1_gpu<uchar >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, int cc, cudaStream_t stream);
    template void matchCollectionL1_gpu<schar >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, int cc, cudaStream_t stream);
    template void matchCollectionL1_gpu<ushort>(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, int cc, cudaStream_t stream);
    template void matchCollectionL1_gpu<short >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, int cc, cudaStream_t stream);
    template void matchCollectionL1_gpu<int   >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, int cc, cudaStream_t stream);
    template void matchCollectionL1_gpu<float >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, int cc, cudaStream_t stream);

    template <typename T> void matchCollectionL2_gpu(const DevMem2D& query, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection,
        const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance,
        int cc, cudaStream_t stream)
    {
        TrainCollection<T> train((DevMem2D_<T>*)trainCollection.ptr(), trainCollection.cols, query.cols);
        if (maskCollection.data)
            matchDispatcher<L2Dist>(static_cast< DevMem2D_<T> >(query), train, MaskCollection(maskCollection.data), trainIdx, imgIdx, distance, cc, stream);
        else
            matchDispatcher<L2Dist>(static_cast< DevMem2D_<T> >(query), train, WithOutMask(), trainIdx, imgIdx, distance, cc, stream);
    }

    template void matchCollectionL2_gpu<uchar >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, int cc, cudaStream_t stream);
    template void matchCollectionL2_gpu<schar >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, int cc, cudaStream_t stream);
    template void matchCollectionL2_gpu<ushort>(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, int cc, cudaStream_t stream);
    template void matchCollectionL2_gpu<short >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, int cc, cudaStream_t stream);
    template void matchCollectionL2_gpu<int   >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, int cc, cudaStream_t stream);
    template void matchCollectionL2_gpu<float >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, int cc, cudaStream_t stream);

    template <typename T> void matchCollectionHamming_gpu(const DevMem2D& query, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection,
        const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance,
        int cc, cudaStream_t stream)
    {
        TrainCollection<T> train((DevMem2D_<T>*)trainCollection.ptr(), trainCollection.cols, query.cols);
        if (maskCollection.data)
            matchDispatcher<HammingDist>(static_cast< DevMem2D_<T> >(query), train, MaskCollection(maskCollection.data), trainIdx, imgIdx, distance, cc, stream);
        else
            matchDispatcher<HammingDist>(static_cast< DevMem2D_<T> >(query), train, WithOutMask(), trainIdx, imgIdx, distance, cc, stream);
    }

    template void matchCollectionHamming_gpu<uchar >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, int cc, cudaStream_t stream);
    template void matchCollectionHamming_gpu<schar >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, int cc, cudaStream_t stream);
    template void matchCollectionHamming_gpu<ushort>(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, int cc, cudaStream_t stream);
    template void matchCollectionHamming_gpu<short >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, int cc, cudaStream_t stream);
    template void matchCollectionHamming_gpu<int   >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, int cc, cudaStream_t stream);

    ///////////////////////////////////////////////////////////////////////////////////
    //////////////////////////////////// Knn Match ////////////////////////////////////
    ///////////////////////////////////////////////////////////////////////////////////

    template <typename VecDiff, typename Dist, typename T, typename Mask>
    __device__ void distanceCalcLoop(const PtrStep_<T>& query, const DevMem2D_<T>& train, const Mask& m, int queryIdx,
        typename Dist::result_type& distMin1, typename Dist::result_type& distMin2, int& bestTrainIdx1, int& bestTrainIdx2,
        typename Dist::result_type* smem)
    {
        const VecDiff vecDiff(query.ptr(queryIdx), train.cols, (typename Dist::value_type*)smem, threadIdx.y * blockDim.x + threadIdx.x, threadIdx.x);

        typename Dist::result_type* sdiffRow = smem + blockDim.x * threadIdx.y;

        distMin1 = numeric_limits<typename Dist::result_type>::max();
        distMin2 = numeric_limits<typename Dist::result_type>::max();

        bestTrainIdx1 = -1;
        bestTrainIdx2 = -1;

        for (int trainIdx = threadIdx.y; trainIdx < train.rows; trainIdx += blockDim.y)
        {
            if (m(queryIdx, trainIdx))
            {
                Dist dist;

                const T* trainRow = train.ptr(trainIdx);

                vecDiff.calc(trainRow, train.cols, dist, sdiffRow, threadIdx.x);

                const typename Dist::result_type val = dist;

                if (val < distMin1)
                {
                    distMin1 = val;
                    bestTrainIdx1 = trainIdx;
                }
                else if (val < distMin2)
                {
                    distMin2 = val;
                    bestTrainIdx2 = trainIdx;
                }
            }
        }
    }

    template <int BLOCK_DIM_X, int BLOCK_DIM_Y, typename VecDiff, typename Dist, typename T, typename Mask>
    __global__ void knnMatch2(const PtrStep_<T> query, const DevMem2D_<T> train, const Mask m, int2* trainIdx, float2* distance)
    {
        typedef typename Dist::result_type result_type;
        typedef typename Dist::value_type value_type;

        __shared__ result_type smem[BLOCK_DIM_X * BLOCK_DIM_Y];

        const int queryIdx = blockIdx.x;

        result_type distMin1;
        result_type distMin2;

        int bestTrainIdx1;
        int bestTrainIdx2;

        distanceCalcLoop<VecDiff, Dist>(query, train, m, queryIdx, distMin1, distMin2, bestTrainIdx1, bestTrainIdx2, smem);
        __syncthreads();

        volatile result_type* sdistMinRow = smem;
        volatile int* sbestTrainIdxRow = (int*)(sdistMinRow + 2 * BLOCK_DIM_Y);

        if (threadIdx.x == 0)
        {
            sdistMinRow[threadIdx.y] = distMin1;
            sdistMinRow[threadIdx.y + BLOCK_DIM_Y] = distMin2;

            sbestTrainIdxRow[threadIdx.y] = bestTrainIdx1;
            sbestTrainIdxRow[threadIdx.y + BLOCK_DIM_Y] = bestTrainIdx2;
        }
        __syncthreads();

        if (threadIdx.x == 0 && threadIdx.y == 0)
        {
            distMin1 = numeric_limits<result_type>::max();
            distMin2 = numeric_limits<result_type>::max();

            bestTrainIdx1 = -1;
            bestTrainIdx2 = -1;

            #pragma unroll
            for (int i = 0; i < BLOCK_DIM_Y; ++i)
            {
                result_type val = sdistMinRow[i];

                if (val < distMin1)
                {
                    distMin1 = val;
                    bestTrainIdx1 = sbestTrainIdxRow[i];
                }
                else if (val < distMin2)
                {
                    distMin2 = val;
                    bestTrainIdx2 = sbestTrainIdxRow[i];
                }
            }

            #pragma unroll
            for (int i = BLOCK_DIM_Y; i < 2 * BLOCK_DIM_Y; ++i)
            {
                result_type val = sdistMinRow[i];

                if (val < distMin2)
                {
                    distMin2 = val;
                    bestTrainIdx2 = sbestTrainIdxRow[i];
                }
            }

            trainIdx[queryIdx] = make_int2(bestTrainIdx1, bestTrainIdx2);
            distance[queryIdx] = make_float2(distMin1, distMin2);
        }
    }

    ///////////////////////////////////////////////////////////////////////////////
    // Knn 2 Match kernel caller

    template <int BLOCK_DIM_X, int BLOCK_DIM_Y, typename Dist, typename T, typename Mask>
    void knnMatch2Simple_caller(const DevMem2D_<T>& query, const DevMem2D_<T>& train, const Mask& mask,
        const DevMem2D_<int2>& trainIdx, const DevMem2D_<float2>& distance,
        cudaStream_t stream)
    {
        const dim3 grid(query.rows, 1, 1);
        const dim3 threads(BLOCK_DIM_X, BLOCK_DIM_Y, 1);

        knnMatch2<BLOCK_DIM_X, BLOCK_DIM_Y, VecDiffGlobal<BLOCK_DIM_X, T>, Dist, T>
            <<<grid, threads, 0, stream>>>(query, train, mask, trainIdx, distance);
        cudaSafeCall( cudaGetLastError() );

        if (stream == 0)
            cudaSafeCall( cudaDeviceSynchronize() );
    }

    template <int BLOCK_DIM_X, int BLOCK_DIM_Y, int MAX_LEN, bool LEN_EQ_MAX_LEN, typename Dist, typename T, typename Mask>
    void knnMatch2Cached_caller(const DevMem2D_<T>& query, const DevMem2D_<T>& train, const Mask& mask,
        const DevMem2D_<int2>& trainIdx, const DevMem2D_<float2>& distance,
        cudaStream_t stream)
    {
        StaticAssert<BLOCK_DIM_X * BLOCK_DIM_Y >= MAX_LEN>::check(); // block size must be greater than or equal to the descriptor length
        StaticAssert<MAX_LEN % BLOCK_DIM_X == 0>::check();           // max descriptor length must be divisible by BLOCK_DIM_X

        const dim3 grid(query.rows, 1, 1);
        const dim3 threads(BLOCK_DIM_X, BLOCK_DIM_Y, 1);

        knnMatch2<BLOCK_DIM_X, BLOCK_DIM_Y, VecDiffCachedRegister<BLOCK_DIM_X, MAX_LEN, LEN_EQ_MAX_LEN, typename Dist::value_type>, Dist, T>
            <<<grid, threads, 0, stream>>>(query, train, mask, trainIdx.data, distance.data);
        cudaSafeCall( cudaGetLastError() );

        if (stream == 0)
            cudaSafeCall( cudaDeviceSynchronize() );
    }

    ///////////////////////////////////////////////////////////////////////////////
    // Knn 2 Match Dispatcher

    template <typename Dist, typename T, typename Mask>
    void knnMatch2Dispatcher(const DevMem2D_<T>& query, const DevMem2D_<T>& train, const Mask& mask,
        const DevMem2D& trainIdx, const DevMem2D& distance,
        int cc, cudaStream_t stream)
    {
        if (query.cols < 64)
        {
            knnMatch2Cached_caller<16, 16, 64, false, Dist>(
                query, train, mask,
                static_cast< DevMem2D_<int2> >(trainIdx), static_cast< DevMem2D_<float2> >(distance),
                stream);
        }
        else if (query.cols == 64)
        {
            knnMatch2Cached_caller<16, 16, 64, true, Dist>(
                query, train, mask,
                static_cast< DevMem2D_<int2> >(trainIdx), static_cast< DevMem2D_<float2> >(distance),
                stream);
        }
        else if (query.cols < 128)
        {
            knnMatch2Cached_caller<16, 16, 128, false, Dist>(
                query, train, mask,
                static_cast< DevMem2D_<int2> >(trainIdx), static_cast< DevMem2D_<float2> >(distance),
                stream);
        }
        else if (query.cols == 128 && cc >= 12)
        {
            knnMatch2Cached_caller<16, 16, 128, true, Dist>(
                query, train, mask,
                static_cast< DevMem2D_<int2> >(trainIdx), static_cast< DevMem2D_<float2> >(distance),
                stream);
        }
        else if (query.cols < 256 && cc >= 12)
        {
            knnMatch2Cached_caller<16, 16, 256, false, Dist>(
                query, train, mask,
                static_cast< DevMem2D_<int2> >(trainIdx), static_cast< DevMem2D_<float2> >(distance),
                stream);
        }
        else if (query.cols == 256 && cc >= 12)
        {
            knnMatch2Cached_caller<16, 16, 256, true, Dist>(
                query, train, mask,
                static_cast< DevMem2D_<int2> >(trainIdx), static_cast< DevMem2D_<float2> >(distance),
                stream);
        }
        else
        {
            knnMatch2Simple_caller<16, 16, Dist>(
                query, train, mask,
                static_cast< DevMem2D_<int2> >(trainIdx), static_cast< DevMem2D_<float2> >(distance),
                stream);
        }
    }

    ///////////////////////////////////////////////////////////////////////////////
    // Calc distance kernel

    template <int BLOCK_DIM_X, int BLOCK_DIM_Y, typename Dist, typename T, typename Mask>
    __global__ void calcDistance(const PtrStep_<T> query, const DevMem2D_<T> train, const Mask mask, PtrStepf distance)
    {
        __shared__ typename Dist::result_type sdiff[BLOCK_DIM_X * BLOCK_DIM_Y];

        typename Dist::result_type* sdiff_row = sdiff + BLOCK_DIM_X * threadIdx.y;

        const int queryIdx = blockIdx.x;
        const T* queryDescs = query.ptr(queryIdx);

        const int trainIdx = blockIdx.y * BLOCK_DIM_Y + threadIdx.y;

        if (trainIdx < train.rows)
        {
            const T* trainDescs = train.ptr(trainIdx);

            typename Dist::result_type myDist = numeric_limits<typename Dist::result_type>::max();

            if (mask(queryIdx, trainIdx))
            {
                Dist dist;

                calcVecDiffGlobal<BLOCK_DIM_X>(queryDescs, trainDescs, train.cols, dist, sdiff_row, threadIdx.x);

                myDist = dist;
            }

            if (threadIdx.x == 0)
                distance.ptr(queryIdx)[trainIdx] = myDist;
        }
    }

    ///////////////////////////////////////////////////////////////////////////////
    // Calc distance kernel caller

    template <int BLOCK_DIM_X, int BLOCK_DIM_Y, typename Dist, typename T, typename Mask>
    void calcDistance_caller(const DevMem2D_<T>& query, const DevMem2D_<T>& train, const Mask& mask, const DevMem2Df& distance, cudaStream_t stream)
    {
        const dim3 threads(BLOCK_DIM_X, BLOCK_DIM_Y, 1);
        const dim3 grid(query.rows, divUp(train.rows, BLOCK_DIM_Y), 1);

        calcDistance<BLOCK_DIM_X, BLOCK_DIM_Y, Dist, T><<<grid, threads, 0, stream>>>(query, train, mask, distance);
        cudaSafeCall( cudaGetLastError() );

        if (stream == 0)
            cudaSafeCall( cudaDeviceSynchronize() );
    }

    template <typename Dist, typename T, typename Mask>
    void calcDistanceDispatcher(const DevMem2D_<T>& query, const DevMem2D_<T>& train, const Mask& mask, const DevMem2D& allDist, cudaStream_t stream)
    {
        calcDistance_caller<16, 16, Dist>(query, train, mask, static_cast<DevMem2Df>(allDist), stream);
    }

    ///////////////////////////////////////////////////////////////////////////////
    // find knn match kernel

    template <int BLOCK_SIZE> __global__ void findBestMatch(DevMem2Df allDist_, int i, PtrStepi trainIdx_, PtrStepf distance_)
    {
        const int SMEM_SIZE = BLOCK_SIZE > 64 ? BLOCK_SIZE : 64;
        __shared__ float sdist[SMEM_SIZE];
        __shared__ int strainIdx[SMEM_SIZE];

        const int queryIdx = blockIdx.x;

        float* allDist = allDist_.ptr(queryIdx);
        int* trainIdx = trainIdx_.ptr(queryIdx);
        float* distance = distance_.ptr(queryIdx);

        float dist = numeric_limits<float>::max();
        int bestIdx = -1;

        for (int i = threadIdx.x; i < allDist_.cols; i += BLOCK_SIZE)
        {
            float reg = allDist[i];
            if (reg < dist)
            {
                dist = reg;
                bestIdx = i;
            }
        }

        sdist[threadIdx.x] = dist;
        strainIdx[threadIdx.x] = bestIdx;
        __syncthreads();

        reducePredVal<BLOCK_SIZE>(sdist, dist, strainIdx, bestIdx, threadIdx.x, less<volatile float>());

        if (threadIdx.x == 0)
        {
            if (dist < numeric_limits<float>::max())
            {
                allDist[bestIdx] = numeric_limits<float>::max();
                trainIdx[i] = bestIdx; // i is the pass index parameter (the loop counter above shadows it)
                distance[i] = dist;
            }
        }
    }
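
findBestMatch is launched k times; each pass extracts the current minimum of a query's allDist row, records it at output position i, and overwrites the winning entry with the float maximum so the next pass finds the runner-up. An illustrative CPU analogue of the same k-pass extraction over one row (all names are placeholders):

    #include <algorithm>
    #include <limits>
    #include <vector>

    void knnByExtraction(std::vector<float>& allDist, int k, std::vector<int>& trainIdx, std::vector<float>& distance)
    {
        for (int pass = 0; pass < k; ++pass)
        {
            const int best = static_cast<int>(std::min_element(allDist.begin(), allDist.end()) - allDist.begin());
            trainIdx[pass] = best;
            distance[pass] = allDist[best];
            allDist[best]  = std::numeric_limits<float>::max(); // mask the winner out for the next pass
        }
    }
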

    ///////////////////////////////////////////////////////////////////////////////
    // find knn match kernel caller

    template <int BLOCK_SIZE> void findKnnMatch_caller(int k, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist, cudaStream_t stream)
    {
        const dim3 threads(BLOCK_SIZE, 1, 1);
        const dim3 grid(trainIdx.rows, 1, 1);

        for (int i = 0; i < k; ++i)
        {
            findBestMatch<BLOCK_SIZE><<<grid, threads, 0, stream>>>(allDist, i, trainIdx, distance);
            cudaSafeCall( cudaGetLastError() );
        }

        if (stream == 0)
            cudaSafeCall( cudaDeviceSynchronize() );
    }

    void findKnnMatchDispatcher(int k, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, cudaStream_t stream)
    {
        findKnnMatch_caller<256>(k, static_cast<DevMem2Di>(trainIdx), static_cast<DevMem2Df>(distance), static_cast<DevMem2Df>(allDist), stream);
    }

    ///////////////////////////////////////////////////////////////////////////////
    // knn match Dispatcher

    template <typename Dist, typename T>
    void knnMatchDispatcher(const DevMem2D_<T>& query, const DevMem2D_<T>& train, int k, const DevMem2D& mask,
        const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist,
        int cc, cudaStream_t stream)
    {
        if (mask.data)
        {
            if (k == 2)
            {
                knnMatch2Dispatcher<Dist>(query, train, SingleMask(mask), trainIdx, distance, cc, stream);
                return;
            }

            calcDistanceDispatcher<Dist>(query, train, SingleMask(mask), allDist, stream);
        }
        else
        {
            if (k == 2)
            {
                knnMatch2Dispatcher<Dist>(query, train, WithOutMask(), trainIdx, distance, cc, stream);
                return;
            }

            calcDistanceDispatcher<Dist>(query, train, WithOutMask(), allDist, stream);
        }

        findKnnMatchDispatcher(k, trainIdx, distance, allDist, stream);
    }

    ///////////////////////////////////////////////////////////////////////////////
    // knn match caller

    template <typename T> void knnMatchL1_gpu(const DevMem2D& query, const DevMem2D& train, int k, const DevMem2D& mask,
        const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist,
        int cc, cudaStream_t stream)
    {
        knnMatchDispatcher< L1Dist<T> >(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), k, mask, trainIdx, distance, allDist, cc, stream);
    }

    template void knnMatchL1_gpu<uchar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, int cc, cudaStream_t stream);
    template void knnMatchL1_gpu<schar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, int cc, cudaStream_t stream);
    template void knnMatchL1_gpu<ushort>(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, int cc, cudaStream_t stream);
    template void knnMatchL1_gpu<short >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, int cc, cudaStream_t stream);
    template void knnMatchL1_gpu<int   >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, int cc, cudaStream_t stream);
    template void knnMatchL1_gpu<float >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, int cc, cudaStream_t stream);

    template <typename T> void knnMatchL2_gpu(const DevMem2D& query, const DevMem2D& train, int k, const DevMem2D& mask,
        const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist,
        int cc, cudaStream_t stream)
    {
        knnMatchDispatcher<L2Dist>(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), k, mask, trainIdx, distance, allDist, cc, stream);
    }

    template void knnMatchL2_gpu<uchar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, int cc, cudaStream_t stream);
    template void knnMatchL2_gpu<schar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, int cc, cudaStream_t stream);
    template void knnMatchL2_gpu<ushort>(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, int cc, cudaStream_t stream);
    template void knnMatchL2_gpu<short >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, int cc, cudaStream_t stream);
    template void knnMatchL2_gpu<int   >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, int cc, cudaStream_t stream);
    template void knnMatchL2_gpu<float >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, int cc, cudaStream_t stream);

    template <typename T> void knnMatchHamming_gpu(const DevMem2D& query, const DevMem2D& train, int k, const DevMem2D& mask,
        const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist,
        int cc, cudaStream_t stream)
    {
        knnMatchDispatcher<HammingDist>(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), k, mask, trainIdx, distance, allDist, cc, stream);
    }

    template void knnMatchHamming_gpu<uchar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, int cc, cudaStream_t stream);
    template void knnMatchHamming_gpu<schar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, int cc, cudaStream_t stream);
    template void knnMatchHamming_gpu<ushort>(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, int cc, cudaStream_t stream);
    template void knnMatchHamming_gpu<short >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, int cc, cudaStream_t stream);
    template void knnMatchHamming_gpu<int   >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, int cc, cudaStream_t stream);
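
    // Editor's sketch (assumption): how the host-side matcher is presumed to drive
    // these entry points. Buffer types follow the casts above: trainIdx is CV_32S,
    // distance and allDist are CV_32F; an empty mask selects the WithOutMask path.
    static void knnMatchUsage_sketch(const DevMem2D& query, const DevMem2D& train,
                                     const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist,
                                     int cc, cudaStream_t stream)
    {
        // k == 2 takes the specialized knnMatch2Dispatcher fast path above.
        knnMatchL2_gpu<float>(query, train, 2, DevMem2D(), trainIdx, distance, allDist, cc, stream);
    }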

    ///////////////////////////////////////////////////////////////////////////////////
    /////////////////////////////////// Radius Match //////////////////////////////////
    ///////////////////////////////////////////////////////////////////////////////////

    template <int BLOCK_DIM_X, int BLOCK_DIM_Y, typename Dist, typename T, typename Mask>
    __global__ void radiusMatch(const PtrStep_<T> query, const DevMem2D_<T> train, float maxDistance, const Mask mask,
        DevMem2Di trainIdx_, unsigned int* nMatches, PtrStepf distance)
    {
        #if __CUDA_ARCH__ >= 110 // atomicInc on global memory requires compute capability 1.1+

        __shared__ typename Dist::result_type smem[BLOCK_DIM_X * BLOCK_DIM_Y];

        typename Dist::result_type* sdiff_row = smem + BLOCK_DIM_X * threadIdx.y;

        const int queryIdx = blockIdx.x;
        const T* queryDescs = query.ptr(queryIdx);

        const int trainIdx = blockIdx.y * BLOCK_DIM_Y + threadIdx.y;

        if (trainIdx < train.rows)
        {
            const T* trainDescs = train.ptr(trainIdx);

            if (mask(queryIdx, trainIdx))
            {
                Dist dist;

                calcVecDiffGlobal<BLOCK_DIM_X>(queryDescs, trainDescs, train.cols, dist, sdiff_row, threadIdx.x);

                if (threadIdx.x == 0)
                {
                    if (dist < maxDistance)
                    {
                        unsigned int i = atomicInc(nMatches + queryIdx, (unsigned int) -1);
                        if (i < trainIdx_.cols)
                        {
                            distance.ptr(queryIdx)[i] = dist;
                            trainIdx_.ptr(queryIdx)[i] = trainIdx;
                        }
                    }
                }
            }
        }

        #endif
    }
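
    // Editor's sketch (assumption): the serial equivalent of the atomicInc-based
    // compaction above. nMatches[queryIdx] is both the match count and the write
    // cursor into the per-query result row; overflow matches are counted but dropped.
    static inline void pushMatch_sketch(unsigned int* nMatches, int queryIdx, int rowCapacity,
                                        int* trainIdxRow, float* distanceRow, int trainIdx, float dist)
    {
        unsigned int i = nMatches[queryIdx]++; // atomicInc(nMatches + queryIdx, (unsigned int)-1) on the device
        if (i < (unsigned int)rowCapacity)
        {
            distanceRow[i] = dist;
            trainIdxRow[i] = trainIdx;
        }
    }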

    ///////////////////////////////////////////////////////////////////////////////
    // Radius Match kernel caller

    template <int BLOCK_DIM_X, int BLOCK_DIM_Y, typename Dist, typename T, typename Mask>
    void radiusMatch_caller(const DevMem2D_<T>& query, const DevMem2D_<T>& train, float maxDistance, const Mask& mask,
        const DevMem2Di& trainIdx, const DevMem2D_<unsigned int>& nMatches, const DevMem2Df& distance,
        cudaStream_t stream)
    {
        const dim3 threads(BLOCK_DIM_X, BLOCK_DIM_Y, 1);
        const dim3 grid(query.rows, divUp(train.rows, BLOCK_DIM_Y), 1);

        radiusMatch<BLOCK_DIM_X, BLOCK_DIM_Y, Dist, T><<<grid, threads, 0, stream>>>(query, train, maxDistance, mask, trainIdx, nMatches.data, distance);
        cudaSafeCall( cudaGetLastError() );

        if (stream == 0)
            cudaSafeCall( cudaDeviceSynchronize() );
    }

    ///////////////////////////////////////////////////////////////////////////////
    // Radius Match Dispatcher

    template <typename Dist, typename T, typename Mask>
    void radiusMatchDispatcher(const DevMem2D_<T>& query, const DevMem2D_<T>& train, float maxDistance, const Mask& mask,
        const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance,
        cudaStream_t stream)
    {
        radiusMatch_caller<16, 16, Dist>(query, train, maxDistance, mask,
            static_cast<DevMem2Di>(trainIdx), static_cast< const DevMem2D_<unsigned int> >(nMatches), static_cast<DevMem2Df>(distance),
            stream);
    }

    ///////////////////////////////////////////////////////////////////////////////
    // Radius Match caller

    template <typename T> void radiusMatchL1_gpu(const DevMem2D& query, const DevMem2D& train, float maxDistance, const DevMem2D& mask,
        const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance,
        cudaStream_t stream)
    {
        if (mask.data)
        {
            radiusMatchDispatcher< L1Dist<T> >(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), maxDistance, SingleMask(mask),
                trainIdx, nMatches, distance,
                stream);
        }
        else
        {
            radiusMatchDispatcher< L1Dist<T> >(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), maxDistance, WithOutMask(),
                trainIdx, nMatches, distance,
                stream);
        }
    }

    template void radiusMatchL1_gpu<uchar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance, cudaStream_t stream);
    template void radiusMatchL1_gpu<schar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance, cudaStream_t stream);
    template void radiusMatchL1_gpu<ushort>(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance, cudaStream_t stream);
    template void radiusMatchL1_gpu<short >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance, cudaStream_t stream);
    template void radiusMatchL1_gpu<int   >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance, cudaStream_t stream);
    template void radiusMatchL1_gpu<float >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance, cudaStream_t stream);

    template <typename T> void radiusMatchL2_gpu(const DevMem2D& query, const DevMem2D& train, float maxDistance, const DevMem2D& mask,
        const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance,
        cudaStream_t stream)
    {
        if (mask.data)
        {
            radiusMatchDispatcher<L2Dist>(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), maxDistance, SingleMask(mask),
                trainIdx, nMatches, distance,
                stream);
        }
        else
        {
            radiusMatchDispatcher<L2Dist>(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), maxDistance, WithOutMask(),
                trainIdx, nMatches, distance,
                stream);
        }
    }

    template void radiusMatchL2_gpu<uchar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance, cudaStream_t stream);
    template void radiusMatchL2_gpu<schar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance, cudaStream_t stream);
    template void radiusMatchL2_gpu<ushort>(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance, cudaStream_t stream);
    template void radiusMatchL2_gpu<short >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance, cudaStream_t stream);
    template void radiusMatchL2_gpu<int   >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance, cudaStream_t stream);
    template void radiusMatchL2_gpu<float >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance, cudaStream_t stream);

    template <typename T> void radiusMatchHamming_gpu(const DevMem2D& query, const DevMem2D& train, float maxDistance, const DevMem2D& mask,
        const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance,
        cudaStream_t stream)
    {
        if (mask.data)
        {
            radiusMatchDispatcher<HammingDist>(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), maxDistance, SingleMask(mask),
                trainIdx, nMatches, distance,
                stream);
        }
        else
        {
            radiusMatchDispatcher<HammingDist>(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), maxDistance, WithOutMask(),
                trainIdx, nMatches, distance,
                stream);
        }
    }

    template void radiusMatchHamming_gpu<uchar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance, cudaStream_t stream);
    template void radiusMatchHamming_gpu<schar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance, cudaStream_t stream);
    template void radiusMatchHamming_gpu<ushort>(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance, cudaStream_t stream);
    template void radiusMatchHamming_gpu<short >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance, cudaStream_t stream);
    template void radiusMatchHamming_gpu<int   >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance, cudaStream_t stream);
}}}
240  modules/gpu/src/cuda/column_filter.cu  Normal file
@@ -0,0 +1,240 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                           License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistribution's of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistribution's in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/

#include "internal_shared.hpp"
#include "opencv2/gpu/device/saturate_cast.hpp"
#include "opencv2/gpu/device/vec_math.hpp"
#include "opencv2/gpu/device/limits.hpp"
#include "opencv2/gpu/device/border_interpolate.hpp"

using namespace cv::gpu;
using namespace cv::gpu::device;

#define MAX_KERNEL_SIZE 16
#define BLOCK_DIM_X 16
#define BLOCK_DIM_Y 16

namespace filter_krnls_column
{
    __constant__ float cLinearKernel[MAX_KERNEL_SIZE];

    void loadLinearKernel(const float kernel[], int ksize)
    {
        cudaSafeCall( cudaMemcpyToSymbol(cLinearKernel, kernel, ksize * sizeof(float)) );
    }
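
    // Editor's sketch (assumption): a plausible host-side producer for the kernel
    // uploaded above -- a normalized 1-D Gaussian, as the separable-filter host
    // wrappers are presumed to compute before calling loadLinearKernel.
    static inline void makeGaussianKernel_sketch(float* kernel, int ksize, float sigma)
    {
        float sum = 0.0f;
        for (int i = 0; i < ksize; ++i)
        {
            const float x = i - (ksize - 1) * 0.5f;
            kernel[i] = expf(-x * x / (2.0f * sigma * sigma));
            sum += kernel[i];
        }
        for (int i = 0; i < ksize; ++i)
            kernel[i] /= sum; // weights sum to 1, so mean intensity is preserved
    }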

    template <int ksize, typename T, typename D, typename B>
    __global__ void linearColumnFilter(const DevMem2D_<T> src, PtrStep_<D> dst, int anchor, const B b)
    {
        __shared__ T smem[BLOCK_DIM_Y * BLOCK_DIM_X * 3];

        const int x = BLOCK_DIM_X * blockIdx.x + threadIdx.x;
        const int y = BLOCK_DIM_Y * blockIdx.y + threadIdx.y;

        T* sDataColumn = smem + threadIdx.x;

        if (x < src.cols)
        {
            const T* srcCol = src.ptr() + x;

            sDataColumn[ threadIdx.y                    * BLOCK_DIM_X] = b.at_low(y - BLOCK_DIM_Y, srcCol, src.step);
            sDataColumn[(threadIdx.y + BLOCK_DIM_Y)     * BLOCK_DIM_X] = b.at_high(y, srcCol, src.step);
            sDataColumn[(threadIdx.y + BLOCK_DIM_Y * 2) * BLOCK_DIM_X] = b.at_high(y + BLOCK_DIM_Y, srcCol, src.step);

            __syncthreads();

            if (y < src.rows)
            {
                typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type sum_t;
                sum_t sum = VecTraits<sum_t>::all(0);

                sDataColumn += (threadIdx.y + BLOCK_DIM_Y - anchor) * BLOCK_DIM_X;

                #pragma unroll
                for(int i = 0; i < ksize; ++i)
                    sum = sum + sDataColumn[i * BLOCK_DIM_X] * cLinearKernel[i];

                dst.ptr(y)[x] = saturate_cast<D>(sum);
            }
        }
    }
}
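
// Editor's sketch (assumption): why the 3x-tall shared tile above is always big
// enough. A thread reads staged rows (threadIdx.y + BLOCK_DIM_Y - anchor + i),
// i in [0, ksize); with BLOCK_DIM_Y == 16, ksize <= MAX_KERNEL_SIZE == 16 and
// anchor in [0, ksize), the index stays inside the staged region.
static inline bool smemRowInRange_sketch(int ty, int anchor, int i)
{
    const int row = ty + 16 /*BLOCK_DIM_Y*/ - anchor + i;
    return 0 <= row && row < 48; // 48 == 3 * BLOCK_DIM_Y rows staged per column
}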

namespace cv { namespace gpu { namespace filters
{
    template <int ksize, typename T, typename D, template<typename> class B>
    void linearColumnFilter_caller(const DevMem2D_<T>& src, const DevMem2D_<D>& dst, int anchor, cudaStream_t stream)
    {
        dim3 threads(BLOCK_DIM_X, BLOCK_DIM_Y);
        dim3 grid(divUp(src.cols, BLOCK_DIM_X), divUp(src.rows, BLOCK_DIM_Y));

        B<T> b(src.rows);

        if (!b.is_range_safe(-BLOCK_DIM_Y, (grid.y + 1) * BLOCK_DIM_Y - 1))
        {
            cv::gpu::error("linearColumnFilter: can't use specified border extrapolation, image is too small, "
                           "try bigger image or another border extrapolation mode", __FILE__, __LINE__);
        }

        filter_krnls_column::linearColumnFilter<ksize, T, D><<<grid, threads, 0, stream>>>(src, dst, anchor, b);
        cudaSafeCall( cudaGetLastError() );

        if (stream == 0)
            cudaSafeCall( cudaDeviceSynchronize() );
    }

    template <typename T, typename D>
    void linearColumnFilter_gpu(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream)
    {
        typedef void (*caller_t)(const DevMem2D_<T>& src, const DevMem2D_<D>& dst, int anchor, cudaStream_t stream);
        static const caller_t callers[5][17] =
        {
            {
                0,
                linearColumnFilter_caller<1 , T, D, BrdColReflect101>,
                linearColumnFilter_caller<2 , T, D, BrdColReflect101>,
                linearColumnFilter_caller<3 , T, D, BrdColReflect101>,
                linearColumnFilter_caller<4 , T, D, BrdColReflect101>,
                linearColumnFilter_caller<5 , T, D, BrdColReflect101>,
                linearColumnFilter_caller<6 , T, D, BrdColReflect101>,
                linearColumnFilter_caller<7 , T, D, BrdColReflect101>,
                linearColumnFilter_caller<8 , T, D, BrdColReflect101>,
                linearColumnFilter_caller<9 , T, D, BrdColReflect101>,
                linearColumnFilter_caller<10, T, D, BrdColReflect101>,
                linearColumnFilter_caller<11, T, D, BrdColReflect101>,
                linearColumnFilter_caller<12, T, D, BrdColReflect101>,
                linearColumnFilter_caller<13, T, D, BrdColReflect101>,
                linearColumnFilter_caller<14, T, D, BrdColReflect101>,
                linearColumnFilter_caller<15, T, D, BrdColReflect101>,
                linearColumnFilter_caller<16, T, D, BrdColReflect101>
            },
            {
                0,
                linearColumnFilter_caller<1 , T, D, BrdColReplicate>,
                linearColumnFilter_caller<2 , T, D, BrdColReplicate>,
                linearColumnFilter_caller<3 , T, D, BrdColReplicate>,
                linearColumnFilter_caller<4 , T, D, BrdColReplicate>,
                linearColumnFilter_caller<5 , T, D, BrdColReplicate>,
                linearColumnFilter_caller<6 , T, D, BrdColReplicate>,
                linearColumnFilter_caller<7 , T, D, BrdColReplicate>,
                linearColumnFilter_caller<8 , T, D, BrdColReplicate>,
                linearColumnFilter_caller<9 , T, D, BrdColReplicate>,
                linearColumnFilter_caller<10, T, D, BrdColReplicate>,
                linearColumnFilter_caller<11, T, D, BrdColReplicate>,
                linearColumnFilter_caller<12, T, D, BrdColReplicate>,
                linearColumnFilter_caller<13, T, D, BrdColReplicate>,
                linearColumnFilter_caller<14, T, D, BrdColReplicate>,
                linearColumnFilter_caller<15, T, D, BrdColReplicate>,
                linearColumnFilter_caller<16, T, D, BrdColReplicate>
            },
            {
                0,
                linearColumnFilter_caller<1 , T, D, BrdColConstant>,
                linearColumnFilter_caller<2 , T, D, BrdColConstant>,
                linearColumnFilter_caller<3 , T, D, BrdColConstant>,
                linearColumnFilter_caller<4 , T, D, BrdColConstant>,
                linearColumnFilter_caller<5 , T, D, BrdColConstant>,
                linearColumnFilter_caller<6 , T, D, BrdColConstant>,
                linearColumnFilter_caller<7 , T, D, BrdColConstant>,
                linearColumnFilter_caller<8 , T, D, BrdColConstant>,
                linearColumnFilter_caller<9 , T, D, BrdColConstant>,
                linearColumnFilter_caller<10, T, D, BrdColConstant>,
                linearColumnFilter_caller<11, T, D, BrdColConstant>,
                linearColumnFilter_caller<12, T, D, BrdColConstant>,
                linearColumnFilter_caller<13, T, D, BrdColConstant>,
                linearColumnFilter_caller<14, T, D, BrdColConstant>,
                linearColumnFilter_caller<15, T, D, BrdColConstant>,
                linearColumnFilter_caller<16, T, D, BrdColConstant>
            },
            {
                0,
                linearColumnFilter_caller<1 , T, D, BrdColReflect>,
                linearColumnFilter_caller<2 , T, D, BrdColReflect>,
                linearColumnFilter_caller<3 , T, D, BrdColReflect>,
                linearColumnFilter_caller<4 , T, D, BrdColReflect>,
                linearColumnFilter_caller<5 , T, D, BrdColReflect>,
                linearColumnFilter_caller<6 , T, D, BrdColReflect>,
                linearColumnFilter_caller<7 , T, D, BrdColReflect>,
                linearColumnFilter_caller<8 , T, D, BrdColReflect>,
                linearColumnFilter_caller<9 , T, D, BrdColReflect>,
                linearColumnFilter_caller<10, T, D, BrdColReflect>,
                linearColumnFilter_caller<11, T, D, BrdColReflect>,
                linearColumnFilter_caller<12, T, D, BrdColReflect>,
                linearColumnFilter_caller<13, T, D, BrdColReflect>,
                linearColumnFilter_caller<14, T, D, BrdColReflect>,
                linearColumnFilter_caller<15, T, D, BrdColReflect>,
                linearColumnFilter_caller<16, T, D, BrdColReflect>
            },
            {
                0,
                linearColumnFilter_caller<1 , T, D, BrdColWrap>,
                linearColumnFilter_caller<2 , T, D, BrdColWrap>,
                linearColumnFilter_caller<3 , T, D, BrdColWrap>,
                linearColumnFilter_caller<4 , T, D, BrdColWrap>,
                linearColumnFilter_caller<5 , T, D, BrdColWrap>,
                linearColumnFilter_caller<6 , T, D, BrdColWrap>,
                linearColumnFilter_caller<7 , T, D, BrdColWrap>,
                linearColumnFilter_caller<8 , T, D, BrdColWrap>,
                linearColumnFilter_caller<9 , T, D, BrdColWrap>,
                linearColumnFilter_caller<10, T, D, BrdColWrap>,
                linearColumnFilter_caller<11, T, D, BrdColWrap>,
                linearColumnFilter_caller<12, T, D, BrdColWrap>,
                linearColumnFilter_caller<13, T, D, BrdColWrap>,
                linearColumnFilter_caller<14, T, D, BrdColWrap>,
                linearColumnFilter_caller<15, T, D, BrdColWrap>,
                linearColumnFilter_caller<16, T, D, BrdColWrap>
            }
        };

        filter_krnls_column::loadLinearKernel(kernel, ksize);

        callers[brd_type][ksize]((DevMem2D_<T>)src, (DevMem2D_<D>)dst, anchor, stream);
    }
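
    // Editor's sketch (assumption): how a host wrapper is presumed to reach this
    // dispatcher. brd_type indexes the outer table in the fixed order
    // {REFLECT101, REPLICATE, CONSTANT, REFLECT, WRAP}; ksize indexes the inner
    // table directly (slot 0 is a null sentinel, so ksize must be in [1, 16]).
    static void columnFilterUsage_sketch(const DevMem2D& src, const DevMem2D& dst, cudaStream_t stream)
    {
        const float kernel[3] = { 0.25f, 0.5f, 0.25f }; // simple normalized smoothing kernel
        linearColumnFilter_gpu<float, uchar>(src, dst, kernel, 3, 1 /*anchor: center tap*/, 0 /*REFLECT101 slot*/, stream);
    }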

    template void linearColumnFilter_gpu<float , uchar >(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
    template void linearColumnFilter_gpu<float4, uchar4>(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
    //template void linearColumnFilter_gpu<float , short >(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
    //template void linearColumnFilter_gpu<float2, short2>(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
    template void linearColumnFilter_gpu<float3, short3>(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
    template void linearColumnFilter_gpu<float , int   >(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
    template void linearColumnFilter_gpu<float , float >(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
}}}
@@ -1,633 +0,0 @@
/* (standard OpenCV license header, identical to the block at the top of column_filter.cu above) */
|
||||
#include "opencv2/gpu/device/saturate_cast.hpp"
|
||||
#include "opencv2/gpu/device/vec_math.hpp"
|
||||
#include "opencv2/gpu/device/limits.hpp"
|
||||
#include "opencv2/gpu/device/border_interpolate.hpp"
|
||||
|
||||
using namespace cv::gpu;
|
||||
using namespace cv::gpu::device;
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
// Linear filters
|
||||
|
||||
#define MAX_KERNEL_SIZE 16
|
||||
#define BLOCK_DIM_X 16
|
||||
#define BLOCK_DIM_Y 16
|
||||
|
||||
namespace filter_krnls
|
||||
{
|
||||
__constant__ float cLinearKernel[MAX_KERNEL_SIZE];
|
||||
}
|
||||
|
||||
namespace cv { namespace gpu { namespace filters
|
||||
{
|
||||
void loadLinearKernel(const float kernel[], int ksize)
|
||||
{
|
||||
cudaSafeCall( cudaMemcpyToSymbol(filter_krnls::cLinearKernel, kernel, ksize * sizeof(float)) );
|
||||
}
|
||||
}}}
|
||||
|
||||
namespace filter_krnls
|
||||
{
|
||||
    template <typename T, size_t size> struct SmemType_
    {
        typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type smem_t;
    };
    template <typename T> struct SmemType_<T, 4>
    {
        typedef T smem_t;
    };
    template <typename T> struct SmemType
    {
        typedef typename SmemType_<T, sizeof(T)>::smem_t smem_t;
    };
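
    // Editor's note (hedged): SmemType keeps 4-byte element types in shared memory
    // as-is and promotes everything else to the matching float vector, trading
    // conversion cost against shared-memory footprint. Compile-time spot checks of
    // both branches (pre-C++11 negative-array-size assert):
    typedef char assertSmemKept_sketch    [sizeof(SmemType<uchar4>::smem_t) == sizeof(uchar4) ? 1 : -1];
    typedef char assertSmemPromoted_sketch[sizeof(SmemType<short3>::smem_t) == sizeof(float3) ? 1 : -1];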

    template <int ksize, typename T, typename D, typename B>
    __global__ void linearRowFilter(const DevMem2D_<T> src, PtrStep_<D> dst, int anchor, const B b)
    {
        typedef typename SmemType<T>::smem_t smem_t;

        __shared__ smem_t smem[BLOCK_DIM_Y * BLOCK_DIM_X * 3];

        const int x = BLOCK_DIM_X * blockIdx.x + threadIdx.x;
        const int y = BLOCK_DIM_Y * blockIdx.y + threadIdx.y;

        smem_t* sDataRow = smem + threadIdx.y * BLOCK_DIM_X * 3;

        if (y < src.rows)
        {
            const T* rowSrc = src.ptr(y);

            sDataRow[threadIdx.x                  ] = b.at_low(x - BLOCK_DIM_X, rowSrc);
            sDataRow[threadIdx.x + BLOCK_DIM_X    ] = b.at_high(x, rowSrc);
            sDataRow[threadIdx.x + BLOCK_DIM_X * 2] = b.at_high(x + BLOCK_DIM_X, rowSrc);

            __syncthreads();

            if (x < src.cols)
            {
                typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type sum_t;
                sum_t sum = VecTraits<sum_t>::all(0);

                sDataRow += threadIdx.x + BLOCK_DIM_X - anchor;

                #pragma unroll
                for(int i = 0; i < ksize; ++i)
                    sum = sum + sDataRow[i] * cLinearKernel[i];

                dst.ptr(y)[x] = saturate_cast<D>(sum);
            }
        }
    }
}

namespace cv { namespace gpu { namespace filters
{
    template <int ksize, typename T, typename D, template<typename> class B>
    void linearRowFilter_caller(const DevMem2D_<T>& src, const DevMem2D_<D>& dst, int anchor, cudaStream_t stream)
    {
        dim3 threads(BLOCK_DIM_X, BLOCK_DIM_Y);
        dim3 grid(divUp(src.cols, BLOCK_DIM_X), divUp(src.rows, BLOCK_DIM_Y));

        typedef typename filter_krnls::SmemType<T>::smem_t smem_t;
        B<smem_t> b(src.cols);

        if (!b.is_range_safe(-BLOCK_DIM_X, (grid.x + 1) * BLOCK_DIM_X - 1))
        {
            cv::gpu::error("linearRowFilter: can't use specified border extrapolation, image is too small, "
                           "try bigger image or another border extrapolation mode", __FILE__, __LINE__);
        }

        filter_krnls::linearRowFilter<ksize, T, D><<<grid, threads, 0, stream>>>(src, dst, anchor, b);
        cudaSafeCall( cudaGetLastError() );

        if (stream == 0)
            cudaSafeCall( cudaDeviceSynchronize() );
    }

    template <typename T, typename D>
    void linearRowFilter_gpu(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream)
    {
        typedef void (*caller_t)(const DevMem2D_<T>& src, const DevMem2D_<D>& dst, int anchor, cudaStream_t stream);
        static const caller_t callers[5][17] =
        {
            {
                0,
                linearRowFilter_caller<1 , T, D, BrdRowReflect101>,
                linearRowFilter_caller<2 , T, D, BrdRowReflect101>,
                linearRowFilter_caller<3 , T, D, BrdRowReflect101>,
                linearRowFilter_caller<4 , T, D, BrdRowReflect101>,
                linearRowFilter_caller<5 , T, D, BrdRowReflect101>,
                linearRowFilter_caller<6 , T, D, BrdRowReflect101>,
                linearRowFilter_caller<7 , T, D, BrdRowReflect101>,
                linearRowFilter_caller<8 , T, D, BrdRowReflect101>,
                linearRowFilter_caller<9 , T, D, BrdRowReflect101>,
                linearRowFilter_caller<10, T, D, BrdRowReflect101>,
                linearRowFilter_caller<11, T, D, BrdRowReflect101>,
                linearRowFilter_caller<12, T, D, BrdRowReflect101>,
                linearRowFilter_caller<13, T, D, BrdRowReflect101>,
                linearRowFilter_caller<14, T, D, BrdRowReflect101>,
                linearRowFilter_caller<15, T, D, BrdRowReflect101>,
                linearRowFilter_caller<16, T, D, BrdRowReflect101>
            },
            {
                0,
                linearRowFilter_caller<1 , T, D, BrdRowReplicate>,
                linearRowFilter_caller<2 , T, D, BrdRowReplicate>,
                linearRowFilter_caller<3 , T, D, BrdRowReplicate>,
                linearRowFilter_caller<4 , T, D, BrdRowReplicate>,
                linearRowFilter_caller<5 , T, D, BrdRowReplicate>,
                linearRowFilter_caller<6 , T, D, BrdRowReplicate>,
                linearRowFilter_caller<7 , T, D, BrdRowReplicate>,
                linearRowFilter_caller<8 , T, D, BrdRowReplicate>,
                linearRowFilter_caller<9 , T, D, BrdRowReplicate>,
                linearRowFilter_caller<10, T, D, BrdRowReplicate>,
                linearRowFilter_caller<11, T, D, BrdRowReplicate>,
                linearRowFilter_caller<12, T, D, BrdRowReplicate>,
                linearRowFilter_caller<13, T, D, BrdRowReplicate>,
                linearRowFilter_caller<14, T, D, BrdRowReplicate>,
                linearRowFilter_caller<15, T, D, BrdRowReplicate>,
                linearRowFilter_caller<16, T, D, BrdRowReplicate>
            },
            {
                0,
                linearRowFilter_caller<1 , T, D, BrdRowConstant>,
                linearRowFilter_caller<2 , T, D, BrdRowConstant>,
                linearRowFilter_caller<3 , T, D, BrdRowConstant>,
                linearRowFilter_caller<4 , T, D, BrdRowConstant>,
                linearRowFilter_caller<5 , T, D, BrdRowConstant>,
                linearRowFilter_caller<6 , T, D, BrdRowConstant>,
                linearRowFilter_caller<7 , T, D, BrdRowConstant>,
                linearRowFilter_caller<8 , T, D, BrdRowConstant>,
                linearRowFilter_caller<9 , T, D, BrdRowConstant>,
                linearRowFilter_caller<10, T, D, BrdRowConstant>,
                linearRowFilter_caller<11, T, D, BrdRowConstant>,
                linearRowFilter_caller<12, T, D, BrdRowConstant>,
                linearRowFilter_caller<13, T, D, BrdRowConstant>,
                linearRowFilter_caller<14, T, D, BrdRowConstant>,
                linearRowFilter_caller<15, T, D, BrdRowConstant>,
                linearRowFilter_caller<16, T, D, BrdRowConstant>
            },
            {
                0,
                linearRowFilter_caller<1 , T, D, BrdRowReflect>,
                linearRowFilter_caller<2 , T, D, BrdRowReflect>,
                linearRowFilter_caller<3 , T, D, BrdRowReflect>,
                linearRowFilter_caller<4 , T, D, BrdRowReflect>,
                linearRowFilter_caller<5 , T, D, BrdRowReflect>,
                linearRowFilter_caller<6 , T, D, BrdRowReflect>,
                linearRowFilter_caller<7 , T, D, BrdRowReflect>,
                linearRowFilter_caller<8 , T, D, BrdRowReflect>,
                linearRowFilter_caller<9 , T, D, BrdRowReflect>,
                linearRowFilter_caller<10, T, D, BrdRowReflect>,
                linearRowFilter_caller<11, T, D, BrdRowReflect>,
                linearRowFilter_caller<12, T, D, BrdRowReflect>,
                linearRowFilter_caller<13, T, D, BrdRowReflect>,
                linearRowFilter_caller<14, T, D, BrdRowReflect>,
                linearRowFilter_caller<15, T, D, BrdRowReflect>,
                linearRowFilter_caller<16, T, D, BrdRowReflect>
            },
            {
                0,
                linearRowFilter_caller<1 , T, D, BrdRowWrap>,
                linearRowFilter_caller<2 , T, D, BrdRowWrap>,
                linearRowFilter_caller<3 , T, D, BrdRowWrap>,
                linearRowFilter_caller<4 , T, D, BrdRowWrap>,
                linearRowFilter_caller<5 , T, D, BrdRowWrap>,
                linearRowFilter_caller<6 , T, D, BrdRowWrap>,
                linearRowFilter_caller<7 , T, D, BrdRowWrap>,
                linearRowFilter_caller<8 , T, D, BrdRowWrap>,
                linearRowFilter_caller<9 , T, D, BrdRowWrap>,
                linearRowFilter_caller<10, T, D, BrdRowWrap>,
                linearRowFilter_caller<11, T, D, BrdRowWrap>,
                linearRowFilter_caller<12, T, D, BrdRowWrap>,
                linearRowFilter_caller<13, T, D, BrdRowWrap>,
                linearRowFilter_caller<14, T, D, BrdRowWrap>,
                linearRowFilter_caller<15, T, D, BrdRowWrap>,
                linearRowFilter_caller<16, T, D, BrdRowWrap>
            }
        };

        loadLinearKernel(kernel, ksize);

        callers[brd_type][ksize]((DevMem2D_<T>)src, (DevMem2D_<D>)dst, anchor, stream);
    }

    template void linearRowFilter_gpu<uchar , float >(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
    template void linearRowFilter_gpu<uchar4, float4>(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
    template void linearRowFilter_gpu<short , float >(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
    template void linearRowFilter_gpu<short2, float2>(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
    template void linearRowFilter_gpu<short3, float3>(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
    template void linearRowFilter_gpu<int   , float >(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
    template void linearRowFilter_gpu<float , float >(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
}}}

namespace filter_krnls
{
    template <int ksize, typename T, typename D, typename B>
    __global__ void linearColumnFilter(const DevMem2D_<T> src, PtrStep_<D> dst, int anchor, const B b)
    {
        __shared__ T smem[BLOCK_DIM_Y * BLOCK_DIM_X * 3];

        const int x = BLOCK_DIM_X * blockIdx.x + threadIdx.x;
        const int y = BLOCK_DIM_Y * blockIdx.y + threadIdx.y;

        T* sDataColumn = smem + threadIdx.x;

        if (x < src.cols)
        {
            const T* srcCol = src.ptr() + x;

            sDataColumn[ threadIdx.y                    * BLOCK_DIM_X] = b.at_low(y - BLOCK_DIM_Y, srcCol, src.step);
            sDataColumn[(threadIdx.y + BLOCK_DIM_Y)     * BLOCK_DIM_X] = b.at_high(y, srcCol, src.step);
            sDataColumn[(threadIdx.y + BLOCK_DIM_Y * 2) * BLOCK_DIM_X] = b.at_high(y + BLOCK_DIM_Y, srcCol, src.step);

            __syncthreads();

            if (y < src.rows)
            {
                typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type sum_t;
                sum_t sum = VecTraits<sum_t>::all(0);

                sDataColumn += (threadIdx.y + BLOCK_DIM_Y - anchor) * BLOCK_DIM_X;

                #pragma unroll
                for(int i = 0; i < ksize; ++i)
                    sum = sum + sDataColumn[i * BLOCK_DIM_X] * cLinearKernel[i];

                dst.ptr(y)[x] = saturate_cast<D>(sum);
            }
        }
    }
}

namespace cv { namespace gpu { namespace filters
{
    template <int ksize, typename T, typename D, template<typename> class B>
    void linearColumnFilter_caller(const DevMem2D_<T>& src, const DevMem2D_<D>& dst, int anchor, cudaStream_t stream)
    {
        dim3 threads(BLOCK_DIM_X, BLOCK_DIM_Y);
        dim3 grid(divUp(src.cols, BLOCK_DIM_X), divUp(src.rows, BLOCK_DIM_Y));

        B<T> b(src.rows);

        if (!b.is_range_safe(-BLOCK_DIM_Y, (grid.y + 1) * BLOCK_DIM_Y - 1))
        {
            cv::gpu::error("linearColumnFilter: can't use specified border extrapolation, image is too small, "
                           "try bigger image or another border extrapolation mode", __FILE__, __LINE__);
        }

        filter_krnls::linearColumnFilter<ksize, T, D><<<grid, threads, 0, stream>>>(src, dst, anchor, b);
        cudaSafeCall( cudaGetLastError() );

        if (stream == 0)
            cudaSafeCall( cudaDeviceSynchronize() );
    }

    template <typename T, typename D>
    void linearColumnFilter_gpu(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream)
    {
        typedef void (*caller_t)(const DevMem2D_<T>& src, const DevMem2D_<D>& dst, int anchor, cudaStream_t stream);
        static const caller_t callers[5][17] =
        {
            {
                0,
                linearColumnFilter_caller<1 , T, D, BrdColReflect101>,
                linearColumnFilter_caller<2 , T, D, BrdColReflect101>,
                linearColumnFilter_caller<3 , T, D, BrdColReflect101>,
                linearColumnFilter_caller<4 , T, D, BrdColReflect101>,
                linearColumnFilter_caller<5 , T, D, BrdColReflect101>,
                linearColumnFilter_caller<6 , T, D, BrdColReflect101>,
                linearColumnFilter_caller<7 , T, D, BrdColReflect101>,
                linearColumnFilter_caller<8 , T, D, BrdColReflect101>,
                linearColumnFilter_caller<9 , T, D, BrdColReflect101>,
                linearColumnFilter_caller<10, T, D, BrdColReflect101>,
                linearColumnFilter_caller<11, T, D, BrdColReflect101>,
                linearColumnFilter_caller<12, T, D, BrdColReflect101>,
                linearColumnFilter_caller<13, T, D, BrdColReflect101>,
                linearColumnFilter_caller<14, T, D, BrdColReflect101>,
                linearColumnFilter_caller<15, T, D, BrdColReflect101>,
                linearColumnFilter_caller<16, T, D, BrdColReflect101>
            },
            {
                0,
                linearColumnFilter_caller<1 , T, D, BrdColReplicate>,
                linearColumnFilter_caller<2 , T, D, BrdColReplicate>,
                linearColumnFilter_caller<3 , T, D, BrdColReplicate>,
                linearColumnFilter_caller<4 , T, D, BrdColReplicate>,
                linearColumnFilter_caller<5 , T, D, BrdColReplicate>,
                linearColumnFilter_caller<6 , T, D, BrdColReplicate>,
                linearColumnFilter_caller<7 , T, D, BrdColReplicate>,
                linearColumnFilter_caller<8 , T, D, BrdColReplicate>,
                linearColumnFilter_caller<9 , T, D, BrdColReplicate>,
                linearColumnFilter_caller<10, T, D, BrdColReplicate>,
                linearColumnFilter_caller<11, T, D, BrdColReplicate>,
                linearColumnFilter_caller<12, T, D, BrdColReplicate>,
                linearColumnFilter_caller<13, T, D, BrdColReplicate>,
                linearColumnFilter_caller<14, T, D, BrdColReplicate>,
                linearColumnFilter_caller<15, T, D, BrdColReplicate>,
                linearColumnFilter_caller<16, T, D, BrdColReplicate>
            },
            {
                0,
                linearColumnFilter_caller<1 , T, D, BrdColConstant>,
                linearColumnFilter_caller<2 , T, D, BrdColConstant>,
                linearColumnFilter_caller<3 , T, D, BrdColConstant>,
                linearColumnFilter_caller<4 , T, D, BrdColConstant>,
                linearColumnFilter_caller<5 , T, D, BrdColConstant>,
                linearColumnFilter_caller<6 , T, D, BrdColConstant>,
                linearColumnFilter_caller<7 , T, D, BrdColConstant>,
                linearColumnFilter_caller<8 , T, D, BrdColConstant>,
                linearColumnFilter_caller<9 , T, D, BrdColConstant>,
                linearColumnFilter_caller<10, T, D, BrdColConstant>,
                linearColumnFilter_caller<11, T, D, BrdColConstant>,
                linearColumnFilter_caller<12, T, D, BrdColConstant>,
                linearColumnFilter_caller<13, T, D, BrdColConstant>,
                linearColumnFilter_caller<14, T, D, BrdColConstant>,
                linearColumnFilter_caller<15, T, D, BrdColConstant>,
                linearColumnFilter_caller<16, T, D, BrdColConstant>
            },
            {
                0,
                linearColumnFilter_caller<1 , T, D, BrdColReflect>,
                linearColumnFilter_caller<2 , T, D, BrdColReflect>,
                linearColumnFilter_caller<3 , T, D, BrdColReflect>,
                linearColumnFilter_caller<4 , T, D, BrdColReflect>,
                linearColumnFilter_caller<5 , T, D, BrdColReflect>,
                linearColumnFilter_caller<6 , T, D, BrdColReflect>,
                linearColumnFilter_caller<7 , T, D, BrdColReflect>,
                linearColumnFilter_caller<8 , T, D, BrdColReflect>,
                linearColumnFilter_caller<9 , T, D, BrdColReflect>,
                linearColumnFilter_caller<10, T, D, BrdColReflect>,
                linearColumnFilter_caller<11, T, D, BrdColReflect>,
                linearColumnFilter_caller<12, T, D, BrdColReflect>,
                linearColumnFilter_caller<13, T, D, BrdColReflect>,
                linearColumnFilter_caller<14, T, D, BrdColReflect>,
                linearColumnFilter_caller<15, T, D, BrdColReflect>,
                linearColumnFilter_caller<16, T, D, BrdColReflect>
            },
            {
                0,
                linearColumnFilter_caller<1 , T, D, BrdColWrap>,
                linearColumnFilter_caller<2 , T, D, BrdColWrap>,
                linearColumnFilter_caller<3 , T, D, BrdColWrap>,
                linearColumnFilter_caller<4 , T, D, BrdColWrap>,
                linearColumnFilter_caller<5 , T, D, BrdColWrap>,
                linearColumnFilter_caller<6 , T, D, BrdColWrap>,
                linearColumnFilter_caller<7 , T, D, BrdColWrap>,
                linearColumnFilter_caller<8 , T, D, BrdColWrap>,
                linearColumnFilter_caller<9 , T, D, BrdColWrap>,
                linearColumnFilter_caller<10, T, D, BrdColWrap>,
                linearColumnFilter_caller<11, T, D, BrdColWrap>,
                linearColumnFilter_caller<12, T, D, BrdColWrap>,
                linearColumnFilter_caller<13, T, D, BrdColWrap>,
                linearColumnFilter_caller<14, T, D, BrdColWrap>,
                linearColumnFilter_caller<15, T, D, BrdColWrap>,
                linearColumnFilter_caller<16, T, D, BrdColWrap>
            }
        };

        loadLinearKernel(kernel, ksize);

        callers[brd_type][ksize]((DevMem2D_<T>)src, (DevMem2D_<D>)dst, anchor, stream);
    }

    template void linearColumnFilter_gpu<float , uchar >(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
    template void linearColumnFilter_gpu<float4, uchar4>(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
    template void linearColumnFilter_gpu<float , short >(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
    template void linearColumnFilter_gpu<float2, short2>(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
    template void linearColumnFilter_gpu<float3, short3>(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
    template void linearColumnFilter_gpu<float , int   >(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
    template void linearColumnFilter_gpu<float , float >(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
}}}

/////////////////////////////////////////////////////////////////////////////////////////////////
// Bilateral filters

namespace bf_krnls
{
    __constant__ float* ctable_color;
    __constant__ float* ctable_space;
    __constant__ size_t ctable_space_step;

    __constant__ int cndisp;
    __constant__ int cradius;

    __constant__ short cedge_disc;
    __constant__ short cmax_disc;
}

namespace cv { namespace gpu { namespace bf
{
    void load_constants(float* table_color, const DevMem2Df& table_space, int ndisp, int radius, short edge_disc, short max_disc)
    {
        cudaSafeCall( cudaMemcpyToSymbol(bf_krnls::ctable_color, &table_color, sizeof(table_color)) );
        cudaSafeCall( cudaMemcpyToSymbol(bf_krnls::ctable_space, &table_space.data, sizeof(table_space.data)) );
        size_t table_space_step = table_space.step / sizeof(float);
        cudaSafeCall( cudaMemcpyToSymbol(bf_krnls::ctable_space_step, &table_space_step, sizeof(size_t)) );

        cudaSafeCall( cudaMemcpyToSymbol(bf_krnls::cndisp, &ndisp, sizeof(int)) );
        cudaSafeCall( cudaMemcpyToSymbol(bf_krnls::cradius, &radius, sizeof(int)) );

        cudaSafeCall( cudaMemcpyToSymbol(bf_krnls::cedge_disc, &edge_disc, sizeof(short)) );
        cudaSafeCall( cudaMemcpyToSymbol(bf_krnls::cmax_disc, &max_disc, sizeof(short)) );
    }
}}}
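
// Editor's sketch (assumption): a plausible host-side construction of the lookup
// tables whose device pointers are bound above; the real producer lives in the
// host wrapper and may differ in normalization.
static void buildBilateralTables_sketch(float* table_color /*256 entries*/,
                                        float* table_space, size_t space_step, int radius,
                                        float sigma_color, float sigma_space)
{
    for (int i = 0; i < 256; ++i)
        table_color[i] = expf(-float(i * i) / (2.0f * sigma_color * sigma_color));

    for (int dy = 0; dy <= radius; ++dy)
        for (int dx = 0; dx <= radius; ++dx)
            table_space[dy * space_step + dx] = expf(-float(dy * dy + dx * dx) / (2.0f * sigma_space * sigma_space));
}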

namespace bf_krnls
{
    template <int channels>
    struct DistRgbMax
    {
        static __device__ __forceinline__ uchar calc(const uchar* a, const uchar* b)
        {
            uchar x = abs(a[0] - b[0]);
            uchar y = abs(a[1] - b[1]);
            uchar z = abs(a[2] - b[2]);
            return (max(max(x, y), z));
        }
    };

    template <>
    struct DistRgbMax<1>
    {
        static __device__ __forceinline__ uchar calc(const uchar* a, const uchar* b)
        {
            return abs(a[0] - b[0]);
        }
    };

    template <int channels, typename T>
    __global__ void bilateral_filter(int t, T* disp, size_t disp_step, const uchar* img, size_t img_step, int h, int w)
    {
        const int y = blockIdx.y * blockDim.y + threadIdx.y;
        const int x = ((blockIdx.x * blockDim.x + threadIdx.x) << 1) + ((y + t) & 1);

        T dp[5];

        if (y > 0 && y < h - 1 && x > 0 && x < w - 1)
        {
            dp[0] = *(disp + (y    ) * disp_step + x + 0);
            dp[1] = *(disp + (y - 1) * disp_step + x + 0);
            dp[2] = *(disp + (y    ) * disp_step + x - 1);
            dp[3] = *(disp + (y + 1) * disp_step + x + 0);
            dp[4] = *(disp + (y    ) * disp_step + x + 1);

            if (abs(dp[1] - dp[0]) >= cedge_disc || abs(dp[2] - dp[0]) >= cedge_disc || abs(dp[3] - dp[0]) >= cedge_disc || abs(dp[4] - dp[0]) >= cedge_disc)
            {
                const int ymin = max(0, y - cradius);
                const int xmin = max(0, x - cradius);
                const int ymax = min(h - 1, y + cradius);
                const int xmax = min(w - 1, x + cradius);

                float cost[] = {0.0f, 0.0f, 0.0f, 0.0f, 0.0f};

                const uchar* ic = img + y * img_step + channels * x;

                for (int yi = ymin; yi <= ymax; yi++)
                {
                    const T* disp_y = disp + yi * disp_step;

                    for (int xi = xmin; xi <= xmax; xi++)
                    {
                        const uchar* in = img + yi * img_step + channels * xi;

                        uchar dist_rgb = DistRgbMax<channels>::calc(in, ic);

                        const float weight = ctable_color[dist_rgb] * (ctable_space + abs(y - yi) * ctable_space_step)[abs(x - xi)];

                        const T disp_reg = disp_y[xi];

                        cost[0] += min(cmax_disc, abs(disp_reg - dp[0])) * weight;
                        cost[1] += min(cmax_disc, abs(disp_reg - dp[1])) * weight;
                        cost[2] += min(cmax_disc, abs(disp_reg - dp[2])) * weight;
                        cost[3] += min(cmax_disc, abs(disp_reg - dp[3])) * weight;
                        cost[4] += min(cmax_disc, abs(disp_reg - dp[4])) * weight;
                    }
                }

                // pick the neighbour hypothesis with the minimal aggregated cost
                float minimum = cost[0];
                int id = 0;

                for (int i = 1; i < 5; ++i)
                {
                    if (cost[i] < minimum)
                    {
                        minimum = cost[i];
                        id = i;
                    }
                }

                *(disp + y * disp_step + x) = dp[id];
            }
        }
    }
}

namespace cv { namespace gpu { namespace bf
{
    template <typename T>
    void bilateral_filter_caller(const DevMem2D_<T>& disp, const DevMem2D& img, int channels, int iters, cudaStream_t stream)
    {
        dim3 threads(32, 8, 1);
        dim3 grid(1, 1, 1);
        grid.x = divUp(disp.cols, threads.x << 1);
        grid.y = divUp(disp.rows, threads.y);

        switch (channels)
        {
        case 1:
            for (int i = 0; i < iters; ++i)
            {
                bf_krnls::bilateral_filter<1><<<grid, threads, 0, stream>>>(0, disp.data, disp.step/sizeof(T), img.data, img.step, disp.rows, disp.cols);
                cudaSafeCall( cudaGetLastError() );
                bf_krnls::bilateral_filter<1><<<grid, threads, 0, stream>>>(1, disp.data, disp.step/sizeof(T), img.data, img.step, disp.rows, disp.cols);
                cudaSafeCall( cudaGetLastError() );
            }
            break;
        case 3:
            for (int i = 0; i < iters; ++i)
            {
                bf_krnls::bilateral_filter<3><<<grid, threads, 0, stream>>>(0, disp.data, disp.step/sizeof(T), img.data, img.step, disp.rows, disp.cols);
                cudaSafeCall( cudaGetLastError() );
                bf_krnls::bilateral_filter<3><<<grid, threads, 0, stream>>>(1, disp.data, disp.step/sizeof(T), img.data, img.step, disp.rows, disp.cols);
                cudaSafeCall( cudaGetLastError() );
            }
            break;
        default:
            cv::gpu::error("Unsupported channels count", __FILE__, __LINE__);
        }

        // synchronize only for the blocking NULL-stream call, matching the other
        // callers in this module (the original condition was inverted)
        if (stream == 0)
            cudaSafeCall( cudaDeviceSynchronize() );
    }
|
||||
|
||||
void bilateral_filter_gpu(const DevMem2D& disp, const DevMem2D& img, int channels, int iters, cudaStream_t stream)
|
||||
{
|
||||
bilateral_filter_caller(disp, img, channels, iters, stream);
|
||||
}
|
||||
|
||||
void bilateral_filter_gpu(const DevMem2D_<short>& disp, const DevMem2D& img, int channels, int iters, cudaStream_t stream)
|
||||
{
|
||||
bilateral_filter_caller(disp, img, channels, iters, stream);
|
||||
}
|
||||
}}}
|
@ -41,433 +41,16 @@
//M*/

#include "internal_shared.hpp"
#include "opencv2/gpu/device/border_interpolate.hpp"
#include "opencv2/gpu/device/vec_traits.hpp"
#include "opencv2/gpu/device/vec_math.hpp"
#include "opencv2/gpu/device/saturate_cast.hpp"
#include "opencv2/gpu/device/filters.hpp"

using namespace cv::gpu;
using namespace cv::gpu::device;

/////////////////////////////////// Remap ///////////////////////////////////////////////
namespace cv { namespace gpu { namespace imgproc
{
    template <typename Ptr2D, typename T> __global__ void remap(const Ptr2D src, const PtrStepf mapx, const PtrStepf mapy, DevMem2D_<T> dst)
    {
        const int x = blockDim.x * blockIdx.x + threadIdx.x;
        const int y = blockDim.y * blockIdx.y + threadIdx.y;

        if (x < dst.cols && y < dst.rows)
        {
            const float xcoo = mapx.ptr(y)[x];
            const float ycoo = mapy.ptr(y)[x];

            dst.ptr(y)[x] = saturate_cast<T>(src(ycoo, xcoo));
        }
    }
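    // remap is a pure gather: for every destination pixel (x, y) it samples the source
    // at the floating-point coordinates (mapx(x, y), mapy(x, y)); interpolation and
    // border handling come from the Ptr2D functor (a Filter wrapped around a BorderReader).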

    template <template <typename> class Filter, template <typename> class B, typename T> struct RemapDispatcherStream
    {
        static void call(const DevMem2D_<T>& src, const DevMem2Df& mapx, const DevMem2Df& mapy, const DevMem2D_<T>& dst, const float* borderValue, cudaStream_t stream)
        {
            typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type work_type;

            dim3 block(32, 8);
            dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));

            B<work_type> brd(src.rows, src.cols, VecTraits<work_type>::make(borderValue));
            BorderReader< PtrStep_<T>, B<work_type> > brdSrc(src, brd);
            Filter< BorderReader< PtrStep_<T>, B<work_type> > > filter_src(brdSrc);

            remap<<<grid, block, 0, stream>>>(filter_src, mapx, mapy, dst);
            cudaSafeCall( cudaGetLastError() );
        }
    };

    template <template <typename> class Filter, template <typename> class B, typename T> struct RemapDispatcherNonStream
    {
        static void call(const DevMem2D_<T>& src, const DevMem2Df& mapx, const DevMem2Df& mapy, const DevMem2D_<T>& dst, const float* borderValue)
        {
            typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type work_type;

            dim3 block(32, 8);
            dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));

            B<work_type> brd(src.rows, src.cols, VecTraits<work_type>::make(borderValue));
            BorderReader< PtrStep_<T>, B<work_type> > brdSrc(src, brd);
            Filter< BorderReader< PtrStep_<T>, B<work_type> > > filter_src(brdSrc);

            remap<<<grid, block>>>(filter_src, mapx, mapy, dst);
            cudaSafeCall( cudaGetLastError() );

            cudaSafeCall( cudaDeviceSynchronize() );
        }
    };

    #define OPENCV_GPU_IMPLEMENT_REMAP_TEX(type) \
        texture< type , cudaTextureType2D> tex_remap_ ## type (0, cudaFilterModePoint, cudaAddressModeClamp); \
        struct tex_remap_ ## type ## _reader \
        { \
            typedef type elem_type; \
            typedef int index_type; \
            __device__ __forceinline__ elem_type operator ()(index_type y, index_type x) const \
            { \
                return tex2D(tex_remap_ ## type , x, y); \
            } \
        }; \
        template <template <typename> class Filter, template <typename> class B> struct RemapDispatcherNonStream<Filter, B, type> \
        { \
            static void call(const DevMem2D_< type >& src, const DevMem2Df& mapx, const DevMem2Df& mapy, const DevMem2D_< type >& dst, const float* borderValue) \
            { \
                typedef typename TypeVec<float, VecTraits< type >::cn>::vec_type work_type; \
                dim3 block(32, 8); \
                dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y)); \
                TextureBinder texHandler(&tex_remap_ ## type , src); \
                tex_remap_ ## type ##_reader texSrc; \
                B<work_type> brd(src.rows, src.cols, VecTraits<work_type>::make(borderValue)); \
                BorderReader< tex_remap_ ## type ##_reader, B<work_type> > brdSrc(texSrc, brd); \
                Filter< BorderReader< tex_remap_ ## type ##_reader, B<work_type> > > filter_src(brdSrc); \
                remap<<<grid, block>>>(filter_src, mapx, mapy, dst); \
                cudaSafeCall( cudaGetLastError() ); \
                cudaSafeCall( cudaDeviceSynchronize() ); \
            } \
        }; \
        template <template <typename> class Filter> struct RemapDispatcherNonStream<Filter, BrdReplicate, type> \
        { \
            static void call(const DevMem2D_< type >& src, const DevMem2Df& mapx, const DevMem2Df& mapy, const DevMem2D_< type >& dst, const float*) \
            { \
                dim3 block(32, 8); \
                dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y)); \
                TextureBinder texHandler(&tex_remap_ ## type , src); \
                tex_remap_ ## type ##_reader texSrc; \
                Filter< tex_remap_ ## type ##_reader > filter_src(texSrc); \
                remap<<<grid, block>>>(filter_src, mapx, mapy, dst); \
                cudaSafeCall( cudaGetLastError() ); \
                cudaSafeCall( cudaDeviceSynchronize() ); \
            } \
        };

    OPENCV_GPU_IMPLEMENT_REMAP_TEX(uchar)
    OPENCV_GPU_IMPLEMENT_REMAP_TEX(uchar2)
    OPENCV_GPU_IMPLEMENT_REMAP_TEX(uchar4)

    OPENCV_GPU_IMPLEMENT_REMAP_TEX(schar)
    OPENCV_GPU_IMPLEMENT_REMAP_TEX(char2)
    OPENCV_GPU_IMPLEMENT_REMAP_TEX(char4)

    OPENCV_GPU_IMPLEMENT_REMAP_TEX(ushort)
    OPENCV_GPU_IMPLEMENT_REMAP_TEX(ushort2)
    OPENCV_GPU_IMPLEMENT_REMAP_TEX(ushort4)

    OPENCV_GPU_IMPLEMENT_REMAP_TEX(short)
    OPENCV_GPU_IMPLEMENT_REMAP_TEX(short2)
    OPENCV_GPU_IMPLEMENT_REMAP_TEX(short4)

    OPENCV_GPU_IMPLEMENT_REMAP_TEX(int)
    OPENCV_GPU_IMPLEMENT_REMAP_TEX(int2)
    OPENCV_GPU_IMPLEMENT_REMAP_TEX(int4)

    OPENCV_GPU_IMPLEMENT_REMAP_TEX(float)
    OPENCV_GPU_IMPLEMENT_REMAP_TEX(float2)
    OPENCV_GPU_IMPLEMENT_REMAP_TEX(float4)

    #undef OPENCV_GPU_IMPLEMENT_REMAP_TEX
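    // The macro above specializes the non-stream path to read the source through a 2D
    // texture (TextureBinder keeps it bound for the duration of the call). 3-channel
    // element types are skipped because CUDA textures support only 1-, 2- and
    // 4-component elements, so those types fall back to the generic dispatcher.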

    template <template <typename> class Filter, template <typename> class B, typename T> struct RemapDispatcher
    {
        static void call(const DevMem2D_<T>& src, const DevMem2Df& mapx, const DevMem2Df& mapy, const DevMem2D_<T>& dst, const float* borderValue, cudaStream_t stream)
        {
            if (stream == 0)
                RemapDispatcherNonStream<Filter, B, T>::call(src, mapx, mapy, dst, borderValue);
            else
                RemapDispatcherStream<Filter, B, T>::call(src, mapx, mapy, dst, borderValue, stream);
        }
    };

    template <typename T> void remap_gpu(const DevMem2D& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2D& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream)
    {
        typedef void (*caller_t)(const DevMem2D_<T>& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2D_<T>& dst, const float* borderValue, cudaStream_t stream);

        static const caller_t callers[3][5] =
        {
            {
                RemapDispatcher<PointFilter, BrdReflect101, T>::call,
                RemapDispatcher<PointFilter, BrdReplicate, T>::call,
                RemapDispatcher<PointFilter, BrdConstant, T>::call,
                RemapDispatcher<PointFilter, BrdReflect, T>::call,
                RemapDispatcher<PointFilter, BrdWrap, T>::call
            },
            {
                RemapDispatcher<LinearFilter, BrdReflect101, T>::call,
                RemapDispatcher<LinearFilter, BrdReplicate, T>::call,
                RemapDispatcher<LinearFilter, BrdConstant, T>::call,
                RemapDispatcher<LinearFilter, BrdReflect, T>::call,
                RemapDispatcher<LinearFilter, BrdWrap, T>::call
            },
            {
                RemapDispatcher<CubicFilter, BrdReflect101, T>::call,
                RemapDispatcher<CubicFilter, BrdReplicate, T>::call,
                RemapDispatcher<CubicFilter, BrdConstant, T>::call,
                RemapDispatcher<CubicFilter, BrdReflect, T>::call,
                RemapDispatcher<CubicFilter, BrdWrap, T>::call
            }
        };
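        // The table above is indexed [interpolation][borderMode]: rows follow
        // INTER_NEAREST, INTER_LINEAR, INTER_CUBIC; columns follow the gpu module's
        // internal border indices (reflect101, replicate, constant, reflect, wrap).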

        callers[interpolation][borderMode](static_cast< DevMem2D_<T> >(src), xmap, ymap, static_cast< DevMem2D_<T> >(dst), borderValue, stream);
    }

    template void remap_gpu<uchar >(const DevMem2D& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2D& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream);
    template void remap_gpu<uchar2>(const DevMem2D& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2D& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream);
    template void remap_gpu<uchar3>(const DevMem2D& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2D& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream);
    template void remap_gpu<uchar4>(const DevMem2D& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2D& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream);

    template void remap_gpu<schar>(const DevMem2D& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2D& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream);
    template void remap_gpu<char2>(const DevMem2D& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2D& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream);
    template void remap_gpu<char3>(const DevMem2D& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2D& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream);
    template void remap_gpu<char4>(const DevMem2D& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2D& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream);

    template void remap_gpu<ushort >(const DevMem2D& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2D& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream);
    template void remap_gpu<ushort2>(const DevMem2D& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2D& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream);
    template void remap_gpu<ushort3>(const DevMem2D& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2D& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream);
    template void remap_gpu<ushort4>(const DevMem2D& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2D& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream);

    template void remap_gpu<short >(const DevMem2D& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2D& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream);
    template void remap_gpu<short2>(const DevMem2D& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2D& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream);
    template void remap_gpu<short3>(const DevMem2D& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2D& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream);
    template void remap_gpu<short4>(const DevMem2D& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2D& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream);

    template void remap_gpu<uint >(const DevMem2D& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2D& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream);
    template void remap_gpu<uint2>(const DevMem2D& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2D& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream);
    template void remap_gpu<uint3>(const DevMem2D& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2D& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream);
    template void remap_gpu<uint4>(const DevMem2D& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2D& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream);

    template void remap_gpu<int >(const DevMem2D& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2D& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream);
    template void remap_gpu<int2>(const DevMem2D& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2D& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream);
    template void remap_gpu<int3>(const DevMem2D& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2D& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream);
    template void remap_gpu<int4>(const DevMem2D& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2D& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream);

    template void remap_gpu<float >(const DevMem2D& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2D& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream);
    template void remap_gpu<float2>(const DevMem2D& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2D& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream);
    template void remap_gpu<float3>(const DevMem2D& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2D& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream);
    template void remap_gpu<float4>(const DevMem2D& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2D& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream);

    /////////////////////////////////// Resize ///////////////////////////////////////////////

    template <typename Ptr2D, typename T> __global__ void resize(const Ptr2D src, float fx, float fy, DevMem2D_<T> dst)
    {
        const int x = blockDim.x * blockIdx.x + threadIdx.x;
        const int y = blockDim.y * blockIdx.y + threadIdx.y;

        if (x < dst.cols && y < dst.rows)
        {
            const float xcoo = x / fx;
            const float ycoo = y / fy;

            dst.ptr(y)[x] = saturate_cast<T>(src(ycoo, xcoo));
        }
    }
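    // Both resize kernels map each destination pixel back into the source with the
    // inverse scale factors (x / fx, y / fy); the kernel above delegates interpolation
    // to its Filter functor, while resizeNN below simply rounds the coordinates down.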
    template <typename Ptr2D, typename T> __global__ void resizeNN(const Ptr2D src, float fx, float fy, DevMem2D_<T> dst)
    {
        const int x = blockDim.x * blockIdx.x + threadIdx.x;
        const int y = blockDim.y * blockIdx.y + threadIdx.y;

        if (x < dst.cols && y < dst.rows)
        {
            const float xcoo = x / fx;
            const float ycoo = y / fy;

            dst.ptr(y)[x] = src(__float2int_rd(ycoo), __float2int_rd(xcoo));
        }
    }

    template <template <typename> class Filter, typename T> struct ResizeDispatcherStream
    {
        static void call(const DevMem2D_<T>& src, float fx, float fy, const DevMem2D_<T>& dst, cudaStream_t stream)
        {
            dim3 block(32, 8);
            dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));

            BrdReplicate<T> brd(src.rows, src.cols);
            BorderReader< PtrStep_<T>, BrdReplicate<T> > brdSrc(src, brd);
            Filter< BorderReader< PtrStep_<T>, BrdReplicate<T> > > filter_src(brdSrc);

            resize<<<grid, block, 0, stream>>>(filter_src, fx, fy, dst);
            cudaSafeCall( cudaGetLastError() );
        }
    };
    template <typename T> struct ResizeDispatcherStream<PointFilter, T>
    {
        static void call(const DevMem2D_<T>& src, float fx, float fy, const DevMem2D_<T>& dst, cudaStream_t stream)
        {
            dim3 block(32, 8);
            dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));

            BrdReplicate<T> brd(src.rows, src.cols);
            BorderReader< PtrStep_<T>, BrdReplicate<T> > brdSrc(src, brd);

            resizeNN<<<grid, block, 0, stream>>>(brdSrc, fx, fy, dst);
            cudaSafeCall( cudaGetLastError() );
        }
    };
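    // The PointFilter specializations bypass the Filter functor entirely:
    // nearest-neighbour needs no weighted neighbourhood, so resizeNN reads straight
    // through the BorderReader (or the raw texture in the non-stream case below).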

    template <template <typename> class Filter, typename T> struct ResizeDispatcherNonStream
    {
        static void call(const DevMem2D_<T>& src, float fx, float fy, const DevMem2D_<T>& dst)
        {
            dim3 block(32, 8);
            dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));

            BrdReplicate<T> brd(src.rows, src.cols);
            BorderReader< PtrStep_<T>, BrdReplicate<T> > brdSrc(src, brd);
            Filter< BorderReader< PtrStep_<T>, BrdReplicate<T> > > filter_src(brdSrc);

            resize<<<grid, block>>>(filter_src, fx, fy, dst);
            cudaSafeCall( cudaGetLastError() );

            cudaSafeCall( cudaDeviceSynchronize() );
        }
    };
    template <typename T> struct ResizeDispatcherNonStream<PointFilter, T>
    {
        static void call(const DevMem2D_<T>& src, float fx, float fy, const DevMem2D_<T>& dst)
        {
            dim3 block(32, 8);
            dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));

            BrdReplicate<T> brd(src.rows, src.cols);
            BorderReader< PtrStep_<T>, BrdReplicate<T> > brdSrc(src, brd);

            resizeNN<<<grid, block>>>(brdSrc, fx, fy, dst);
            cudaSafeCall( cudaGetLastError() );

            cudaSafeCall( cudaDeviceSynchronize() );
        }
    };

    #define OPENCV_GPU_IMPLEMENT_RESIZE_TEX(type) \
        texture< type , cudaTextureType2D> tex_resize_ ## type (0, cudaFilterModePoint, cudaAddressModeClamp); \
        struct tex_resize_ ## type ## _reader \
        { \
            typedef type elem_type; \
            typedef int index_type; \
            __device__ __forceinline__ elem_type operator ()(index_type y, index_type x) const \
            { \
                return tex2D(tex_resize_ ## type , x, y); \
            } \
        }; \
        template <template <typename> class Filter> struct ResizeDispatcherNonStream<Filter, type> \
        { \
            static void call(const DevMem2D_< type >& src, float fx, float fy, const DevMem2D_< type >& dst) \
            { \
                dim3 block(32, 8); \
                dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y)); \
                TextureBinder texHandler(&tex_resize_ ## type , src); \
                tex_resize_ ## type ##_reader texSrc; \
                Filter< tex_resize_ ## type ##_reader > filter_src(texSrc); \
                resize<<<grid, block>>>(filter_src, fx, fy, dst); \
                cudaSafeCall( cudaGetLastError() ); \
                cudaSafeCall( cudaDeviceSynchronize() ); \
            } \
        }; \
        template <> struct ResizeDispatcherNonStream<PointFilter, type> \
        { \
            static void call(const DevMem2D_< type >& src, float fx, float fy, const DevMem2D_< type >& dst) \
            { \
                dim3 block(32, 8); \
                dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y)); \
                TextureBinder texHandler(&tex_resize_ ## type , src); \
                tex_resize_ ## type ##_reader texSrc; \
                resizeNN<<<grid, block>>>(texSrc, fx, fy, dst); \
                cudaSafeCall( cudaGetLastError() ); \
                cudaSafeCall( cudaDeviceSynchronize() ); \
            } \
        };

    OPENCV_GPU_IMPLEMENT_RESIZE_TEX(uchar)
    OPENCV_GPU_IMPLEMENT_RESIZE_TEX(uchar2)
    OPENCV_GPU_IMPLEMENT_RESIZE_TEX(uchar4)

    OPENCV_GPU_IMPLEMENT_RESIZE_TEX(schar)
    OPENCV_GPU_IMPLEMENT_RESIZE_TEX(char2)
    OPENCV_GPU_IMPLEMENT_RESIZE_TEX(char4)

    OPENCV_GPU_IMPLEMENT_RESIZE_TEX(ushort)
    OPENCV_GPU_IMPLEMENT_RESIZE_TEX(ushort2)
    OPENCV_GPU_IMPLEMENT_RESIZE_TEX(ushort4)

    OPENCV_GPU_IMPLEMENT_RESIZE_TEX(short)
    OPENCV_GPU_IMPLEMENT_RESIZE_TEX(short2)
    OPENCV_GPU_IMPLEMENT_RESIZE_TEX(short4)

    OPENCV_GPU_IMPLEMENT_RESIZE_TEX(int)
    OPENCV_GPU_IMPLEMENT_RESIZE_TEX(int2)
    OPENCV_GPU_IMPLEMENT_RESIZE_TEX(int4)

    OPENCV_GPU_IMPLEMENT_RESIZE_TEX(float)
    OPENCV_GPU_IMPLEMENT_RESIZE_TEX(float2)
    OPENCV_GPU_IMPLEMENT_RESIZE_TEX(float4)

    #undef OPENCV_GPU_IMPLEMENT_RESIZE_TEX

    template <template <typename> class Filter, typename T> struct ResizeDispatcher
    {
        static void call(const DevMem2D_<T>& src, float fx, float fy, const DevMem2D_<T>& dst, cudaStream_t stream)
        {
            if (stream == 0)
                ResizeDispatcherNonStream<Filter, T>::call(src, fx, fy, dst);
            else
                ResizeDispatcherStream<Filter, T>::call(src, fx, fy, dst, stream);
        }
    };

    template <typename T> void resize_gpu(const DevMem2D& src, float fx, float fy, const DevMem2D& dst, int interpolation, cudaStream_t stream)
    {
        typedef void (*caller_t)(const DevMem2D_<T>& src, float fx, float fy, const DevMem2D_<T>& dst, cudaStream_t stream);

        static const caller_t callers[3] =
        {
            ResizeDispatcher<PointFilter, T>::call, ResizeDispatcher<LinearFilter, T>::call, ResizeDispatcher<CubicFilter, T>::call
        };
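        // The table above is indexed directly by interpolation:
        // INTER_NEAREST, INTER_LINEAR, INTER_CUBIC.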

        callers[interpolation](static_cast< DevMem2D_<T> >(src), fx, fy, static_cast< DevMem2D_<T> >(dst), stream);
    }

    template void resize_gpu<uchar >(const DevMem2D& src, float fx, float fy, const DevMem2D& dst, int interpolation, cudaStream_t stream);
    template void resize_gpu<uchar2>(const DevMem2D& src, float fx, float fy, const DevMem2D& dst, int interpolation, cudaStream_t stream);
    template void resize_gpu<uchar3>(const DevMem2D& src, float fx, float fy, const DevMem2D& dst, int interpolation, cudaStream_t stream);
    template void resize_gpu<uchar4>(const DevMem2D& src, float fx, float fy, const DevMem2D& dst, int interpolation, cudaStream_t stream);

    template void resize_gpu<schar>(const DevMem2D& src, float fx, float fy, const DevMem2D& dst, int interpolation, cudaStream_t stream);
    template void resize_gpu<char2>(const DevMem2D& src, float fx, float fy, const DevMem2D& dst, int interpolation, cudaStream_t stream);
    template void resize_gpu<char3>(const DevMem2D& src, float fx, float fy, const DevMem2D& dst, int interpolation, cudaStream_t stream);
    template void resize_gpu<char4>(const DevMem2D& src, float fx, float fy, const DevMem2D& dst, int interpolation, cudaStream_t stream);

    template void resize_gpu<ushort >(const DevMem2D& src, float fx, float fy, const DevMem2D& dst, int interpolation, cudaStream_t stream);
    template void resize_gpu<ushort2>(const DevMem2D& src, float fx, float fy, const DevMem2D& dst, int interpolation, cudaStream_t stream);
    template void resize_gpu<ushort3>(const DevMem2D& src, float fx, float fy, const DevMem2D& dst, int interpolation, cudaStream_t stream);
    template void resize_gpu<ushort4>(const DevMem2D& src, float fx, float fy, const DevMem2D& dst, int interpolation, cudaStream_t stream);

    template void resize_gpu<short >(const DevMem2D& src, float fx, float fy, const DevMem2D& dst, int interpolation, cudaStream_t stream);
    template void resize_gpu<short2>(const DevMem2D& src, float fx, float fy, const DevMem2D& dst, int interpolation, cudaStream_t stream);
    template void resize_gpu<short3>(const DevMem2D& src, float fx, float fy, const DevMem2D& dst, int interpolation, cudaStream_t stream);
    template void resize_gpu<short4>(const DevMem2D& src, float fx, float fy, const DevMem2D& dst, int interpolation, cudaStream_t stream);

    template void resize_gpu<uint >(const DevMem2D& src, float fx, float fy, const DevMem2D& dst, int interpolation, cudaStream_t stream);
    template void resize_gpu<uint2>(const DevMem2D& src, float fx, float fy, const DevMem2D& dst, int interpolation, cudaStream_t stream);
    template void resize_gpu<uint3>(const DevMem2D& src, float fx, float fy, const DevMem2D& dst, int interpolation, cudaStream_t stream);
    template void resize_gpu<uint4>(const DevMem2D& src, float fx, float fy, const DevMem2D& dst, int interpolation, cudaStream_t stream);

    template void resize_gpu<int >(const DevMem2D& src, float fx, float fy, const DevMem2D& dst, int interpolation, cudaStream_t stream);
    template void resize_gpu<int2>(const DevMem2D& src, float fx, float fy, const DevMem2D& dst, int interpolation, cudaStream_t stream);
    template void resize_gpu<int3>(const DevMem2D& src, float fx, float fy, const DevMem2D& dst, int interpolation, cudaStream_t stream);
    template void resize_gpu<int4>(const DevMem2D& src, float fx, float fy, const DevMem2D& dst, int interpolation, cudaStream_t stream);

    template void resize_gpu<float >(const DevMem2D& src, float fx, float fy, const DevMem2D& dst, int interpolation, cudaStream_t stream);
    template void resize_gpu<float2>(const DevMem2D& src, float fx, float fy, const DevMem2D& dst, int interpolation, cudaStream_t stream);
    template void resize_gpu<float3>(const DevMem2D& src, float fx, float fy, const DevMem2D& dst, int interpolation, cudaStream_t stream);
    template void resize_gpu<float4>(const DevMem2D& src, float fx, float fy, const DevMem2D& dst, int interpolation, cudaStream_t stream);

    /////////////////////////////////// MeanShiftFiltering ///////////////////////////////////////////////

    texture<uchar4, 2> tex_meanshift;
@ -1199,363 +782,7 @@ namespace cv { namespace gpu { namespace imgproc
        cudaSafeCall( cudaGetLastError() );

        cudaSafeCall( cudaDeviceSynchronize() );
    }


    /////////////////////////////////////////////////////////////////////////
    // downsample

    template <typename T, int cn>
    __global__ void downsampleKernel(const PtrStep_<T> src, DevMem2D_<T> dst)
    {
        int x = blockIdx.x * blockDim.x + threadIdx.x;
        int y = blockIdx.y * blockDim.y + threadIdx.y;

        if (x < dst.cols && y < dst.rows)
        {
            int ch_x = x / cn;
            dst.ptr(y)[x] = src.ptr(y*2)[ch_x*2*cn + x - ch_x*cn];
        }
    }
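    // In this kernel x indexes individual channel elements, not pixels: ch_x = x / cn is
    // the pixel column and (x - ch_x*cn) the channel offset, so each destination element
    // copies channel c of source pixel (2*ch_x, 2*y) -- every even row and even column.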


    template <typename T, int cn>
    void downsampleCaller(const DevMem2D src, DevMem2D dst, cudaStream_t stream)
    {
        dim3 threads(32, 8);
        dim3 grid(divUp(dst.cols, threads.x), divUp(dst.rows, threads.y));

        downsampleKernel<T,cn><<<grid, threads, 0, stream>>>(DevMem2D_<T>(src), DevMem2D_<T>(dst));
        cudaSafeCall(cudaGetLastError());

        if (stream == 0)
            cudaSafeCall(cudaDeviceSynchronize());
    }


    template void downsampleCaller<uchar,1>(const DevMem2D src, DevMem2D dst, cudaStream_t stream);
    template void downsampleCaller<uchar,2>(const DevMem2D src, DevMem2D dst, cudaStream_t stream);
    template void downsampleCaller<uchar,3>(const DevMem2D src, DevMem2D dst, cudaStream_t stream);
    template void downsampleCaller<uchar,4>(const DevMem2D src, DevMem2D dst, cudaStream_t stream);
    template void downsampleCaller<short,1>(const DevMem2D src, DevMem2D dst, cudaStream_t stream);
    template void downsampleCaller<short,2>(const DevMem2D src, DevMem2D dst, cudaStream_t stream);
    template void downsampleCaller<short,3>(const DevMem2D src, DevMem2D dst, cudaStream_t stream);
    template void downsampleCaller<short,4>(const DevMem2D src, DevMem2D dst, cudaStream_t stream);
    template void downsampleCaller<float,1>(const DevMem2D src, DevMem2D dst, cudaStream_t stream);
    template void downsampleCaller<float,2>(const DevMem2D src, DevMem2D dst, cudaStream_t stream);
    template void downsampleCaller<float,3>(const DevMem2D src, DevMem2D dst, cudaStream_t stream);
    template void downsampleCaller<float,4>(const DevMem2D src, DevMem2D dst, cudaStream_t stream);


    //////////////////////////////////////////////////////////////////////////
    // upsample

    template <typename T, int cn>
    __global__ void upsampleKernel(const PtrStep_<T> src, DevMem2D_<T> dst)
    {
        int x = blockIdx.x * blockDim.x + threadIdx.x;
        int y = blockIdx.y * blockDim.y + threadIdx.y;

        if (x < dst.cols && y < dst.rows)
        {
            int ch_x = x / cn;
            T val = ((ch_x & 1) || (y & 1)) ? 0 : src.ptr(y/2)[ch_x/2*cn + x - ch_x*cn];
            dst.ptr(y)[x] = val;
        }
    }
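    // Zero-insertion upsampling: destination pixels in an odd row or odd pixel column
    // are set to 0; the rest copy the source pixel at half the coordinates, channel by
    // channel.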


    template <typename T, int cn>
    void upsampleCaller(const DevMem2D src, DevMem2D dst, cudaStream_t stream)
    {
        dim3 threads(32, 8);
        dim3 grid(divUp(dst.cols, threads.x), divUp(dst.rows, threads.y));

        upsampleKernel<T,cn><<<grid, threads, 0, stream>>>(DevMem2D_<T>(src), DevMem2D_<T>(dst));
        cudaSafeCall(cudaGetLastError());

        if (stream == 0)
            cudaSafeCall(cudaDeviceSynchronize());
    }


    template void upsampleCaller<uchar,1>(const DevMem2D src, DevMem2D dst, cudaStream_t stream);
    template void upsampleCaller<uchar,2>(const DevMem2D src, DevMem2D dst, cudaStream_t stream);
    template void upsampleCaller<uchar,3>(const DevMem2D src, DevMem2D dst, cudaStream_t stream);
    template void upsampleCaller<uchar,4>(const DevMem2D src, DevMem2D dst, cudaStream_t stream);
    template void upsampleCaller<short,1>(const DevMem2D src, DevMem2D dst, cudaStream_t stream);
    template void upsampleCaller<short,2>(const DevMem2D src, DevMem2D dst, cudaStream_t stream);
    template void upsampleCaller<short,3>(const DevMem2D src, DevMem2D dst, cudaStream_t stream);
    template void upsampleCaller<short,4>(const DevMem2D src, DevMem2D dst, cudaStream_t stream);
    template void upsampleCaller<float,1>(const DevMem2D src, DevMem2D dst, cudaStream_t stream);
    template void upsampleCaller<float,2>(const DevMem2D src, DevMem2D dst, cudaStream_t stream);
    template void upsampleCaller<float,3>(const DevMem2D src, DevMem2D dst, cudaStream_t stream);
    template void upsampleCaller<float,4>(const DevMem2D src, DevMem2D dst, cudaStream_t stream);

    //////////////////////////////////////////////////////////////////////////
    // pyrDown

    template <typename T, typename B> __global__ void pyrDown(const PtrStep_<T> src, PtrStep_<T> dst, const B b, int dst_cols)
    {
        typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type value_type;

        const int x = blockIdx.x * blockDim.x + threadIdx.x;
        const int y = blockIdx.y;

        __shared__ value_type smem[256 + 4];
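        // Separable 5-tap Gaussian (1 4 6 4 1)/16: each thread first filters its source
        // column vertically at row 2*y, staging the result in smem with a 2-element halo
        // on each side; after the barrier, the first 128 threads filter smem horizontally
        // and each writes a single destination pixel.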

        value_type sum;

        const int src_y = 2*y;

        sum = VecTraits<value_type>::all(0);

        sum = sum + 0.0625f * b.at(src_y - 2, x, src.data, src.step);
        sum = sum + 0.25f   * b.at(src_y - 1, x, src.data, src.step);
        sum = sum + 0.375f  * b.at(src_y    , x, src.data, src.step);
        sum = sum + 0.25f   * b.at(src_y + 1, x, src.data, src.step);
        sum = sum + 0.0625f * b.at(src_y + 2, x, src.data, src.step);

        smem[2 + threadIdx.x] = sum;

        if (threadIdx.x < 2)
        {
            const int left_x = x - 2;

            sum = VecTraits<value_type>::all(0);

            sum = sum + 0.0625f * b.at(src_y - 2, left_x, src.data, src.step);
            sum = sum + 0.25f   * b.at(src_y - 1, left_x, src.data, src.step);
            sum = sum + 0.375f  * b.at(src_y    , left_x, src.data, src.step);
            sum = sum + 0.25f   * b.at(src_y + 1, left_x, src.data, src.step);
            sum = sum + 0.0625f * b.at(src_y + 2, left_x, src.data, src.step);

            smem[threadIdx.x] = sum;
        }

        if (threadIdx.x > 253)
        {
            const int right_x = x + 2;

            sum = VecTraits<value_type>::all(0);

            sum = sum + 0.0625f * b.at(src_y - 2, right_x, src.data, src.step);
            sum = sum + 0.25f   * b.at(src_y - 1, right_x, src.data, src.step);
            sum = sum + 0.375f  * b.at(src_y    , right_x, src.data, src.step);
            sum = sum + 0.25f   * b.at(src_y + 1, right_x, src.data, src.step);
            sum = sum + 0.0625f * b.at(src_y + 2, right_x, src.data, src.step);

            smem[4 + threadIdx.x] = sum;
        }

        __syncthreads();

        if (threadIdx.x < 128)
        {
            const int tid2 = threadIdx.x * 2;

            sum = VecTraits<value_type>::all(0);

            sum = sum + 0.0625f * smem[2 + tid2 - 2];
            sum = sum + 0.25f   * smem[2 + tid2 - 1];
            sum = sum + 0.375f  * smem[2 + tid2    ];
            sum = sum + 0.25f   * smem[2 + tid2 + 1];
            sum = sum + 0.0625f * smem[2 + tid2 + 2];

            const int dst_x = (blockIdx.x * blockDim.x + tid2) / 2;

            if (dst_x < dst_cols)
                dst.ptr(y)[dst_x] = saturate_cast<T>(sum);
        }
    }

    template <typename T, template <typename> class B> void pyrDown_caller(const DevMem2D_<T>& src, const DevMem2D_<T>& dst, cudaStream_t stream)
    {
        const dim3 block(256);
        const dim3 grid(divUp(src.cols, block.x), dst.rows);
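        // A 256-thread block reads 256 source columns and writes the 128 corresponding
        // destination columns; grid.y walks destination rows directly (row y of dst
        // reads source row 2*y).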

        B<T> b(src.rows, src.cols);

        pyrDown<T><<<grid, block, 0, stream>>>(src, dst, b, dst.cols);
        cudaSafeCall( cudaGetLastError() );

        if (stream == 0)
            cudaSafeCall( cudaDeviceSynchronize() );
    }

    template <typename T, int cn> void pyrDown_gpu(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream)
    {
        typedef typename TypeVec<T, cn>::vec_type type;

        typedef void (*caller_t)(const DevMem2D_<type>& src, const DevMem2D_<type>& dst, cudaStream_t stream);

        static const caller_t callers[] =
        {
            pyrDown_caller<type, BrdReflect101>, pyrDown_caller<type, BrdReplicate>, pyrDown_caller<type, BrdConstant>, pyrDown_caller<type, BrdReflect>, pyrDown_caller<type, BrdWrap>
        };
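        // borderType here is the gpu module's internal border index (reflect101,
        // replicate, constant, reflect, wrap), presumably mapped from the public
        // BORDER_* constants by the host-side wrapper.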

        callers[borderType](static_cast< DevMem2D_<type> >(src), static_cast< DevMem2D_<type> >(dst), stream);
    }

    template void pyrDown_gpu<uchar, 1>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
    template void pyrDown_gpu<uchar, 2>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
    template void pyrDown_gpu<uchar, 3>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
    template void pyrDown_gpu<uchar, 4>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);

    template void pyrDown_gpu<schar, 1>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
    template void pyrDown_gpu<schar, 2>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
    template void pyrDown_gpu<schar, 3>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
    template void pyrDown_gpu<schar, 4>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);

    template void pyrDown_gpu<ushort, 1>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
    template void pyrDown_gpu<ushort, 2>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
    template void pyrDown_gpu<ushort, 3>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
    template void pyrDown_gpu<ushort, 4>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);

    template void pyrDown_gpu<short, 1>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
    template void pyrDown_gpu<short, 2>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
    template void pyrDown_gpu<short, 3>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
    template void pyrDown_gpu<short, 4>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);

    template void pyrDown_gpu<int, 1>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
    template void pyrDown_gpu<int, 2>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
    template void pyrDown_gpu<int, 3>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
    template void pyrDown_gpu<int, 4>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);

    template void pyrDown_gpu<float, 1>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
    template void pyrDown_gpu<float, 2>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
    template void pyrDown_gpu<float, 3>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
    template void pyrDown_gpu<float, 4>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);

    //////////////////////////////////////////////////////////////////////////
    // pyrUp

    template <typename T, typename B> __global__ void pyrUp(const PtrStep_<T> src, DevMem2D_<T> dst, const B b)
    {
        typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type value_type;

        const int x = blockIdx.x * blockDim.x + threadIdx.x;
        const int y = blockIdx.y * blockDim.y + threadIdx.y;

        __shared__ T smem1[10][10];
        __shared__ value_type smem2[20][16];
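        // pyrUp interleaves zeros implicitly: the (tidx % 2) masks select the even or
        // odd taps of the (1 4 6 4 1)/16 kernel, and the final sum is scaled by 4 to
        // compensate for the zero-inserted samples. smem1 caches a 10x10 source tile
        // (the 8x8 half-resolution footprint of a 16x16 block plus a 1-pixel border);
        // smem2 holds the horizontally filtered rows with a 2-row halo on each side.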

        value_type sum;

        if (threadIdx.x < 10 && threadIdx.y < 10)
            smem1[threadIdx.y][threadIdx.x] = b.at(blockIdx.y * blockDim.y / 2 + threadIdx.y - 1, blockIdx.x * blockDim.x / 2 + threadIdx.x - 1, src.data, src.step);

        __syncthreads();

        const int tidx = threadIdx.x;

        sum = VecTraits<value_type>::all(0);

        sum = sum + (tidx % 2 == 0) * 0.0625f * smem1[1 + threadIdx.y / 2][1 + ((tidx - 2) >> 1)];
        sum = sum + (tidx % 2 != 0) * 0.25f   * smem1[1 + threadIdx.y / 2][1 + ((tidx - 1) >> 1)];
        sum = sum + (tidx % 2 == 0) * 0.375f  * smem1[1 + threadIdx.y / 2][1 + ((tidx    ) >> 1)];
        sum = sum + (tidx % 2 != 0) * 0.25f   * smem1[1 + threadIdx.y / 2][1 + ((tidx + 1) >> 1)];
        sum = sum + (tidx % 2 == 0) * 0.0625f * smem1[1 + threadIdx.y / 2][1 + ((tidx + 2) >> 1)];

        smem2[2 + threadIdx.y][tidx] = sum;

        if (threadIdx.y < 2)
        {
            sum = VecTraits<value_type>::all(0);

            sum = sum + (tidx % 2 == 0) * 0.0625f * smem1[0][1 + ((tidx - 2) >> 1)];
            sum = sum + (tidx % 2 != 0) * 0.25f   * smem1[0][1 + ((tidx - 1) >> 1)];
            sum = sum + (tidx % 2 == 0) * 0.375f  * smem1[0][1 + ((tidx    ) >> 1)];
            sum = sum + (tidx % 2 != 0) * 0.25f   * smem1[0][1 + ((tidx + 1) >> 1)];
            sum = sum + (tidx % 2 == 0) * 0.0625f * smem1[0][1 + ((tidx + 2) >> 1)];

            smem2[threadIdx.y][tidx] = sum;
        }

        if (threadIdx.y > 13)
        {
            sum = VecTraits<value_type>::all(0);

            sum = sum + (tidx % 2 == 0) * 0.0625f * smem1[9][1 + ((tidx - 2) >> 1)];
            sum = sum + (tidx % 2 != 0) * 0.25f   * smem1[9][1 + ((tidx - 1) >> 1)];
            sum = sum + (tidx % 2 == 0) * 0.375f  * smem1[9][1 + ((tidx    ) >> 1)];
            sum = sum + (tidx % 2 != 0) * 0.25f   * smem1[9][1 + ((tidx + 1) >> 1)];
            sum = sum + (tidx % 2 == 0) * 0.0625f * smem1[9][1 + ((tidx + 2) >> 1)];

            smem2[4 + threadIdx.y][tidx] = sum;
        }

        __syncthreads();

        sum = VecTraits<value_type>::all(0);

        sum = sum + (tidx % 2 == 0) * 0.0625f * smem2[2 + threadIdx.y - 2][tidx];
        sum = sum + (tidx % 2 != 0) * 0.25f   * smem2[2 + threadIdx.y - 1][tidx];
        sum = sum + (tidx % 2 == 0) * 0.375f  * smem2[2 + threadIdx.y    ][tidx];
        sum = sum + (tidx % 2 != 0) * 0.25f   * smem2[2 + threadIdx.y + 1][tidx];
        sum = sum + (tidx % 2 == 0) * 0.0625f * smem2[2 + threadIdx.y + 2][tidx];

        if (x < dst.cols && y < dst.rows)
            dst.ptr(y)[x] = saturate_cast<T>(4.0f * sum);
    }

    template <typename T, template <typename> class B> void pyrUp_caller(const DevMem2D_<T>& src, const DevMem2D_<T>& dst, cudaStream_t stream)
    {
        const dim3 block(16, 16);
        const dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));

        B<T> b(src.rows, src.cols);

        pyrUp<T><<<grid, block, 0, stream>>>(src, dst, b);
        cudaSafeCall( cudaGetLastError() );

        if (stream == 0)
            cudaSafeCall( cudaDeviceSynchronize() );
    }

    template <typename T, int cn> void pyrUp_gpu(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream)
    {
        typedef typename TypeVec<T, cn>::vec_type type;

        typedef void (*caller_t)(const DevMem2D_<type>& src, const DevMem2D_<type>& dst, cudaStream_t stream);

        static const caller_t callers[] =
        {
            pyrUp_caller<type, BrdReflect101>, pyrUp_caller<type, BrdReplicate>, pyrUp_caller<type, BrdConstant>, pyrUp_caller<type, BrdReflect>, pyrUp_caller<type, BrdWrap>
        };

        callers[borderType](static_cast< DevMem2D_<type> >(src), static_cast< DevMem2D_<type> >(dst), stream);
    }

    template void pyrUp_gpu<uchar, 1>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
    template void pyrUp_gpu<uchar, 2>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
    template void pyrUp_gpu<uchar, 3>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
    template void pyrUp_gpu<uchar, 4>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);

    template void pyrUp_gpu<schar, 1>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
    template void pyrUp_gpu<schar, 2>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
    template void pyrUp_gpu<schar, 3>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
    template void pyrUp_gpu<schar, 4>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);

    template void pyrUp_gpu<ushort, 1>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
    template void pyrUp_gpu<ushort, 2>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
    template void pyrUp_gpu<ushort, 3>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
    template void pyrUp_gpu<ushort, 4>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);

    template void pyrUp_gpu<short, 1>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
    template void pyrUp_gpu<short, 2>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
    template void pyrUp_gpu<short, 3>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
    template void pyrUp_gpu<short, 4>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);

    template void pyrUp_gpu<int, 1>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
    template void pyrUp_gpu<int, 2>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
    template void pyrUp_gpu<int, 3>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
    template void pyrUp_gpu<int, 4>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);

    template void pyrUp_gpu<float, 1>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
    template void pyrUp_gpu<float, 2>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
    template void pyrUp_gpu<float, 3>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
    template void pyrUp_gpu<float, 4>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
}

//////////////////////////////////////////////////////////////////////////
// buildWarpMaps

modules/gpu/src/cuda/pyr_down.cu (new file, 185 lines)
@ -0,0 +1,185 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                           License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistribution's of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistribution's in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/

#include "internal_shared.hpp"
#include "opencv2/gpu/device/border_interpolate.hpp"
#include "opencv2/gpu/device/vec_traits.hpp"
#include "opencv2/gpu/device/vec_math.hpp"
#include "opencv2/gpu/device/saturate_cast.hpp"

using namespace cv::gpu;
using namespace cv::gpu::device;

namespace cv { namespace gpu { namespace imgproc
{
    template <typename T, typename B> __global__ void pyrDown(const PtrStep_<T> src, PtrStep_<T> dst, const B b, int dst_cols)
    {
        typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type value_type;

        const int x = blockIdx.x * blockDim.x + threadIdx.x;
        const int y = blockIdx.y;

        __shared__ value_type smem[256 + 4];

        value_type sum;

        const int src_y = 2*y;

        sum = VecTraits<value_type>::all(0);

        sum = sum + 0.0625f * b.at(src_y - 2, x, src.data, src.step);
        sum = sum + 0.25f   * b.at(src_y - 1, x, src.data, src.step);
        sum = sum + 0.375f  * b.at(src_y    , x, src.data, src.step);
        sum = sum + 0.25f   * b.at(src_y + 1, x, src.data, src.step);
        sum = sum + 0.0625f * b.at(src_y + 2, x, src.data, src.step);

        smem[2 + threadIdx.x] = sum;

        if (threadIdx.x < 2)
        {
            const int left_x = x - 2;

            sum = VecTraits<value_type>::all(0);

            sum = sum + 0.0625f * b.at(src_y - 2, left_x, src.data, src.step);
            sum = sum + 0.25f   * b.at(src_y - 1, left_x, src.data, src.step);
            sum = sum + 0.375f  * b.at(src_y    , left_x, src.data, src.step);
            sum = sum + 0.25f   * b.at(src_y + 1, left_x, src.data, src.step);
            sum = sum + 0.0625f * b.at(src_y + 2, left_x, src.data, src.step);

            smem[threadIdx.x] = sum;
        }

        if (threadIdx.x > 253)
        {
            const int right_x = x + 2;

            sum = VecTraits<value_type>::all(0);

            sum = sum + 0.0625f * b.at(src_y - 2, right_x, src.data, src.step);
            sum = sum + 0.25f   * b.at(src_y - 1, right_x, src.data, src.step);
            sum = sum + 0.375f  * b.at(src_y    , right_x, src.data, src.step);
            sum = sum + 0.25f   * b.at(src_y + 1, right_x, src.data, src.step);
            sum = sum + 0.0625f * b.at(src_y + 2, right_x, src.data, src.step);

            smem[4 + threadIdx.x] = sum;
        }

        __syncthreads();

        if (threadIdx.x < 128)
        {
            const int tid2 = threadIdx.x * 2;

            sum = VecTraits<value_type>::all(0);

            sum = sum + 0.0625f * smem[2 + tid2 - 2];
            sum = sum + 0.25f   * smem[2 + tid2 - 1];
            sum = sum + 0.375f  * smem[2 + tid2    ];
            sum = sum + 0.25f   * smem[2 + tid2 + 1];
            sum = sum + 0.0625f * smem[2 + tid2 + 2];

            const int dst_x = (blockIdx.x * blockDim.x + tid2) / 2;

            if (dst_x < dst_cols)
                dst.ptr(y)[dst_x] = saturate_cast<T>(sum);
        }
    }

    template <typename T, template <typename> class B> void pyrDown_caller(const DevMem2D_<T>& src, const DevMem2D_<T>& dst, cudaStream_t stream)
    {
        const dim3 block(256);
        const dim3 grid(divUp(src.cols, block.x), dst.rows);

        B<T> b(src.rows, src.cols);

        pyrDown<T><<<grid, block, 0, stream>>>(src, dst, b, dst.cols);
        cudaSafeCall( cudaGetLastError() );

        if (stream == 0)
            cudaSafeCall( cudaDeviceSynchronize() );
    }

    template <typename T, int cn> void pyrDown_gpu(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream)
    {
        typedef typename TypeVec<T, cn>::vec_type type;

        typedef void (*caller_t)(const DevMem2D_<type>& src, const DevMem2D_<type>& dst, cudaStream_t stream);

        static const caller_t callers[] =
        {
            pyrDown_caller<type, BrdReflect101>, pyrDown_caller<type, BrdReplicate>, pyrDown_caller<type, BrdConstant>, pyrDown_caller<type, BrdReflect>, pyrDown_caller<type, BrdWrap>
        };

        callers[borderType](static_cast< DevMem2D_<type> >(src), static_cast< DevMem2D_<type> >(dst), stream);
    }

    template void pyrDown_gpu<uchar, 1>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
    template void pyrDown_gpu<uchar, 2>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
    template void pyrDown_gpu<uchar, 3>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
    template void pyrDown_gpu<uchar, 4>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);

    template void pyrDown_gpu<schar, 1>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
    template void pyrDown_gpu<schar, 2>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
    template void pyrDown_gpu<schar, 3>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
    template void pyrDown_gpu<schar, 4>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);

    template void pyrDown_gpu<ushort, 1>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
    template void pyrDown_gpu<ushort, 2>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
    template void pyrDown_gpu<ushort, 3>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
    template void pyrDown_gpu<ushort, 4>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);

    template void pyrDown_gpu<short, 1>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
    template void pyrDown_gpu<short, 2>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
    template void pyrDown_gpu<short, 3>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
    template void pyrDown_gpu<short, 4>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);

    template void pyrDown_gpu<int, 1>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
    template void pyrDown_gpu<int, 2>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
    template void pyrDown_gpu<int, 3>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
    template void pyrDown_gpu<int, 4>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);

    template void pyrDown_gpu<float, 1>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
    template void pyrDown_gpu<float, 2>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
    template void pyrDown_gpu<float, 3>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
    template void pyrDown_gpu<float, 4>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
}}}
modules/gpu/src/cuda/pyr_up.cu (new file, 180 lines)
@ -0,0 +1,180 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                           License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistribution's of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistribution's in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/

#include "internal_shared.hpp"
#include "opencv2/gpu/device/border_interpolate.hpp"
#include "opencv2/gpu/device/vec_traits.hpp"
#include "opencv2/gpu/device/vec_math.hpp"
#include "opencv2/gpu/device/saturate_cast.hpp"

using namespace cv::gpu;
using namespace cv::gpu::device;

namespace cv { namespace gpu { namespace imgproc
{
    template <typename T, typename B> __global__ void pyrUp(const PtrStep_<T> src, DevMem2D_<T> dst, const B b)
    {
        typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type value_type;

        const int x = blockIdx.x * blockDim.x + threadIdx.x;
        const int y = blockIdx.y * blockDim.y + threadIdx.y;

        __shared__ T smem1[10][10];
        __shared__ value_type smem2[20][16];

        value_type sum;

        if (threadIdx.x < 10 && threadIdx.y < 10)
            smem1[threadIdx.y][threadIdx.x] = b.at(blockIdx.y * blockDim.y / 2 + threadIdx.y - 1, blockIdx.x * blockDim.x / 2 + threadIdx.x - 1, src.data, src.step);

        __syncthreads();

        const int tidx = threadIdx.x;

        sum = VecTraits<value_type>::all(0);

        sum = sum + (tidx % 2 == 0) * 0.0625f * smem1[1 + threadIdx.y / 2][1 + ((tidx - 2) >> 1)];
        sum = sum + (tidx % 2 != 0) * 0.25f   * smem1[1 + threadIdx.y / 2][1 + ((tidx - 1) >> 1)];
        sum = sum + (tidx % 2 == 0) * 0.375f  * smem1[1 + threadIdx.y / 2][1 + ((tidx    ) >> 1)];
        sum = sum + (tidx % 2 != 0) * 0.25f   * smem1[1 + threadIdx.y / 2][1 + ((tidx + 1) >> 1)];
        sum = sum + (tidx % 2 == 0) * 0.0625f * smem1[1 + threadIdx.y / 2][1 + ((tidx + 2) >> 1)];

        smem2[2 + threadIdx.y][tidx] = sum;

        if (threadIdx.y < 2)
        {
            sum = VecTraits<value_type>::all(0);

            sum = sum + (tidx % 2 == 0) * 0.0625f * smem1[0][1 + ((tidx - 2) >> 1)];
            sum = sum + (tidx % 2 != 0) * 0.25f   * smem1[0][1 + ((tidx - 1) >> 1)];
            sum = sum + (tidx % 2 == 0) * 0.375f  * smem1[0][1 + ((tidx    ) >> 1)];
            sum = sum + (tidx % 2 != 0) * 0.25f   * smem1[0][1 + ((tidx + 1) >> 1)];
            sum = sum + (tidx % 2 == 0) * 0.0625f * smem1[0][1 + ((tidx + 2) >> 1)];

            smem2[threadIdx.y][tidx] = sum;
        }

        if (threadIdx.y > 13)
        {
            sum = VecTraits<value_type>::all(0);

            sum = sum + (tidx % 2 == 0) * 0.0625f * smem1[9][1 + ((tidx - 2) >> 1)];
            sum = sum + (tidx % 2 != 0) * 0.25f   * smem1[9][1 + ((tidx - 1) >> 1)];
            sum = sum + (tidx % 2 == 0) * 0.375f  * smem1[9][1 + ((tidx    ) >> 1)];
            sum = sum + (tidx % 2 != 0) * 0.25f   * smem1[9][1 + ((tidx + 1) >> 1)];
            sum = sum + (tidx % 2 == 0) * 0.0625f * smem1[9][1 + ((tidx + 2) >> 1)];

            smem2[4 + threadIdx.y][tidx] = sum;
        }

        __syncthreads();

        sum = VecTraits<value_type>::all(0);

        sum = sum + (tidx % 2 == 0) * 0.0625f * smem2[2 + threadIdx.y - 2][tidx];
        sum = sum + (tidx % 2 != 0) * 0.25f   * smem2[2 + threadIdx.y - 1][tidx];
        sum = sum + (tidx % 2 == 0) * 0.375f  * smem2[2 + threadIdx.y    ][tidx];
        sum = sum + (tidx % 2 != 0) * 0.25f   * smem2[2 + threadIdx.y + 1][tidx];
        sum = sum + (tidx % 2 == 0) * 0.0625f * smem2[2 + threadIdx.y + 2][tidx];

        if (x < dst.cols && y < dst.rows)
            dst.ptr(y)[x] = saturate_cast<T>(4.0f * sum);
    }

    template <typename T, template <typename> class B> void pyrUp_caller(const DevMem2D_<T>& src, const DevMem2D_<T>& dst, cudaStream_t stream)
    {
        const dim3 block(16, 16);
        const dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));

        B<T> b(src.rows, src.cols);

        pyrUp<T><<<grid, block, 0, stream>>>(src, dst, b);
        cudaSafeCall( cudaGetLastError() );

        if (stream == 0)
            cudaSafeCall( cudaDeviceSynchronize() );
    }

    template <typename T, int cn> void pyrUp_gpu(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream)
    {
        typedef typename TypeVec<T, cn>::vec_type type;

        typedef void (*caller_t)(const DevMem2D_<type>& src, const DevMem2D_<type>& dst, cudaStream_t stream);

        static const caller_t callers[] =
        {
            pyrUp_caller<type, BrdReflect101>, pyrUp_caller<type, BrdReplicate>, pyrUp_caller<type, BrdConstant>, pyrUp_caller<type, BrdReflect>, pyrUp_caller<type, BrdWrap>
        };

        callers[borderType](static_cast< DevMem2D_<type> >(src), static_cast< DevMem2D_<type> >(dst), stream);
    }

    template void pyrUp_gpu<uchar, 1>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
    template void pyrUp_gpu<uchar, 2>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
    template void pyrUp_gpu<uchar, 3>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
    template void pyrUp_gpu<uchar, 4>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);

    template void pyrUp_gpu<schar, 1>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
    template void pyrUp_gpu<schar, 2>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
    template void pyrUp_gpu<schar, 3>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
    template void pyrUp_gpu<schar, 4>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);

    template void pyrUp_gpu<ushort, 1>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
    template void pyrUp_gpu<ushort, 2>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
    template void pyrUp_gpu<ushort, 3>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
    template void pyrUp_gpu<ushort, 4>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);

    template void pyrUp_gpu<short, 1>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
    template void pyrUp_gpu<short, 2>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
    template void pyrUp_gpu<short, 3>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
    template void pyrUp_gpu<short, 4>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);

    template void pyrUp_gpu<int, 1>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
    template void pyrUp_gpu<int, 2>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
    template void pyrUp_gpu<int, 3>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
    template void pyrUp_gpu<int, 4>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);

    template void pyrUp_gpu<float, 1>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
    template void pyrUp_gpu<float, 2>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
    template void pyrUp_gpu<float, 3>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
    template void pyrUp_gpu<float, 4>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
}}}
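
The borderType argument of pyrUp_gpu indexes the caller table above, so 0..4 select BrdReflect101, BrdReplicate, BrdConstant, BrdReflect and BrdWrap, in that order. A minimal call sketch; the wrapper name is hypothetical, not part of the commit:

// Illustrative only; upsampleWithReplicate is not part of the commit.
void upsampleWithReplicate(const DevMem2D& src, const DevMem2D& dst, cudaStream_t s)
{
    // dst is expected to be roughly twice the source size; the kernel maps
    // each dst pixel back to src at half coordinates. Index 1 = BrdReplicate.
    cv::gpu::imgproc::pyrUp_gpu<float, 1>(src, dst, 1, s);
}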
249 modules/gpu/src/cuda/remap.cu Normal file
@ -0,0 +1,249 @@
#include "internal_shared.hpp"
#include "opencv2/gpu/device/border_interpolate.hpp"
#include "opencv2/gpu/device/vec_traits.hpp"
#include "opencv2/gpu/device/vec_math.hpp"
#include "opencv2/gpu/device/saturate_cast.hpp"
#include "opencv2/gpu/device/filters.hpp"

using namespace cv::gpu;
using namespace cv::gpu::device;

namespace cv { namespace gpu { namespace imgproc
{
    template <typename Ptr2D, typename T> __global__ void remap(const Ptr2D src, const PtrStepf mapx, const PtrStepf mapy, DevMem2D_<T> dst)
    {
        const int x = blockDim.x * blockIdx.x + threadIdx.x;
        const int y = blockDim.y * blockIdx.y + threadIdx.y;

        if (x < dst.cols && y < dst.rows)
        {
            const float xcoo = mapx.ptr(y)[x];
            const float ycoo = mapy.ptr(y)[x];

            dst.ptr(y)[x] = saturate_cast<T>(src(ycoo, xcoo));
        }
    }

    template <template <typename> class Filter, template <typename> class B, typename T> struct RemapDispatcherStream
    {
        static void call(const DevMem2D_<T>& src, const DevMem2Df& mapx, const DevMem2Df& mapy, const DevMem2D_<T>& dst, const float* borderValue, cudaStream_t stream)
        {
            typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type work_type;

            dim3 block(32, 8);
            dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));

            B<work_type> brd(src.rows, src.cols, VecTraits<work_type>::make(borderValue));
            BorderReader< PtrStep_<T>, B<work_type> > brdSrc(src, brd);
            Filter< BorderReader< PtrStep_<T>, B<work_type> > > filter_src(brdSrc);

            remap<<<grid, block, 0, stream>>>(filter_src, mapx, mapy, dst);
            cudaSafeCall( cudaGetLastError() );
        }
    };

    template <template <typename> class Filter, template <typename> class B, typename T> struct RemapDispatcherNonStream
    {
        static void call(const DevMem2D_<T>& src, const DevMem2Df& mapx, const DevMem2Df& mapy, const DevMem2D_<T>& dst, const float* borderValue)
        {
            typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type work_type;

            dim3 block(32, 8);
            dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));

            B<work_type> brd(src.rows, src.cols, VecTraits<work_type>::make(borderValue));
            BorderReader< PtrStep_<T>, B<work_type> > brdSrc(src, brd);
            Filter< BorderReader< PtrStep_<T>, B<work_type> > > filter_src(brdSrc);

            remap<<<grid, block>>>(filter_src, mapx, mapy, dst);
            cudaSafeCall( cudaGetLastError() );

            cudaSafeCall( cudaDeviceSynchronize() );
        }
    };

    #define OPENCV_GPU_IMPLEMENT_REMAP_TEX(type) \
        texture< type , cudaTextureType2D> tex_remap_ ## type (0, cudaFilterModePoint, cudaAddressModeClamp); \
        struct tex_remap_ ## type ## _reader \
        { \
            typedef type elem_type; \
            typedef int index_type; \
            __device__ __forceinline__ elem_type operator ()(index_type y, index_type x) const \
            { \
                return tex2D(tex_remap_ ## type , x, y); \
            } \
        }; \
        template <template <typename> class Filter, template <typename> class B> struct RemapDispatcherNonStream<Filter, B, type> \
        { \
            static void call(const DevMem2D_< type >& src, const DevMem2Df& mapx, const DevMem2Df& mapy, const DevMem2D_< type >& dst, const float* borderValue) \
            { \
                typedef typename TypeVec<float, VecTraits< type >::cn>::vec_type work_type; \
                dim3 block(32, 8); \
                dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y)); \
                TextureBinder texHandler(&tex_remap_ ## type , src); \
                tex_remap_ ## type ##_reader texSrc; \
                B<work_type> brd(src.rows, src.cols, VecTraits<work_type>::make(borderValue)); \
                BorderReader< tex_remap_ ## type ##_reader, B<work_type> > brdSrc(texSrc, brd); \
                Filter< BorderReader< tex_remap_ ## type ##_reader, B<work_type> > > filter_src(brdSrc); \
                remap<<<grid, block>>>(filter_src, mapx, mapy, dst); \
                cudaSafeCall( cudaGetLastError() ); \
                cudaSafeCall( cudaDeviceSynchronize() ); \
            } \
        }; \
        template <template <typename> class Filter> struct RemapDispatcherNonStream<Filter, BrdReplicate, type> \
        { \
            static void call(const DevMem2D_< type >& src, const DevMem2Df& mapx, const DevMem2Df& mapy, const DevMem2D_< type >& dst, const float*) \
            { \
                dim3 block(32, 8); \
                dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y)); \
                TextureBinder texHandler(&tex_remap_ ## type , src); \
                tex_remap_ ## type ##_reader texSrc; \
                Filter< tex_remap_ ## type ##_reader > filter_src(texSrc); \
                remap<<<grid, block>>>(filter_src, mapx, mapy, dst); \
                cudaSafeCall( cudaGetLastError() ); \
                cudaSafeCall( cudaDeviceSynchronize() ); \
            } \
        };

    OPENCV_GPU_IMPLEMENT_REMAP_TEX(uchar)
    //OPENCV_GPU_IMPLEMENT_REMAP_TEX(uchar2)
    OPENCV_GPU_IMPLEMENT_REMAP_TEX(uchar4)

    //OPENCV_GPU_IMPLEMENT_REMAP_TEX(schar)
    //OPENCV_GPU_IMPLEMENT_REMAP_TEX(char2)
    //OPENCV_GPU_IMPLEMENT_REMAP_TEX(char4)

    OPENCV_GPU_IMPLEMENT_REMAP_TEX(ushort)
    //OPENCV_GPU_IMPLEMENT_REMAP_TEX(ushort2)
    OPENCV_GPU_IMPLEMENT_REMAP_TEX(ushort4)

    OPENCV_GPU_IMPLEMENT_REMAP_TEX(short)
    //OPENCV_GPU_IMPLEMENT_REMAP_TEX(short2)
    OPENCV_GPU_IMPLEMENT_REMAP_TEX(short4)

    //OPENCV_GPU_IMPLEMENT_REMAP_TEX(int)
    //OPENCV_GPU_IMPLEMENT_REMAP_TEX(int2)
    //OPENCV_GPU_IMPLEMENT_REMAP_TEX(int4)

    OPENCV_GPU_IMPLEMENT_REMAP_TEX(float)
    //OPENCV_GPU_IMPLEMENT_REMAP_TEX(float2)
    OPENCV_GPU_IMPLEMENT_REMAP_TEX(float4)

    #undef OPENCV_GPU_IMPLEMENT_REMAP_TEX

    template <template <typename> class Filter, template <typename> class B, typename T> struct RemapDispatcher
    {
        static void call(const DevMem2D_<T>& src, const DevMem2Df& mapx, const DevMem2Df& mapy, const DevMem2D_<T>& dst, const float* borderValue, cudaStream_t stream)
        {
            if (stream == 0)
                RemapDispatcherNonStream<Filter, B, T>::call(src, mapx, mapy, dst, borderValue);
            else
                RemapDispatcherStream<Filter, B, T>::call(src, mapx, mapy, dst, borderValue, stream);
        }
    };

    template <typename T> void remap_gpu(const DevMem2D& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2D& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream)
    {
        typedef void (*caller_t)(const DevMem2D_<T>& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2D_<T>& dst, const float* borderValue, cudaStream_t stream);

        static const caller_t callers[3][5] =
        {
            {
                RemapDispatcher<PointFilter, BrdReflect101, T>::call,
                RemapDispatcher<PointFilter, BrdReplicate, T>::call,
                RemapDispatcher<PointFilter, BrdConstant, T>::call,
                RemapDispatcher<PointFilter, BrdReflect, T>::call,
                RemapDispatcher<PointFilter, BrdWrap, T>::call
            },
            {
                RemapDispatcher<LinearFilter, BrdReflect101, T>::call,
                RemapDispatcher<LinearFilter, BrdReplicate, T>::call,
                RemapDispatcher<LinearFilter, BrdConstant, T>::call,
                RemapDispatcher<LinearFilter, BrdReflect, T>::call,
                RemapDispatcher<LinearFilter, BrdWrap, T>::call
            },
            {
                RemapDispatcher<CubicFilter, BrdReflect101, T>::call,
                RemapDispatcher<CubicFilter, BrdReplicate, T>::call,
                RemapDispatcher<CubicFilter, BrdConstant, T>::call,
                RemapDispatcher<CubicFilter, BrdReflect, T>::call,
                RemapDispatcher<CubicFilter, BrdWrap, T>::call
            }
        };

        callers[interpolation][borderMode](static_cast< DevMem2D_<T> >(src), xmap, ymap, static_cast< DevMem2D_<T> >(dst), borderValue, stream);
    }

    template void remap_gpu<uchar >(const DevMem2D& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2D& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream);
    //template void remap_gpu<uchar2>(const DevMem2D& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2D& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream);
    template void remap_gpu<uchar3>(const DevMem2D& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2D& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream);
    template void remap_gpu<uchar4>(const DevMem2D& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2D& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream);

    //template void remap_gpu<schar>(const DevMem2D& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2D& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream);
    //template void remap_gpu<char2>(const DevMem2D& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2D& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream);
    //template void remap_gpu<char3>(const DevMem2D& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2D& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream);
    //template void remap_gpu<char4>(const DevMem2D& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2D& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream);

    template void remap_gpu<ushort >(const DevMem2D& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2D& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream);
    //template void remap_gpu<ushort2>(const DevMem2D& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2D& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream);
    template void remap_gpu<ushort3>(const DevMem2D& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2D& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream);
    template void remap_gpu<ushort4>(const DevMem2D& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2D& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream);

    template void remap_gpu<short >(const DevMem2D& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2D& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream);
    //template void remap_gpu<short2>(const DevMem2D& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2D& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream);
    template void remap_gpu<short3>(const DevMem2D& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2D& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream);
    template void remap_gpu<short4>(const DevMem2D& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2D& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream);

    //template void remap_gpu<int >(const DevMem2D& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2D& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream);
    //template void remap_gpu<int2>(const DevMem2D& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2D& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream);
    //template void remap_gpu<int3>(const DevMem2D& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2D& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream);
    //template void remap_gpu<int4>(const DevMem2D& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2D& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream);

    template void remap_gpu<float >(const DevMem2D& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2D& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream);
    //template void remap_gpu<float2>(const DevMem2D& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2D& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream);
    template void remap_gpu<float3>(const DevMem2D& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2D& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream);
    template void remap_gpu<float4>(const DevMem2D& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2D& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream);
}}}
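
remap_gpu dispatches through callers[interpolation][borderMode], so interpolation must be 0..2 (point, linear, cubic) and borderMode 0..4 (reflect-101, replicate, constant, reflect, wrap). A hypothetical direct call, for illustration only:

// Sketch; remapExample is not part of the commit.
void remapExample(const DevMem2D& src, const DevMem2Df& xmap, const DevMem2Df& ymap,
                  const DevMem2D& dst)
{
    const float borderValue[4] = { 0.f, 0.f, 0.f, 0.f }; // only read by BrdConstant
    // Linear filter (1) with constant border (2); a null stream routes through
    // RemapDispatcherNonStream, which synchronizes and may use the texture path.
    cv::gpu::imgproc::remap_gpu<uchar>(src, xmap, ymap, dst, 1, 2, borderValue, 0);
}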
264 modules/gpu/src/cuda/resize.cu Normal file
@ -0,0 +1,264 @@
#include "internal_shared.hpp"
#include "opencv2/gpu/device/border_interpolate.hpp"
#include "opencv2/gpu/device/vec_traits.hpp"
#include "opencv2/gpu/device/vec_math.hpp"
#include "opencv2/gpu/device/saturate_cast.hpp"
#include "opencv2/gpu/device/filters.hpp"

using namespace cv::gpu;
using namespace cv::gpu::device;

namespace cv { namespace gpu { namespace imgproc
{
    template <typename Ptr2D, typename T> __global__ void resize(const Ptr2D src, float fx, float fy, DevMem2D_<T> dst)
    {
        const int x = blockDim.x * blockIdx.x + threadIdx.x;
        const int y = blockDim.y * blockIdx.y + threadIdx.y;

        if (x < dst.cols && y < dst.rows)
        {
            const float xcoo = x / fx;
            const float ycoo = y / fy;

            dst.ptr(y)[x] = saturate_cast<T>(src(ycoo, xcoo));
        }
    }

    template <typename Ptr2D, typename T> __global__ void resizeNN(const Ptr2D src, float fx, float fy, DevMem2D_<T> dst)
    {
        const int x = blockDim.x * blockIdx.x + threadIdx.x;
        const int y = blockDim.y * blockIdx.y + threadIdx.y;

        if (x < dst.cols && y < dst.rows)
        {
            const float xcoo = x / fx;
            const float ycoo = y / fy;

            dst.ptr(y)[x] = src(__float2int_rd(ycoo), __float2int_rd(xcoo));
        }
    }

    template <template <typename> class Filter, typename T> struct ResizeDispatcherStream
    {
        static void call(const DevMem2D_<T>& src, float fx, float fy, const DevMem2D_<T>& dst, cudaStream_t stream)
        {
            dim3 block(32, 8);
            dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));

            BrdReplicate<T> brd(src.rows, src.cols);
            BorderReader< PtrStep_<T>, BrdReplicate<T> > brdSrc(src, brd);
            Filter< BorderReader< PtrStep_<T>, BrdReplicate<T> > > filter_src(brdSrc);

            resize<<<grid, block, 0, stream>>>(filter_src, fx, fy, dst);
            cudaSafeCall( cudaGetLastError() );
        }
    };

    template <typename T> struct ResizeDispatcherStream<PointFilter, T>
    {
        static void call(const DevMem2D_<T>& src, float fx, float fy, const DevMem2D_<T>& dst, cudaStream_t stream)
        {
            dim3 block(32, 8);
            dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));

            BrdReplicate<T> brd(src.rows, src.cols);
            BorderReader< PtrStep_<T>, BrdReplicate<T> > brdSrc(src, brd);

            resizeNN<<<grid, block, 0, stream>>>(brdSrc, fx, fy, dst);
            cudaSafeCall( cudaGetLastError() );
        }
    };

    template <template <typename> class Filter, typename T> struct ResizeDispatcherNonStream
    {
        static void call(const DevMem2D_<T>& src, float fx, float fy, const DevMem2D_<T>& dst)
        {
            dim3 block(32, 8);
            dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));

            BrdReplicate<T> brd(src.rows, src.cols);
            BorderReader< PtrStep_<T>, BrdReplicate<T> > brdSrc(src, brd);
            Filter< BorderReader< PtrStep_<T>, BrdReplicate<T> > > filter_src(brdSrc);

            resize<<<grid, block>>>(filter_src, fx, fy, dst);
            cudaSafeCall( cudaGetLastError() );

            cudaSafeCall( cudaDeviceSynchronize() );
        }
    };

    template <typename T> struct ResizeDispatcherNonStream<PointFilter, T>
    {
        static void call(const DevMem2D_<T>& src, float fx, float fy, const DevMem2D_<T>& dst)
        {
            dim3 block(32, 8);
            dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));

            BrdReplicate<T> brd(src.rows, src.cols);
            BorderReader< PtrStep_<T>, BrdReplicate<T> > brdSrc(src, brd);

            resizeNN<<<grid, block>>>(brdSrc, fx, fy, dst);
            cudaSafeCall( cudaGetLastError() );

            cudaSafeCall( cudaDeviceSynchronize() );
        }
    };

    #define OPENCV_GPU_IMPLEMENT_RESIZE_TEX(type) \
        texture< type , cudaTextureType2D> tex_resize_ ## type (0, cudaFilterModePoint, cudaAddressModeClamp); \
        struct tex_resize_ ## type ## _reader \
        { \
            typedef type elem_type; \
            typedef int index_type; \
            __device__ __forceinline__ elem_type operator ()(index_type y, index_type x) const \
            { \
                return tex2D(tex_resize_ ## type , x, y); \
            } \
        }; \
        template <template <typename> class Filter> struct ResizeDispatcherNonStream<Filter, type> \
        { \
            static void call(const DevMem2D_< type >& src, float fx, float fy, const DevMem2D_< type >& dst) \
            { \
                dim3 block(32, 8); \
                dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y)); \
                TextureBinder texHandler(&tex_resize_ ## type , src); \
                tex_resize_ ## type ##_reader texSrc; \
                Filter< tex_resize_ ## type ##_reader > filter_src(texSrc); \
                resize<<<grid, block>>>(filter_src, fx, fy, dst); \
                cudaSafeCall( cudaGetLastError() ); \
                cudaSafeCall( cudaDeviceSynchronize() ); \
            } \
        }; \
        template <> struct ResizeDispatcherNonStream<PointFilter, type> \
        { \
            static void call(const DevMem2D_< type >& src, float fx, float fy, const DevMem2D_< type >& dst) \
            { \
                dim3 block(32, 8); \
                dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y)); \
                TextureBinder texHandler(&tex_resize_ ## type , src); \
                tex_resize_ ## type ##_reader texSrc; \
                resizeNN<<<grid, block>>>(texSrc, fx, fy, dst); \
                cudaSafeCall( cudaGetLastError() ); \
                cudaSafeCall( cudaDeviceSynchronize() ); \
            } \
        };

    OPENCV_GPU_IMPLEMENT_RESIZE_TEX(uchar)
    //OPENCV_GPU_IMPLEMENT_RESIZE_TEX(uchar2)
    OPENCV_GPU_IMPLEMENT_RESIZE_TEX(uchar4)

    //OPENCV_GPU_IMPLEMENT_RESIZE_TEX(schar)
    //OPENCV_GPU_IMPLEMENT_RESIZE_TEX(char2)
    //OPENCV_GPU_IMPLEMENT_RESIZE_TEX(char4)

    OPENCV_GPU_IMPLEMENT_RESIZE_TEX(ushort)
    //OPENCV_GPU_IMPLEMENT_RESIZE_TEX(ushort2)
    OPENCV_GPU_IMPLEMENT_RESIZE_TEX(ushort4)

    OPENCV_GPU_IMPLEMENT_RESIZE_TEX(short)
    //OPENCV_GPU_IMPLEMENT_RESIZE_TEX(short2)
    OPENCV_GPU_IMPLEMENT_RESIZE_TEX(short4)

    //OPENCV_GPU_IMPLEMENT_RESIZE_TEX(int)
    //OPENCV_GPU_IMPLEMENT_RESIZE_TEX(int2)
    //OPENCV_GPU_IMPLEMENT_RESIZE_TEX(int4)

    OPENCV_GPU_IMPLEMENT_RESIZE_TEX(float)
    //OPENCV_GPU_IMPLEMENT_RESIZE_TEX(float2)
    OPENCV_GPU_IMPLEMENT_RESIZE_TEX(float4)

    #undef OPENCV_GPU_IMPLEMENT_RESIZE_TEX

    template <template <typename> class Filter, typename T> struct ResizeDispatcher
    {
        static void call(const DevMem2D_<T>& src, float fx, float fy, const DevMem2D_<T>& dst, cudaStream_t stream)
        {
            if (stream == 0)
                ResizeDispatcherNonStream<Filter, T>::call(src, fx, fy, dst);
            else
                ResizeDispatcherStream<Filter, T>::call(src, fx, fy, dst, stream);
        }
    };

    template <typename T> void resize_gpu(const DevMem2D& src, float fx, float fy, const DevMem2D& dst, int interpolation, cudaStream_t stream)
    {
        typedef void (*caller_t)(const DevMem2D_<T>& src, float fx, float fy, const DevMem2D_<T>& dst, cudaStream_t stream);

        static const caller_t callers[3] =
        {
            ResizeDispatcher<PointFilter, T>::call, ResizeDispatcher<LinearFilter, T>::call, ResizeDispatcher<CubicFilter, T>::call
        };

        callers[interpolation](static_cast< DevMem2D_<T> >(src), fx, fy, static_cast< DevMem2D_<T> >(dst), stream);
    }

    template void resize_gpu<uchar >(const DevMem2D& src, float fx, float fy, const DevMem2D& dst, int interpolation, cudaStream_t stream);
    //template void resize_gpu<uchar2>(const DevMem2D& src, float fx, float fy, const DevMem2D& dst, int interpolation, cudaStream_t stream);
    template void resize_gpu<uchar3>(const DevMem2D& src, float fx, float fy, const DevMem2D& dst, int interpolation, cudaStream_t stream);
    template void resize_gpu<uchar4>(const DevMem2D& src, float fx, float fy, const DevMem2D& dst, int interpolation, cudaStream_t stream);

    //template void resize_gpu<schar>(const DevMem2D& src, float fx, float fy, const DevMem2D& dst, int interpolation, cudaStream_t stream);
    //template void resize_gpu<char2>(const DevMem2D& src, float fx, float fy, const DevMem2D& dst, int interpolation, cudaStream_t stream);
    //template void resize_gpu<char3>(const DevMem2D& src, float fx, float fy, const DevMem2D& dst, int interpolation, cudaStream_t stream);
    //template void resize_gpu<char4>(const DevMem2D& src, float fx, float fy, const DevMem2D& dst, int interpolation, cudaStream_t stream);

    template void resize_gpu<ushort >(const DevMem2D& src, float fx, float fy, const DevMem2D& dst, int interpolation, cudaStream_t stream);
    //template void resize_gpu<ushort2>(const DevMem2D& src, float fx, float fy, const DevMem2D& dst, int interpolation, cudaStream_t stream);
    template void resize_gpu<ushort3>(const DevMem2D& src, float fx, float fy, const DevMem2D& dst, int interpolation, cudaStream_t stream);
    template void resize_gpu<ushort4>(const DevMem2D& src, float fx, float fy, const DevMem2D& dst, int interpolation, cudaStream_t stream);

    template void resize_gpu<short >(const DevMem2D& src, float fx, float fy, const DevMem2D& dst, int interpolation, cudaStream_t stream);
    //template void resize_gpu<short2>(const DevMem2D& src, float fx, float fy, const DevMem2D& dst, int interpolation, cudaStream_t stream);
    template void resize_gpu<short3>(const DevMem2D& src, float fx, float fy, const DevMem2D& dst, int interpolation, cudaStream_t stream);
    template void resize_gpu<short4>(const DevMem2D& src, float fx, float fy, const DevMem2D& dst, int interpolation, cudaStream_t stream);

    //template void resize_gpu<int >(const DevMem2D& src, float fx, float fy, const DevMem2D& dst, int interpolation, cudaStream_t stream);
    //template void resize_gpu<int2>(const DevMem2D& src, float fx, float fy, const DevMem2D& dst, int interpolation, cudaStream_t stream);
    //template void resize_gpu<int3>(const DevMem2D& src, float fx, float fy, const DevMem2D& dst, int interpolation, cudaStream_t stream);
    //template void resize_gpu<int4>(const DevMem2D& src, float fx, float fy, const DevMem2D& dst, int interpolation, cudaStream_t stream);

    template void resize_gpu<float >(const DevMem2D& src, float fx, float fy, const DevMem2D& dst, int interpolation, cudaStream_t stream);
    //template void resize_gpu<float2>(const DevMem2D& src, float fx, float fy, const DevMem2D& dst, int interpolation, cudaStream_t stream);
    template void resize_gpu<float3>(const DevMem2D& src, float fx, float fy, const DevMem2D& dst, int interpolation, cudaStream_t stream);
    template void resize_gpu<float4>(const DevMem2D& src, float fx, float fy, const DevMem2D& dst, int interpolation, cudaStream_t stream);
}}}
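
The kernels map a dst pixel x back to the source coordinate x / fx, so fx and fy are dst/src scale factors, and the interpolation index 0/1/2 selects Point/Linear/Cubic from the caller table. A hypothetical call, for illustration only:

// Sketch; resizeHalf is not part of the commit.
void resizeHalf(const DevMem2D& src, const DevMem2D& dst, cudaStream_t s)
{
    const float fx = static_cast<float>(dst.cols) / src.cols; // ~0.5f when halving
    const float fy = static_cast<float>(dst.rows) / src.rows;
    cv::gpu::imgproc::resize_gpu<uchar>(src, fx, fy, dst, 1 /*linear*/, s);
}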
256 modules/gpu/src/cuda/row_filter.cu Normal file
@ -0,0 +1,256 @@
#include "internal_shared.hpp"
#include "opencv2/gpu/device/saturate_cast.hpp"
#include "opencv2/gpu/device/vec_math.hpp"
#include "opencv2/gpu/device/limits.hpp"
#include "opencv2/gpu/device/border_interpolate.hpp"

using namespace cv::gpu;
using namespace cv::gpu::device;

#define MAX_KERNEL_SIZE 16
#define BLOCK_DIM_X 16
#define BLOCK_DIM_Y 16

namespace filter_krnls_row
{
    __constant__ float cLinearKernel[MAX_KERNEL_SIZE];

    void loadLinearKernel(const float kernel[], int ksize)
    {
        cudaSafeCall( cudaMemcpyToSymbol(cLinearKernel, kernel, ksize * sizeof(float)) );
    }

    template <typename T, size_t size> struct SmemType_
    {
        typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type smem_t;
    };
    template <typename T> struct SmemType_<T, 4>
    {
        typedef T smem_t;
    };
    template <typename T> struct SmemType
    {
        typedef typename SmemType_<T, sizeof(T)>::smem_t smem_t;
    };

    template <int ksize, typename T, typename D, typename B>
    __global__ void linearRowFilter(const DevMem2D_<T> src, PtrStep_<D> dst, int anchor, const B b)
    {
        typedef typename SmemType<T>::smem_t smem_t;

        __shared__ smem_t smem[BLOCK_DIM_Y * BLOCK_DIM_X * 3];

        const int x = BLOCK_DIM_X * blockIdx.x + threadIdx.x;
        const int y = BLOCK_DIM_Y * blockIdx.y + threadIdx.y;

        smem_t* sDataRow = smem + threadIdx.y * BLOCK_DIM_X * 3;

        if (y < src.rows)
        {
            const T* rowSrc = src.ptr(y);

            sDataRow[threadIdx.x                  ] = b.at_low(x - BLOCK_DIM_X, rowSrc);
            sDataRow[threadIdx.x + BLOCK_DIM_X    ] = b.at_high(x, rowSrc);
            sDataRow[threadIdx.x + BLOCK_DIM_X * 2] = b.at_high(x + BLOCK_DIM_X, rowSrc);

            __syncthreads();

            if (x < src.cols)
            {
                typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type sum_t;
                sum_t sum = VecTraits<sum_t>::all(0);

                sDataRow += threadIdx.x + BLOCK_DIM_X - anchor;

                #pragma unroll
                for(int i = 0; i < ksize; ++i)
                    sum = sum + sDataRow[i] * cLinearKernel[i];

                dst.ptr(y)[x] = saturate_cast<D>(sum);
            }
        }
    }
}

namespace cv { namespace gpu { namespace filters
{
    template <int ksize, typename T, typename D, template<typename> class B>
    void linearRowFilter_caller(const DevMem2D_<T>& src, const DevMem2D_<D>& dst, int anchor, cudaStream_t stream)
    {
        dim3 threads(BLOCK_DIM_X, BLOCK_DIM_Y);
        dim3 grid(divUp(src.cols, BLOCK_DIM_X), divUp(src.rows, BLOCK_DIM_Y));

        typedef typename filter_krnls_row::SmemType<T>::smem_t smem_t;
        B<smem_t> b(src.cols);

        if (!b.is_range_safe(-BLOCK_DIM_X, (grid.x + 1) * BLOCK_DIM_X - 1))
        {
            cv::gpu::error("linearRowFilter: can't use specified border extrapolation, image is too small, "
                           "try bigger image or another border extrapolation mode", __FILE__, __LINE__);
        }

        filter_krnls_row::linearRowFilter<ksize, T, D><<<grid, threads, 0, stream>>>(src, dst, anchor, b);
        cudaSafeCall( cudaGetLastError() );

        if (stream == 0)
            cudaSafeCall( cudaDeviceSynchronize() );
    }

    template <typename T, typename D>
    void linearRowFilter_gpu(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream)
    {
        typedef void (*caller_t)(const DevMem2D_<T>& src, const DevMem2D_<D>& dst, int anchor, cudaStream_t stream);
        static const caller_t callers[5][17] =
        {
            {
                0,
                linearRowFilter_caller<1 , T, D, BrdRowReflect101>,
                linearRowFilter_caller<2 , T, D, BrdRowReflect101>,
                linearRowFilter_caller<3 , T, D, BrdRowReflect101>,
                linearRowFilter_caller<4 , T, D, BrdRowReflect101>,
                linearRowFilter_caller<5 , T, D, BrdRowReflect101>,
                linearRowFilter_caller<6 , T, D, BrdRowReflect101>,
                linearRowFilter_caller<7 , T, D, BrdRowReflect101>,
                linearRowFilter_caller<8 , T, D, BrdRowReflect101>,
                linearRowFilter_caller<9 , T, D, BrdRowReflect101>,
                linearRowFilter_caller<10, T, D, BrdRowReflect101>,
                linearRowFilter_caller<11, T, D, BrdRowReflect101>,
                linearRowFilter_caller<12, T, D, BrdRowReflect101>,
                linearRowFilter_caller<13, T, D, BrdRowReflect101>,
                linearRowFilter_caller<14, T, D, BrdRowReflect101>,
                linearRowFilter_caller<15, T, D, BrdRowReflect101>,
                linearRowFilter_caller<16, T, D, BrdRowReflect101>
            },
            {
                0,
                linearRowFilter_caller<1 , T, D, BrdRowReplicate>,
                linearRowFilter_caller<2 , T, D, BrdRowReplicate>,
                linearRowFilter_caller<3 , T, D, BrdRowReplicate>,
                linearRowFilter_caller<4 , T, D, BrdRowReplicate>,
                linearRowFilter_caller<5 , T, D, BrdRowReplicate>,
                linearRowFilter_caller<6 , T, D, BrdRowReplicate>,
                linearRowFilter_caller<7 , T, D, BrdRowReplicate>,
                linearRowFilter_caller<8 , T, D, BrdRowReplicate>,
                linearRowFilter_caller<9 , T, D, BrdRowReplicate>,
                linearRowFilter_caller<10, T, D, BrdRowReplicate>,
                linearRowFilter_caller<11, T, D, BrdRowReplicate>,
                linearRowFilter_caller<12, T, D, BrdRowReplicate>,
                linearRowFilter_caller<13, T, D, BrdRowReplicate>,
                linearRowFilter_caller<14, T, D, BrdRowReplicate>,
                linearRowFilter_caller<15, T, D, BrdRowReplicate>,
                linearRowFilter_caller<16, T, D, BrdRowReplicate>
            },
            {
                0,
                linearRowFilter_caller<1 , T, D, BrdRowConstant>,
                linearRowFilter_caller<2 , T, D, BrdRowConstant>,
                linearRowFilter_caller<3 , T, D, BrdRowConstant>,
                linearRowFilter_caller<4 , T, D, BrdRowConstant>,
                linearRowFilter_caller<5 , T, D, BrdRowConstant>,
                linearRowFilter_caller<6 , T, D, BrdRowConstant>,
                linearRowFilter_caller<7 , T, D, BrdRowConstant>,
                linearRowFilter_caller<8 , T, D, BrdRowConstant>,
                linearRowFilter_caller<9 , T, D, BrdRowConstant>,
                linearRowFilter_caller<10, T, D, BrdRowConstant>,
                linearRowFilter_caller<11, T, D, BrdRowConstant>,
                linearRowFilter_caller<12, T, D, BrdRowConstant>,
                linearRowFilter_caller<13, T, D, BrdRowConstant>,
                linearRowFilter_caller<14, T, D, BrdRowConstant>,
                linearRowFilter_caller<15, T, D, BrdRowConstant>,
                linearRowFilter_caller<16, T, D, BrdRowConstant>
            },
            {
                0,
                linearRowFilter_caller<1 , T, D, BrdRowReflect>,
                linearRowFilter_caller<2 , T, D, BrdRowReflect>,
                linearRowFilter_caller<3 , T, D, BrdRowReflect>,
                linearRowFilter_caller<4 , T, D, BrdRowReflect>,
                linearRowFilter_caller<5 , T, D, BrdRowReflect>,
                linearRowFilter_caller<6 , T, D, BrdRowReflect>,
                linearRowFilter_caller<7 , T, D, BrdRowReflect>,
                linearRowFilter_caller<8 , T, D, BrdRowReflect>,
                linearRowFilter_caller<9 , T, D, BrdRowReflect>,
                linearRowFilter_caller<10, T, D, BrdRowReflect>,
                linearRowFilter_caller<11, T, D, BrdRowReflect>,
                linearRowFilter_caller<12, T, D, BrdRowReflect>,
                linearRowFilter_caller<13, T, D, BrdRowReflect>,
                linearRowFilter_caller<14, T, D, BrdRowReflect>,
                linearRowFilter_caller<15, T, D, BrdRowReflect>,
                linearRowFilter_caller<16, T, D, BrdRowReflect>
            },
            {
                0,
                linearRowFilter_caller<1 , T, D, BrdRowWrap>,
                linearRowFilter_caller<2 , T, D, BrdRowWrap>,
                linearRowFilter_caller<3 , T, D, BrdRowWrap>,
                linearRowFilter_caller<4 , T, D, BrdRowWrap>,
                linearRowFilter_caller<5 , T, D, BrdRowWrap>,
                linearRowFilter_caller<6 , T, D, BrdRowWrap>,
                linearRowFilter_caller<7 , T, D, BrdRowWrap>,
                linearRowFilter_caller<8 , T, D, BrdRowWrap>,
                linearRowFilter_caller<9 , T, D, BrdRowWrap>,
                linearRowFilter_caller<10, T, D, BrdRowWrap>,
                linearRowFilter_caller<11, T, D, BrdRowWrap>,
                linearRowFilter_caller<12, T, D, BrdRowWrap>,
                linearRowFilter_caller<13, T, D, BrdRowWrap>,
                linearRowFilter_caller<14, T, D, BrdRowWrap>,
                linearRowFilter_caller<15, T, D, BrdRowWrap>,
                linearRowFilter_caller<16, T, D, BrdRowWrap>
            }
        };

        filter_krnls_row::loadLinearKernel(kernel, ksize);

        callers[brd_type][ksize]((DevMem2D_<T>)src, (DevMem2D_<D>)dst, anchor, stream);
    }

    template void linearRowFilter_gpu<uchar , float >(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
    template void linearRowFilter_gpu<uchar4, float4>(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
    //template void linearRowFilter_gpu<short , float >(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
    //template void linearRowFilter_gpu<short2, float2>(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
    template void linearRowFilter_gpu<short3, float3>(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
    template void linearRowFilter_gpu<int   , float >(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
    template void linearRowFilter_gpu<float , float >(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
}}}
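
linearRowFilter_gpu copies the kernel into constant memory (hence ksize is limited to MAX_KERNEL_SIZE) and then indexes callers[brd_type][ksize], with brd_type 0..4 meaning reflect-101, replicate, constant, reflect and wrap. A hypothetical call applying a 3-tap box filter, for illustration only:

// Sketch; boxRowFilterExample is not part of the commit. dst must hold 32-bit
// float data, matching the <uchar, float> instantiation above.
void boxRowFilterExample(const DevMem2D& src, const DevMem2D& dst, cudaStream_t s)
{
    const float kernel[3] = { 1.f / 3.f, 1.f / 3.f, 1.f / 3.f };
    // ksize = 3, anchor = 1 (centered), brd_type = 1 (BrdRowReplicate)
    cv::gpu::filters::linearRowFilter_gpu<uchar, float>(src, dst, kernel, 3, 1, 1, s);
}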
@ -722,7 +722,7 @@ Ptr<BaseRowFilter_GPU> cv::gpu::getLinearRowFilter_GPU(int srcType, int bufType,
    int gpuBorderType;
    CV_Assert(tryConvertToGpuBorderType(borderType, gpuBorderType));

    CV_Assert(srcType == CV_8UC1 || srcType == CV_8UC4 || srcType == CV_16SC1 || srcType == CV_16SC2
    CV_Assert(srcType == CV_8UC1 || srcType == CV_8UC4 /*|| srcType == CV_16SC1*/ /*|| srcType == CV_16SC2*/
        || srcType == CV_16SC3 || srcType == CV_32SC1 || srcType == CV_32FC1);

    CV_Assert(CV_MAT_DEPTH(bufType) == CV_32F && CV_MAT_CN(srcType) == CV_MAT_CN(bufType));
@ -747,12 +747,12 @@ Ptr<BaseRowFilter_GPU> cv::gpu::getLinearRowFilter_GPU(int srcType, int bufType,
    case CV_8UC4:
        func = filters::linearRowFilter_gpu<uchar4, float4>;
        break;
    case CV_16SC1:
        func = filters::linearRowFilter_gpu<short, float>;
        break;
    case CV_16SC2:
        func = filters::linearRowFilter_gpu<short2, float2>;
        break;
    /*case CV_16SC1:
        func = filters::linearRowFilter_gpu<short, float>;
        break;*/
    /*case CV_16SC2:
        func = filters::linearRowFilter_gpu<short2, float2>;
        break;*/
    case CV_16SC3:
        func = filters::linearRowFilter_gpu<short3, float3>;
        break;
@ -837,7 +837,7 @@ Ptr<BaseColumnFilter_GPU> cv::gpu::getLinearColumnFilter_GPU(int bufType, int ds
    int gpuBorderType;
    CV_Assert(tryConvertToGpuBorderType(borderType, gpuBorderType));

    CV_Assert(dstType == CV_8UC1 || dstType == CV_8UC4 || dstType == CV_16SC1 || dstType == CV_16SC2
    CV_Assert(dstType == CV_8UC1 || dstType == CV_8UC4 /*|| dstType == CV_16SC1*/ /*|| dstType == CV_16SC2*/
        || dstType == CV_16SC3 || dstType == CV_32SC1 || dstType == CV_32FC1);

    CV_Assert(CV_MAT_DEPTH(bufType) == CV_32F && CV_MAT_CN(dstType) == CV_MAT_CN(bufType));
@ -862,12 +862,12 @@ Ptr<BaseColumnFilter_GPU> cv::gpu::getLinearColumnFilter_GPU(int bufType, int ds
    case CV_8UC4:
        func = filters::linearColumnFilter_gpu<float4, uchar4>;
        break;
    case CV_16SC1:
        func = filters::linearColumnFilter_gpu<float, short>;
        break;
    case CV_16SC2:
        func = filters::linearColumnFilter_gpu<float2, short2>;
        break;
    /*case CV_16SC1:
        func = filters::linearColumnFilter_gpu<float, short>;
        break;*/
    /*case CV_16SC2:
        func = filters::linearColumnFilter_gpu<float2, short2>;
        break;*/
    case CV_16SC3:
        func = filters::linearColumnFilter_gpu<float3, short3>;
        break;
@ -90,8 +90,6 @@ void cv::gpu::dft(const GpuMat&, GpuMat&, Size, int) { throw_nogpu(); }
|
||||
void cv::gpu::ConvolveBuf::create(Size, Size) { throw_nogpu(); }
|
||||
void cv::gpu::convolve(const GpuMat&, const GpuMat&, GpuMat&, bool) { throw_nogpu(); }
|
||||
void cv::gpu::convolve(const GpuMat&, const GpuMat&, GpuMat&, bool, ConvolveBuf&) { throw_nogpu(); }
|
||||
void cv::gpu::downsample(const GpuMat&, GpuMat&, Stream&) { throw_nogpu(); }
|
||||
void cv::gpu::upsample(const GpuMat&, GpuMat&, Stream&) { throw_nogpu(); }
|
||||
void cv::gpu::pyrDown(const GpuMat&, GpuMat&, int, Stream&) { throw_nogpu(); }
|
||||
void cv::gpu::pyrUp(const GpuMat&, GpuMat&, int, Stream&) { throw_nogpu(); }
|
||||
void cv::gpu::Canny(const GpuMat&, GpuMat&, double, double, int, bool) { throw_nogpu(); }
|
||||
@ -120,17 +118,20 @@ void cv::gpu::remap(const GpuMat& src, GpuMat& dst, const GpuMat& xmap, const Gp
|
||||
typedef void (*caller_t)(const DevMem2D& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2D& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream);
|
||||
static const caller_t callers[6][4] =
|
||||
{
|
||||
{remap_gpu<uchar>, remap_gpu<uchar2>, remap_gpu<uchar3>, remap_gpu<uchar4>},
|
||||
{remap_gpu<schar>, remap_gpu<char2>, remap_gpu<char3>, remap_gpu<char4>},
|
||||
{remap_gpu<ushort>, remap_gpu<ushort2>, remap_gpu<ushort3>, remap_gpu<ushort4>},
|
||||
{remap_gpu<short>, remap_gpu<short2>, remap_gpu<short3>, remap_gpu<short4>},
|
||||
{remap_gpu<int>, remap_gpu<int2>, remap_gpu<int3>, remap_gpu<int4>},
|
||||
{remap_gpu<float>, remap_gpu<float2>, remap_gpu<float3>, remap_gpu<float4>}
|
||||
{remap_gpu<uchar>, 0/*remap_gpu<uchar2>*/, remap_gpu<uchar3>, remap_gpu<uchar4>},
|
||||
{0/*remap_gpu<schar>*/, 0/*remap_gpu<char2>*/, 0/*remap_gpu<char3>*/, 0/*remap_gpu<char4>*/},
|
||||
{remap_gpu<ushort>, 0/*remap_gpu<ushort2>*/, remap_gpu<ushort3>, remap_gpu<ushort4>},
|
||||
{remap_gpu<short>, 0/*remap_gpu<short2>*/, remap_gpu<short3>, remap_gpu<short4>},
|
||||
{0/*remap_gpu<int>*/, 0/*remap_gpu<int2>*/, 0/*remap_gpu<int3>*/, 0/*remap_gpu<int4>*/},
|
||||
{remap_gpu<float>, 0/*remap_gpu<float2>*/, remap_gpu<float3>, remap_gpu<float4>}
|
||||
};
|
||||
|
||||
CV_Assert(src.depth() <= CV_32F && src.channels() <= 4);
|
||||
CV_Assert(xmap.type() == CV_32F && ymap.type() == CV_32F && xmap.size() == ymap.size());
|
||||
|
||||
caller_t func = callers[src.depth()][src.channels() - 1];
|
||||
CV_Assert(func != 0);
|
||||
|
||||
CV_Assert(interpolation == INTER_NEAREST || interpolation == INTER_LINEAR || interpolation == INTER_CUBIC);
|
||||
|
||||
CV_Assert(borderMode == BORDER_REFLECT101 || borderMode == BORDER_REPLICATE || borderMode == BORDER_CONSTANT || borderMode == BORDER_REFLECT || borderMode == BORDER_WRAP);
|
||||
@ -142,7 +143,7 @@ void cv::gpu::remap(const GpuMat& src, GpuMat& dst, const GpuMat& xmap, const Gp
|
||||
Scalar_<float> borderValueFloat;
|
||||
borderValueFloat = borderValue;
|
||||
|
||||
callers[src.depth()][src.channels() - 1](src, xmap, ymap, dst, interpolation, gpuBorderType, borderValueFloat.val, StreamAccessor::getStream(stream));
|
||||
func(src, xmap, ymap, dst, interpolation, gpuBorderType, borderValueFloat.val, StreamAccessor::getStream(stream));
|
||||
}

////////////////////////////////////////////////////////////////////////
@ -279,19 +280,6 @@ namespace cv { namespace gpu { namespace imgproc

void cv::gpu::resize(const GpuMat& src, GpuMat& dst, Size dsize, double fx, double fy, int interpolation, Stream& s)
{
using namespace cv::gpu::imgproc;

typedef void (*caller_t)(const DevMem2D& src, float fx, float fy, const DevMem2D& dst, int interpolation, cudaStream_t stream);
static const caller_t callers[6][4] =
{
{resize_gpu&lt;uchar&gt;, resize_gpu&lt;uchar2&gt;, resize_gpu&lt;uchar3&gt;, resize_gpu&lt;uchar4&gt;},
{resize_gpu&lt;schar&gt;, resize_gpu&lt;char2&gt;, resize_gpu&lt;char3&gt;, resize_gpu&lt;char4&gt;},
{resize_gpu&lt;ushort&gt;, resize_gpu&lt;ushort2&gt;, resize_gpu&lt;ushort3&gt;, resize_gpu&lt;ushort4&gt;},
{resize_gpu&lt;short&gt;, resize_gpu&lt;short2&gt;, resize_gpu&lt;short3&gt;, resize_gpu&lt;short4&gt;},
{resize_gpu&lt;int&gt;, resize_gpu&lt;int2&gt;, resize_gpu&lt;int3&gt;, resize_gpu&lt;int4&gt;},
{resize_gpu&lt;float&gt;, resize_gpu&lt;float2&gt;, resize_gpu&lt;float3&gt;, resize_gpu&lt;float4&gt;}
};

CV_Assert( src.depth() <= CV_32F && src.channels() <= 4 );
CV_Assert( interpolation == INTER_NEAREST || interpolation == INTER_LINEAR || interpolation == INTER_CUBIC );
CV_Assert( !(dsize == Size()) || (fx > 0 && fy > 0) );
@ -352,6 +340,19 @@ void cv::gpu::resize(const GpuMat& src, GpuMat& dst, Size dsize, double fx, doub
}
else
{
using namespace cv::gpu::imgproc;

typedef void (*caller_t)(const DevMem2D& src, float fx, float fy, const DevMem2D& dst, int interpolation, cudaStream_t stream);
static const caller_t callers[6][4] =
{
{resize_gpu&lt;uchar&gt;, 0/*resize_gpu&lt;uchar2&gt;*/, resize_gpu&lt;uchar3&gt;, resize_gpu&lt;uchar4&gt;},
{0/*resize_gpu&lt;schar&gt;*/, 0/*resize_gpu&lt;char2&gt;*/, 0/*resize_gpu&lt;char3&gt;*/, 0/*resize_gpu&lt;char4&gt;*/},
{resize_gpu&lt;ushort&gt;, 0/*resize_gpu&lt;ushort2&gt;*/, resize_gpu&lt;ushort3&gt;, resize_gpu&lt;ushort4&gt;},
{resize_gpu&lt;short&gt;, 0/*resize_gpu&lt;short2&gt;*/, resize_gpu&lt;short3&gt;, resize_gpu&lt;short4&gt;},
{0/*resize_gpu&lt;int&gt;*/, 0/*resize_gpu&lt;int2&gt;*/, 0/*resize_gpu&lt;int3&gt;*/, 0/*resize_gpu&lt;int4&gt;*/},
{resize_gpu&lt;float&gt;, 0/*resize_gpu&lt;float2&gt;*/, resize_gpu&lt;float3&gt;, resize_gpu&lt;float4&gt;}
};

callers[src.depth()][src.channels() - 1](src, static_cast<float>(fx), static_cast<float>(fy), dst, interpolation, stream);
}
}
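The resize asserts tie dsize to the scale factors: either an explicit dsize is supplied, or fx and fy must be positive so dsize can be derived, consistent with cv::resize semantics. A sketch of that derivation under those assumptions (rounding chosen to mirror saturate_cast; the Size2i/resolveDsize names are illustrative):

    #include <cmath>

    struct Size2i { int width, height; };

    // If dsize is empty, derive it from fx/fy; this is the contract that
    // CV_Assert( !(dsize == Size()) || (fx > 0 && fy > 0) ) protects.
    Size2i resolveDsize(Size2i src, Size2i dsize, double fx, double fy)
    {
        if (dsize.width == 0 && dsize.height == 0)
        {
            dsize.width  = static_cast<int>(std::lround(fx * src.width));
            dsize.height = static_cast<int>(std::lround(fy * src.height));
        }
        return dsize;
    }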

@ -1589,75 +1590,6 @@ void cv::gpu::convolve(const GpuMat& image, const GpuMat& templ, GpuMat& result,
cufftSafeCall(cufftDestroy(planC2R));
}


////////////////////////////////////////////////////////////////////
// downsample

namespace cv { namespace gpu { namespace imgproc
{
template <typename T, int cn>
void downsampleCaller(const DevMem2D src, DevMem2D dst, cudaStream_t stream);
}}}


void cv::gpu::downsample(const GpuMat& src, GpuMat& dst, Stream& stream)
{
CV_Assert(src.depth() < CV_64F && src.channels() <= 4);

typedef void (*Caller)(const DevMem2D, DevMem2D, cudaStream_t stream);
static const Caller callers[6][4] =
{{imgproc::downsampleCaller&lt;uchar,1&gt;, imgproc::downsampleCaller&lt;uchar,2&gt;,
imgproc::downsampleCaller&lt;uchar,3&gt;, imgproc::downsampleCaller&lt;uchar,4&gt;},
{0,0,0,0}, {0,0,0,0},
{imgproc::downsampleCaller&lt;short,1&gt;, imgproc::downsampleCaller&lt;short,2&gt;,
imgproc::downsampleCaller&lt;short,3&gt;, imgproc::downsampleCaller&lt;short,4&gt;},
{0,0,0,0},
{imgproc::downsampleCaller&lt;float,1&gt;, imgproc::downsampleCaller&lt;float,2&gt;,
imgproc::downsampleCaller&lt;float,3&gt;, imgproc::downsampleCaller&lt;float,4&gt;}};

Caller caller = callers[src.depth()][src.channels()-1];
if (!caller)
CV_Error(CV_StsUnsupportedFormat, "bad number of channels");

dst.create((src.rows + 1) / 2, (src.cols + 1) / 2, src.type());
caller(src, dst.reshape(1), StreamAccessor::getStream(stream));
}


//////////////////////////////////////////////////////////////////////////////
// upsample

namespace cv { namespace gpu { namespace imgproc
{
template <typename T, int cn>
void upsampleCaller(const DevMem2D src, DevMem2D dst, cudaStream_t stream);
}}}


void cv::gpu::upsample(const GpuMat& src, GpuMat& dst, Stream& stream)
{
CV_Assert(src.depth() < CV_64F && src.channels() <= 4);

typedef void (*Caller)(const DevMem2D, DevMem2D, cudaStream_t stream);
static const Caller callers[6][4] =
{{imgproc::upsampleCaller&lt;uchar,1&gt;, imgproc::upsampleCaller&lt;uchar,2&gt;,
imgproc::upsampleCaller&lt;uchar,3&gt;, imgproc::upsampleCaller&lt;uchar,4&gt;},
{0,0,0,0}, {0,0,0,0},
{imgproc::upsampleCaller&lt;short,1&gt;, imgproc::upsampleCaller&lt;short,2&gt;,
imgproc::upsampleCaller&lt;short,3&gt;, imgproc::upsampleCaller&lt;short,4&gt;},
{0,0,0,0},
{imgproc::upsampleCaller&lt;float,1&gt;, imgproc::upsampleCaller&lt;float,2&gt;,
imgproc::upsampleCaller&lt;float,3&gt;, imgproc::upsampleCaller&lt;float,4&gt;}};

Caller caller = callers[src.depth()][src.channels()-1];
if (!caller)
CV_Error(CV_StsUnsupportedFormat, "bad number of channels");

dst.create(src.rows*2, src.cols*2, src.type());
caller(src, dst.reshape(1), StreamAccessor::getStream(stream));
}
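Both downsample and upsample hand the kernel dst.reshape(1): reshaping to one channel multiplies the column count by the channel count without copying, so the DevMem2D the kernel receives addresses individual scalar elements while channel arithmetic stays in the cn template parameter. A short illustration of that equivalence (plain cv::Mat shown for clarity; assumes the OpenCV core header):

    #include <opencv2/core/core.hpp>

    void reshapeDemo()
    {
        cv::Mat rgba(480, 640, CV_8UC4);   // 4 channels, 640 columns
        cv::Mat flat = rgba.reshape(1);    // 1 channel, 640 * 4 columns
        CV_Assert(flat.channels() == 1);
        CV_Assert(flat.cols == rgba.cols * 4);
        CV_Assert(flat.data == rgba.data); // no copy: same buffer
    }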


//////////////////////////////////////////////////////////////////////////////
// pyrDown

@ -47,9 +47,6 @@ namespace cv { namespace gpu { namespace device
{
namespace detail
{
///////////////////////////////////////////////////////////////////////////////
// Reduction

template <int n> struct WarpReductor
{
template <typename T, typename Op> static __device__ __forceinline__ void reduce(volatile T* data, T& partial_reduction, int tid, const Op& op)
@ -504,72 +501,6 @@ namespace cv { namespace gpu { namespace device
}
}
};

///////////////////////////////////////////////////////////////////////////////
// Vector Distance

template <int THREAD_DIM, int N> struct UnrollVecDiffCached
{
template <typename Dist, typename T1, typename T2>
static __device__ void calcCheck(const T1* vecCached, const T2* vecGlob, int len, Dist& dist, int ind)
{
if (ind < len)
{
T1 val1 = *vecCached++;

T2 val2;
ForceGlob&lt;T2&gt;::Load(vecGlob, ind, val2);

dist.reduceIter(val1, val2);

UnrollVecDiffCached&lt;THREAD_DIM, N - 1&gt;::calcCheck(vecCached, vecGlob, len, dist, ind + THREAD_DIM);
}
}

template <typename Dist, typename T1, typename T2>
static __device__ void calcWithoutCheck(const T1* vecCached, const T2* vecGlob, Dist& dist)
{
T1 val1 = *vecCached++;

T2 val2;
ForceGlob&lt;T2&gt;::Load(vecGlob, 0, val2);
vecGlob += THREAD_DIM;

dist.reduceIter(val1, val2);

UnrollVecDiffCached&lt;THREAD_DIM, N - 1&gt;::calcWithoutCheck(vecCached, vecGlob, dist);
}
};
template <int THREAD_DIM> struct UnrollVecDiffCached&lt;THREAD_DIM, 0&gt;
{
template <typename Dist, typename T1, typename T2>
static __device__ __forceinline__ void calcCheck(const T1*, const T2*, int, Dist&, int)
{
}

template <typename Dist, typename T1, typename T2>
static __device__ __forceinline__ void calcWithoutCheck(const T1*, const T2*, Dist&)
{
}
};

template <int THREAD_DIM, int MAX_LEN, bool LEN_EQ_MAX_LEN> struct VecDiffCachedCalculator;
template <int THREAD_DIM, int MAX_LEN> struct VecDiffCachedCalculator&lt;THREAD_DIM, MAX_LEN, false&gt;
{
template <typename Dist, typename T1, typename T2>
static __device__ __forceinline__ void calc(const T1* vecCached, const T2* vecGlob, int len, Dist& dist, int tid)
{
UnrollVecDiffCached&lt;THREAD_DIM, MAX_LEN / THREAD_DIM&gt;::calcCheck(vecCached, vecGlob, len, dist, tid);
}
};
template <int THREAD_DIM, int MAX_LEN> struct VecDiffCachedCalculator&lt;THREAD_DIM, MAX_LEN, true&gt;
{
template <typename Dist, typename T1, typename T2>
static __device__ __forceinline__ void calc(const T1* vecCached, const T2* vecGlob, int len, Dist& dist, int tid)
{
UnrollVecDiffCached&lt;THREAD_DIM, MAX_LEN / THREAD_DIM&gt;::calcWithoutCheck(vecCached, vecGlob + tid, dist);
}
};
}
}}}


@ -0,0 +1,117 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/

#ifndef __OPENCV_GPU_VEC_DISTANCE_DETAIL_HPP__
#define __OPENCV_GPU_VEC_DISTANCE_DETAIL_HPP__

#include "../datamov_utils.hpp"

namespace cv { namespace gpu { namespace device
{
namespace detail
{
template <int THREAD_DIM, int N> struct UnrollVecDiffCached
{
template <typename Dist, typename T1, typename T2>
static __device__ void calcCheck(const T1* vecCached, const T2* vecGlob, int len, Dist& dist, int ind)
{
if (ind < len)
{
T1 val1 = *vecCached++;

T2 val2;
ForceGlob&lt;T2&gt;::Load(vecGlob, ind, val2);

dist.reduceIter(val1, val2);

UnrollVecDiffCached&lt;THREAD_DIM, N - 1&gt;::calcCheck(vecCached, vecGlob, len, dist, ind + THREAD_DIM);
}
}

template <typename Dist, typename T1, typename T2>
static __device__ void calcWithoutCheck(const T1* vecCached, const T2* vecGlob, Dist& dist)
{
T1 val1 = *vecCached++;

T2 val2;
ForceGlob&lt;T2&gt;::Load(vecGlob, 0, val2);
vecGlob += THREAD_DIM;

dist.reduceIter(val1, val2);

UnrollVecDiffCached&lt;THREAD_DIM, N - 1&gt;::calcWithoutCheck(vecCached, vecGlob, dist);
}
};
template <int THREAD_DIM> struct UnrollVecDiffCached&lt;THREAD_DIM, 0&gt;
{
template <typename Dist, typename T1, typename T2>
static __device__ __forceinline__ void calcCheck(const T1*, const T2*, int, Dist&, int)
{
}

template <typename Dist, typename T1, typename T2>
static __device__ __forceinline__ void calcWithoutCheck(const T1*, const T2*, Dist&)
{
}
};

template <int THREAD_DIM, int MAX_LEN, bool LEN_EQ_MAX_LEN> struct VecDiffCachedCalculator;
template <int THREAD_DIM, int MAX_LEN> struct VecDiffCachedCalculator&lt;THREAD_DIM, MAX_LEN, false&gt;
{
template <typename Dist, typename T1, typename T2>
static __device__ __forceinline__ void calc(const T1* vecCached, const T2* vecGlob, int len, Dist& dist, int tid)
{
UnrollVecDiffCached&lt;THREAD_DIM, MAX_LEN / THREAD_DIM&gt;::calcCheck(vecCached, vecGlob, len, dist, tid);
}
};
template <int THREAD_DIM, int MAX_LEN> struct VecDiffCachedCalculator&lt;THREAD_DIM, MAX_LEN, true&gt;
{
template <typename Dist, typename T1, typename T2>
static __device__ __forceinline__ void calc(const T1* vecCached, const T2* vecGlob, int len, Dist& dist, int tid)
{
UnrollVecDiffCached&lt;THREAD_DIM, MAX_LEN / THREAD_DIM&gt;::calcWithoutCheck(vecCached, vecGlob + tid, dist);
}
};
}
}}}

#endif // __OPENCV_GPU_VEC_DISTANCE_DETAIL_HPP__
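In the header above, LEN_EQ_MAX_LEN selects between the bounds-checked unroll (calcCheck) and the check-free unroll used when the vector length is known at compile time to equal MAX_LEN; each of THREAD_DIM cooperating threads visits elements tid, tid + THREAD_DIM, tid + 2*THREAD_DIM, and so on. A host-side analogue of the strided accumulation these templates unroll (the L2 functor and function names are illustrative, not the device API):

    // Plain-loop equivalent of UnrollVecDiffCached's recursion: thread
    // `tid` of THREAD_DIM accumulates a strided slice of the descriptor;
    // the templates unroll this loop MAX_LEN / THREAD_DIM times.
    struct L2DistSketch
    {
        float sum;
        L2DistSketch() : sum(0.f) {}
        void reduceIter(float a, float b) { float d = a - b; sum += d * d; }
    };

    template <int THREAD_DIM>
    void accumulateStrided(const float* query, const float* train, int len,
                           L2DistSketch& dist, int tid)
    {
        for (int ind = tid; ind < len; ind += THREAD_DIM) // calcCheck path
            dist.reduceIter(query[ind], train[ind]);
    }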
@ -46,7 +46,6 @@
#include "internal_shared.hpp"
#include "saturate_cast.hpp"
#include "datamov_utils.hpp"
#include "functional.hpp"
#include "detail/utility_detail.hpp"

#define OPENCV_GPU_LOG_WARP_SIZE (5)

@ -43,7 +43,10 @@
#ifndef __OPENCV_GPU_VEC_DISTANCE_HPP__
#define __OPENCV_GPU_VEC_DISTANCE_HPP__

#include "internal_shared.hpp"
#include "utility.hpp"
#include "functional.hpp"
#include "detail/vec_distance_detail.hpp"

namespace cv { namespace gpu { namespace device
{